{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.99486125385406, "eval_steps": 500, "global_step": 1215, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0041109969167523125, "grad_norm": 6.356808310456779, "learning_rate": 3.278688524590164e-07, "loss": 0.82, "step": 1 }, { "epoch": 0.008221993833504625, "grad_norm": 6.417518537271285, "learning_rate": 6.557377049180328e-07, "loss": 0.8218, "step": 2 }, { "epoch": 0.012332990750256937, "grad_norm": 6.266225501769135, "learning_rate": 9.836065573770493e-07, "loss": 0.7983, "step": 3 }, { "epoch": 0.01644398766700925, "grad_norm": 6.059283773388311, "learning_rate": 1.3114754098360657e-06, "loss": 0.7971, "step": 4 }, { "epoch": 0.020554984583761562, "grad_norm": 5.8622860351950585, "learning_rate": 1.6393442622950819e-06, "loss": 0.811, "step": 5 }, { "epoch": 0.024665981500513873, "grad_norm": 5.599946959024431, "learning_rate": 1.9672131147540985e-06, "loss": 0.7955, "step": 6 }, { "epoch": 0.02877697841726619, "grad_norm": 4.404619948223283, "learning_rate": 2.295081967213115e-06, "loss": 0.7786, "step": 7 }, { "epoch": 0.0328879753340185, "grad_norm": 3.922409083964933, "learning_rate": 2.6229508196721314e-06, "loss": 0.7401, "step": 8 }, { "epoch": 0.03699897225077081, "grad_norm": 2.2688697860924765, "learning_rate": 2.9508196721311478e-06, "loss": 0.7309, "step": 9 }, { "epoch": 0.041109969167523124, "grad_norm": 2.066008891071192, "learning_rate": 3.2786885245901638e-06, "loss": 0.719, "step": 10 }, { "epoch": 0.045220966084275435, "grad_norm": 1.9201877271634715, "learning_rate": 3.6065573770491806e-06, "loss": 0.7258, "step": 11 }, { "epoch": 0.04933196300102775, "grad_norm": 3.898309985774545, "learning_rate": 3.934426229508197e-06, "loss": 0.7205, "step": 12 }, { "epoch": 0.05344295991778006, "grad_norm": 4.0353502189938855, "learning_rate": 4.2622950819672135e-06, "loss": 0.7192, "step": 13 }, { "epoch": 0.05755395683453238, "grad_norm": 4.121703890013303, "learning_rate": 4.59016393442623e-06, "loss": 0.7244, "step": 14 }, { "epoch": 0.06166495375128469, "grad_norm": 3.928913042484364, "learning_rate": 4.918032786885246e-06, "loss": 0.6995, "step": 15 }, { "epoch": 0.065775950668037, "grad_norm": 3.142521725483005, "learning_rate": 5.245901639344263e-06, "loss": 0.6643, "step": 16 }, { "epoch": 0.0698869475847893, "grad_norm": 2.847209107159321, "learning_rate": 5.573770491803278e-06, "loss": 0.6531, "step": 17 }, { "epoch": 0.07399794450154162, "grad_norm": 2.2003209693474126, "learning_rate": 5.9016393442622956e-06, "loss": 0.66, "step": 18 }, { "epoch": 0.07810894141829394, "grad_norm": 1.336519700469157, "learning_rate": 6.229508196721312e-06, "loss": 0.6164, "step": 19 }, { "epoch": 0.08221993833504625, "grad_norm": 1.2400729726657767, "learning_rate": 6.5573770491803276e-06, "loss": 0.6071, "step": 20 }, { "epoch": 0.08633093525179857, "grad_norm": 1.4350908797178215, "learning_rate": 6.885245901639345e-06, "loss": 0.6066, "step": 21 }, { "epoch": 0.09044193216855087, "grad_norm": 1.4014758782073495, "learning_rate": 7.213114754098361e-06, "loss": 0.5995, "step": 22 }, { "epoch": 0.09455292908530319, "grad_norm": 1.1237552991193, "learning_rate": 7.540983606557377e-06, "loss": 0.5895, "step": 23 }, { "epoch": 0.0986639260020555, "grad_norm": 0.8497080531691873, "learning_rate": 7.868852459016394e-06, "loss": 0.588, "step": 24 }, { "epoch": 0.10277492291880781, "grad_norm": 0.9384127005244138, "learning_rate": 8.19672131147541e-06, "loss": 0.5767, "step": 25 }, { "epoch": 0.10688591983556012, "grad_norm": 0.8190287043666049, "learning_rate": 8.524590163934427e-06, "loss": 0.5698, "step": 26 }, { "epoch": 0.11099691675231244, "grad_norm": 0.6808839486547595, "learning_rate": 8.852459016393443e-06, "loss": 0.5795, "step": 27 }, { "epoch": 0.11510791366906475, "grad_norm": 0.7939592915072008, "learning_rate": 9.18032786885246e-06, "loss": 0.5642, "step": 28 }, { "epoch": 0.11921891058581706, "grad_norm": 0.7673957275771759, "learning_rate": 9.508196721311476e-06, "loss": 0.5505, "step": 29 }, { "epoch": 0.12332990750256938, "grad_norm": 0.542256505411903, "learning_rate": 9.836065573770493e-06, "loss": 0.5525, "step": 30 }, { "epoch": 0.12744090441932168, "grad_norm": 0.5835090756188929, "learning_rate": 1.0163934426229509e-05, "loss": 0.5394, "step": 31 }, { "epoch": 0.131551901336074, "grad_norm": 2.7254227477748034, "learning_rate": 1.0491803278688525e-05, "loss": 0.587, "step": 32 }, { "epoch": 0.13566289825282632, "grad_norm": 0.869479260973271, "learning_rate": 1.0819672131147544e-05, "loss": 0.5492, "step": 33 }, { "epoch": 0.1397738951695786, "grad_norm": 0.5177682668145469, "learning_rate": 1.1147540983606557e-05, "loss": 0.5325, "step": 34 }, { "epoch": 0.14388489208633093, "grad_norm": 0.511981609333191, "learning_rate": 1.1475409836065575e-05, "loss": 0.5486, "step": 35 }, { "epoch": 0.14799588900308325, "grad_norm": 0.5836498681132752, "learning_rate": 1.1803278688524591e-05, "loss": 0.5391, "step": 36 }, { "epoch": 0.15210688591983557, "grad_norm": 0.6259670640713604, "learning_rate": 1.2131147540983608e-05, "loss": 0.5342, "step": 37 }, { "epoch": 0.15621788283658788, "grad_norm": 0.4863673175391185, "learning_rate": 1.2459016393442624e-05, "loss": 0.5202, "step": 38 }, { "epoch": 0.16032887975334018, "grad_norm": 0.46984524173771686, "learning_rate": 1.2786885245901642e-05, "loss": 0.5275, "step": 39 }, { "epoch": 0.1644398766700925, "grad_norm": 0.4373961614640998, "learning_rate": 1.3114754098360655e-05, "loss": 0.5259, "step": 40 }, { "epoch": 0.1685508735868448, "grad_norm": 0.5382826043776142, "learning_rate": 1.3442622950819673e-05, "loss": 0.5289, "step": 41 }, { "epoch": 0.17266187050359713, "grad_norm": 0.49858273312303336, "learning_rate": 1.377049180327869e-05, "loss": 0.5034, "step": 42 }, { "epoch": 0.17677286742034942, "grad_norm": 0.4565898955129958, "learning_rate": 1.4098360655737706e-05, "loss": 0.4992, "step": 43 }, { "epoch": 0.18088386433710174, "grad_norm": 0.4250478862858475, "learning_rate": 1.4426229508196722e-05, "loss": 0.5099, "step": 44 }, { "epoch": 0.18499486125385406, "grad_norm": 0.5450828551582885, "learning_rate": 1.4754098360655739e-05, "loss": 0.5133, "step": 45 }, { "epoch": 0.18910585817060638, "grad_norm": 0.5082095741416903, "learning_rate": 1.5081967213114754e-05, "loss": 0.507, "step": 46 }, { "epoch": 0.1932168550873587, "grad_norm": 0.399668921358076, "learning_rate": 1.5409836065573772e-05, "loss": 0.499, "step": 47 }, { "epoch": 0.197327852004111, "grad_norm": 0.5857374299092792, "learning_rate": 1.5737704918032788e-05, "loss": 0.5182, "step": 48 }, { "epoch": 0.2014388489208633, "grad_norm": 0.4337541604951673, "learning_rate": 1.6065573770491805e-05, "loss": 0.5048, "step": 49 }, { "epoch": 0.20554984583761562, "grad_norm": 0.675490041268254, "learning_rate": 1.639344262295082e-05, "loss": 0.5091, "step": 50 }, { "epoch": 0.20966084275436794, "grad_norm": 0.44682409800475936, "learning_rate": 1.6721311475409837e-05, "loss": 0.4948, "step": 51 }, { "epoch": 0.21377183967112023, "grad_norm": 0.5243379991172152, "learning_rate": 1.7049180327868854e-05, "loss": 0.4959, "step": 52 }, { "epoch": 0.21788283658787255, "grad_norm": 0.48058870125487607, "learning_rate": 1.737704918032787e-05, "loss": 0.493, "step": 53 }, { "epoch": 0.22199383350462487, "grad_norm": 0.4009755414381969, "learning_rate": 1.7704918032786887e-05, "loss": 0.5005, "step": 54 }, { "epoch": 0.2261048304213772, "grad_norm": 0.4487072583979547, "learning_rate": 1.8032786885245903e-05, "loss": 0.5, "step": 55 }, { "epoch": 0.2302158273381295, "grad_norm": 0.43431903464010596, "learning_rate": 1.836065573770492e-05, "loss": 0.4822, "step": 56 }, { "epoch": 0.2343268242548818, "grad_norm": 0.4223425144399419, "learning_rate": 1.8688524590163936e-05, "loss": 0.4884, "step": 57 }, { "epoch": 0.23843782117163412, "grad_norm": 0.40422238831771906, "learning_rate": 1.9016393442622952e-05, "loss": 0.5064, "step": 58 }, { "epoch": 0.24254881808838644, "grad_norm": 0.4353031683109967, "learning_rate": 1.934426229508197e-05, "loss": 0.4844, "step": 59 }, { "epoch": 0.24665981500513876, "grad_norm": 0.5063299442881862, "learning_rate": 1.9672131147540985e-05, "loss": 0.4871, "step": 60 }, { "epoch": 0.25077081192189105, "grad_norm": 0.6405429501414496, "learning_rate": 2e-05, "loss": 0.4916, "step": 61 }, { "epoch": 0.25488180883864336, "grad_norm": 0.7398107392403913, "learning_rate": 2.0327868852459018e-05, "loss": 0.4967, "step": 62 }, { "epoch": 0.2589928057553957, "grad_norm": 0.6066259496387154, "learning_rate": 2.0655737704918034e-05, "loss": 0.4933, "step": 63 }, { "epoch": 0.263103802672148, "grad_norm": 0.6888615660905145, "learning_rate": 2.098360655737705e-05, "loss": 0.4849, "step": 64 }, { "epoch": 0.2672147995889003, "grad_norm": 0.6046305786161926, "learning_rate": 2.1311475409836067e-05, "loss": 0.4997, "step": 65 }, { "epoch": 0.27132579650565264, "grad_norm": 0.4755750596713722, "learning_rate": 2.1639344262295087e-05, "loss": 0.484, "step": 66 }, { "epoch": 0.27543679342240496, "grad_norm": 0.4901884477105443, "learning_rate": 2.1967213114754104e-05, "loss": 0.4714, "step": 67 }, { "epoch": 0.2795477903391572, "grad_norm": 0.5180862601664822, "learning_rate": 2.2295081967213113e-05, "loss": 0.4743, "step": 68 }, { "epoch": 0.28365878725590954, "grad_norm": 0.6341799796360953, "learning_rate": 2.2622950819672133e-05, "loss": 0.4837, "step": 69 }, { "epoch": 0.28776978417266186, "grad_norm": 0.7050713511862262, "learning_rate": 2.295081967213115e-05, "loss": 0.4732, "step": 70 }, { "epoch": 0.2918807810894142, "grad_norm": 0.46520327730925665, "learning_rate": 2.3278688524590166e-05, "loss": 0.4763, "step": 71 }, { "epoch": 0.2959917780061665, "grad_norm": 0.46570649065351716, "learning_rate": 2.3606557377049182e-05, "loss": 0.4729, "step": 72 }, { "epoch": 0.3001027749229188, "grad_norm": 0.5435122355995184, "learning_rate": 2.39344262295082e-05, "loss": 0.4673, "step": 73 }, { "epoch": 0.30421377183967113, "grad_norm": 0.7024832057525984, "learning_rate": 2.4262295081967215e-05, "loss": 0.4685, "step": 74 }, { "epoch": 0.30832476875642345, "grad_norm": 0.5982496336902186, "learning_rate": 2.459016393442623e-05, "loss": 0.4683, "step": 75 }, { "epoch": 0.31243576567317577, "grad_norm": 0.5579092038957036, "learning_rate": 2.4918032786885248e-05, "loss": 0.4818, "step": 76 }, { "epoch": 0.31654676258992803, "grad_norm": 0.75454502368708, "learning_rate": 2.5245901639344264e-05, "loss": 0.4745, "step": 77 }, { "epoch": 0.32065775950668035, "grad_norm": 0.9103711158770255, "learning_rate": 2.5573770491803284e-05, "loss": 0.4732, "step": 78 }, { "epoch": 0.32476875642343267, "grad_norm": 0.7230510862281725, "learning_rate": 2.59016393442623e-05, "loss": 0.4726, "step": 79 }, { "epoch": 0.328879753340185, "grad_norm": 0.661725190625586, "learning_rate": 2.622950819672131e-05, "loss": 0.4691, "step": 80 }, { "epoch": 0.3329907502569373, "grad_norm": 1.3040040424420736, "learning_rate": 2.655737704918033e-05, "loss": 0.4596, "step": 81 }, { "epoch": 0.3371017471736896, "grad_norm": 0.9247546415389841, "learning_rate": 2.6885245901639346e-05, "loss": 0.4687, "step": 82 }, { "epoch": 0.34121274409044194, "grad_norm": 0.6690753031478268, "learning_rate": 2.7213114754098363e-05, "loss": 0.475, "step": 83 }, { "epoch": 0.34532374100719426, "grad_norm": 0.8875577066120585, "learning_rate": 2.754098360655738e-05, "loss": 0.4779, "step": 84 }, { "epoch": 0.3494347379239466, "grad_norm": 0.9913820671901682, "learning_rate": 2.7868852459016396e-05, "loss": 0.4585, "step": 85 }, { "epoch": 0.35354573484069884, "grad_norm": 1.0406026178086218, "learning_rate": 2.8196721311475412e-05, "loss": 0.4777, "step": 86 }, { "epoch": 0.35765673175745116, "grad_norm": 0.8568856680996076, "learning_rate": 2.852459016393443e-05, "loss": 0.476, "step": 87 }, { "epoch": 0.3617677286742035, "grad_norm": 0.8578450824032388, "learning_rate": 2.8852459016393445e-05, "loss": 0.477, "step": 88 }, { "epoch": 0.3658787255909558, "grad_norm": 1.0997371612060205, "learning_rate": 2.918032786885246e-05, "loss": 0.4694, "step": 89 }, { "epoch": 0.3699897225077081, "grad_norm": 0.6710185323141514, "learning_rate": 2.9508196721311478e-05, "loss": 0.4664, "step": 90 }, { "epoch": 0.37410071942446044, "grad_norm": 0.8753359919001613, "learning_rate": 2.9836065573770498e-05, "loss": 0.4675, "step": 91 }, { "epoch": 0.37821171634121276, "grad_norm": 0.9060750756011728, "learning_rate": 3.0163934426229507e-05, "loss": 0.4577, "step": 92 }, { "epoch": 0.3823227132579651, "grad_norm": 0.8327145117934229, "learning_rate": 3.0491803278688527e-05, "loss": 0.4798, "step": 93 }, { "epoch": 0.3864337101747174, "grad_norm": 0.9953249905867948, "learning_rate": 3.0819672131147544e-05, "loss": 0.4588, "step": 94 }, { "epoch": 0.39054470709146966, "grad_norm": 0.790890207442512, "learning_rate": 3.1147540983606557e-05, "loss": 0.4813, "step": 95 }, { "epoch": 0.394655704008222, "grad_norm": 0.7071469844879325, "learning_rate": 3.1475409836065576e-05, "loss": 0.4715, "step": 96 }, { "epoch": 0.3987667009249743, "grad_norm": 0.683447139315226, "learning_rate": 3.180327868852459e-05, "loss": 0.4568, "step": 97 }, { "epoch": 0.4028776978417266, "grad_norm": 0.6863994738211686, "learning_rate": 3.213114754098361e-05, "loss": 0.4516, "step": 98 }, { "epoch": 0.40698869475847893, "grad_norm": 0.6443321732944037, "learning_rate": 3.245901639344263e-05, "loss": 0.4467, "step": 99 }, { "epoch": 0.41109969167523125, "grad_norm": 0.6015090752114448, "learning_rate": 3.278688524590164e-05, "loss": 0.4566, "step": 100 }, { "epoch": 0.41521068859198357, "grad_norm": 0.648925234921687, "learning_rate": 3.311475409836066e-05, "loss": 0.4598, "step": 101 }, { "epoch": 0.4193216855087359, "grad_norm": 0.5740497039935356, "learning_rate": 3.3442622950819675e-05, "loss": 0.4514, "step": 102 }, { "epoch": 0.4234326824254882, "grad_norm": 0.7433508320080534, "learning_rate": 3.3770491803278695e-05, "loss": 0.4555, "step": 103 }, { "epoch": 0.42754367934224047, "grad_norm": 0.9786371138605869, "learning_rate": 3.409836065573771e-05, "loss": 0.4724, "step": 104 }, { "epoch": 0.4316546762589928, "grad_norm": 1.16381322551552, "learning_rate": 3.442622950819672e-05, "loss": 0.4665, "step": 105 }, { "epoch": 0.4357656731757451, "grad_norm": 0.7033574666436274, "learning_rate": 3.475409836065574e-05, "loss": 0.4741, "step": 106 }, { "epoch": 0.4398766700924974, "grad_norm": 1.256476593209221, "learning_rate": 3.5081967213114754e-05, "loss": 0.476, "step": 107 }, { "epoch": 0.44398766700924974, "grad_norm": 0.5933957475473355, "learning_rate": 3.5409836065573773e-05, "loss": 0.4653, "step": 108 }, { "epoch": 0.44809866392600206, "grad_norm": 1.025564753787377, "learning_rate": 3.5737704918032786e-05, "loss": 0.47, "step": 109 }, { "epoch": 0.4522096608427544, "grad_norm": 1.0088674998209484, "learning_rate": 3.6065573770491806e-05, "loss": 0.4681, "step": 110 }, { "epoch": 0.4563206577595067, "grad_norm": 0.9216004942062503, "learning_rate": 3.6393442622950826e-05, "loss": 0.4546, "step": 111 }, { "epoch": 0.460431654676259, "grad_norm": 1.1054709646558805, "learning_rate": 3.672131147540984e-05, "loss": 0.4669, "step": 112 }, { "epoch": 0.4645426515930113, "grad_norm": 0.6642218594282759, "learning_rate": 3.704918032786886e-05, "loss": 0.4533, "step": 113 }, { "epoch": 0.4686536485097636, "grad_norm": 0.8356269646157981, "learning_rate": 3.737704918032787e-05, "loss": 0.4599, "step": 114 }, { "epoch": 0.4727646454265159, "grad_norm": 1.1650429141300205, "learning_rate": 3.770491803278689e-05, "loss": 0.448, "step": 115 }, { "epoch": 0.47687564234326824, "grad_norm": 0.6212175962293394, "learning_rate": 3.8032786885245905e-05, "loss": 0.4638, "step": 116 }, { "epoch": 0.48098663926002055, "grad_norm": 1.1965895951813037, "learning_rate": 3.836065573770492e-05, "loss": 0.4619, "step": 117 }, { "epoch": 0.4850976361767729, "grad_norm": 0.8457976781943612, "learning_rate": 3.868852459016394e-05, "loss": 0.4555, "step": 118 }, { "epoch": 0.4892086330935252, "grad_norm": 0.8463559301031214, "learning_rate": 3.901639344262295e-05, "loss": 0.4713, "step": 119 }, { "epoch": 0.4933196300102775, "grad_norm": 0.6654609587793014, "learning_rate": 3.934426229508197e-05, "loss": 0.4461, "step": 120 }, { "epoch": 0.49743062692702983, "grad_norm": 0.7698090467763701, "learning_rate": 3.9672131147540983e-05, "loss": 0.4627, "step": 121 }, { "epoch": 0.5015416238437821, "grad_norm": 0.5716155461137187, "learning_rate": 4e-05, "loss": 0.4576, "step": 122 }, { "epoch": 0.5056526207605344, "grad_norm": 0.5151376433722467, "learning_rate": 3.999991738495905e-05, "loss": 0.4485, "step": 123 }, { "epoch": 0.5097636176772867, "grad_norm": 0.561427237450996, "learning_rate": 3.9999669540518704e-05, "loss": 0.454, "step": 124 }, { "epoch": 0.513874614594039, "grad_norm": 0.6553799163893537, "learning_rate": 3.999925646872655e-05, "loss": 0.4523, "step": 125 }, { "epoch": 0.5179856115107914, "grad_norm": 0.7909652053854684, "learning_rate": 3.9998678172995157e-05, "loss": 0.4544, "step": 126 }, { "epoch": 0.5220966084275437, "grad_norm": 0.8388721187199466, "learning_rate": 3.999793465810214e-05, "loss": 0.4408, "step": 127 }, { "epoch": 0.526207605344296, "grad_norm": 0.7373151231076792, "learning_rate": 3.999702593019004e-05, "loss": 0.4596, "step": 128 }, { "epoch": 0.5303186022610483, "grad_norm": 0.9546826007376602, "learning_rate": 3.9995951996766316e-05, "loss": 0.459, "step": 129 }, { "epoch": 0.5344295991778006, "grad_norm": 0.9027549638128062, "learning_rate": 3.999471286670328e-05, "loss": 0.4537, "step": 130 }, { "epoch": 0.538540596094553, "grad_norm": 1.0477189023005884, "learning_rate": 3.9993308550238e-05, "loss": 0.4455, "step": 131 }, { "epoch": 0.5426515930113053, "grad_norm": 1.1744648733550076, "learning_rate": 3.999173905897226e-05, "loss": 0.4579, "step": 132 }, { "epoch": 0.5467625899280576, "grad_norm": 0.7693181453420259, "learning_rate": 3.99900044058724e-05, "loss": 0.4381, "step": 133 }, { "epoch": 0.5508735868448099, "grad_norm": 0.8932998819929917, "learning_rate": 3.998810460526927e-05, "loss": 0.4663, "step": 134 }, { "epoch": 0.5549845837615622, "grad_norm": 1.1927529473551686, "learning_rate": 3.998603967285808e-05, "loss": 0.456, "step": 135 }, { "epoch": 0.5590955806783144, "grad_norm": 0.7303627266940724, "learning_rate": 3.998380962569828e-05, "loss": 0.463, "step": 136 }, { "epoch": 0.5632065775950668, "grad_norm": 1.211411577139644, "learning_rate": 3.9981414482213405e-05, "loss": 0.4649, "step": 137 }, { "epoch": 0.5673175745118191, "grad_norm": 0.8811081756810052, "learning_rate": 3.997885426219096e-05, "loss": 0.4637, "step": 138 }, { "epoch": 0.5714285714285714, "grad_norm": 0.8713984959209414, "learning_rate": 3.99761289867822e-05, "loss": 0.4546, "step": 139 }, { "epoch": 0.5755395683453237, "grad_norm": 0.9788586712835476, "learning_rate": 3.9973238678501996e-05, "loss": 0.4475, "step": 140 }, { "epoch": 0.579650565262076, "grad_norm": 0.7533682906447463, "learning_rate": 3.997018336122866e-05, "loss": 0.4428, "step": 141 }, { "epoch": 0.5837615621788284, "grad_norm": 0.7461312001689725, "learning_rate": 3.9966963060203684e-05, "loss": 0.4494, "step": 142 }, { "epoch": 0.5878725590955807, "grad_norm": 0.6174187621354268, "learning_rate": 3.996357780203161e-05, "loss": 0.4504, "step": 143 }, { "epoch": 0.591983556012333, "grad_norm": 0.49003513931944637, "learning_rate": 3.9960027614679766e-05, "loss": 0.4427, "step": 144 }, { "epoch": 0.5960945529290853, "grad_norm": 0.5605653135696967, "learning_rate": 3.995631252747804e-05, "loss": 0.448, "step": 145 }, { "epoch": 0.6002055498458376, "grad_norm": 0.44488928569515496, "learning_rate": 3.9952432571118634e-05, "loss": 0.4467, "step": 146 }, { "epoch": 0.60431654676259, "grad_norm": 0.5026250568866101, "learning_rate": 3.994838777765582e-05, "loss": 0.4477, "step": 147 }, { "epoch": 0.6084275436793423, "grad_norm": 0.5172580934087974, "learning_rate": 3.9944178180505685e-05, "loss": 0.4523, "step": 148 }, { "epoch": 0.6125385405960946, "grad_norm": 0.42653534399915305, "learning_rate": 3.993980381444583e-05, "loss": 0.4461, "step": 149 }, { "epoch": 0.6166495375128469, "grad_norm": 0.5471101554510113, "learning_rate": 3.993526471561509e-05, "loss": 0.4434, "step": 150 }, { "epoch": 0.6207605344295992, "grad_norm": 0.44006741961102114, "learning_rate": 3.993056092151326e-05, "loss": 0.4433, "step": 151 }, { "epoch": 0.6248715313463515, "grad_norm": 0.54547408589461, "learning_rate": 3.9925692471000755e-05, "loss": 0.4512, "step": 152 }, { "epoch": 0.6289825282631039, "grad_norm": 0.5069055596939723, "learning_rate": 3.9920659404298285e-05, "loss": 0.4407, "step": 153 }, { "epoch": 0.6330935251798561, "grad_norm": 0.4361985691247893, "learning_rate": 3.991546176298657e-05, "loss": 0.443, "step": 154 }, { "epoch": 0.6372045220966084, "grad_norm": 0.6492040966314581, "learning_rate": 3.991009959000593e-05, "loss": 0.4643, "step": 155 }, { "epoch": 0.6413155190133607, "grad_norm": 0.662599709889411, "learning_rate": 3.990457292965598e-05, "loss": 0.4421, "step": 156 }, { "epoch": 0.645426515930113, "grad_norm": 0.7496440633893157, "learning_rate": 3.9898881827595255e-05, "loss": 0.4475, "step": 157 }, { "epoch": 0.6495375128468653, "grad_norm": 0.6565888966700488, "learning_rate": 3.989302633084081e-05, "loss": 0.4478, "step": 158 }, { "epoch": 0.6536485097636177, "grad_norm": 0.4914282437943478, "learning_rate": 3.988700648776786e-05, "loss": 0.4485, "step": 159 }, { "epoch": 0.65775950668037, "grad_norm": 0.5765645279846272, "learning_rate": 3.9880822348109365e-05, "loss": 0.4406, "step": 160 }, { "epoch": 0.6618705035971223, "grad_norm": 0.54508458239329, "learning_rate": 3.9874473962955625e-05, "loss": 0.4377, "step": 161 }, { "epoch": 0.6659815005138746, "grad_norm": 0.5467537029004771, "learning_rate": 3.986796138475383e-05, "loss": 0.4404, "step": 162 }, { "epoch": 0.6700924974306269, "grad_norm": 0.5927993483965615, "learning_rate": 3.986128466730769e-05, "loss": 0.4397, "step": 163 }, { "epoch": 0.6742034943473793, "grad_norm": 0.549384325928676, "learning_rate": 3.985444386577693e-05, "loss": 0.4447, "step": 164 }, { "epoch": 0.6783144912641316, "grad_norm": 0.7853091838958088, "learning_rate": 3.984743903667685e-05, "loss": 0.4323, "step": 165 }, { "epoch": 0.6824254881808839, "grad_norm": 0.7452346665717267, "learning_rate": 3.984027023787789e-05, "loss": 0.4418, "step": 166 }, { "epoch": 0.6865364850976362, "grad_norm": 0.6354950658254404, "learning_rate": 3.98329375286051e-05, "loss": 0.4462, "step": 167 }, { "epoch": 0.6906474820143885, "grad_norm": 0.6039082662120046, "learning_rate": 3.982544096943769e-05, "loss": 0.4387, "step": 168 }, { "epoch": 0.6947584789311408, "grad_norm": 0.658037323729766, "learning_rate": 3.9817780622308515e-05, "loss": 0.4442, "step": 169 }, { "epoch": 0.6988694758478932, "grad_norm": 0.4757882993447632, "learning_rate": 3.980995655050356e-05, "loss": 0.4432, "step": 170 }, { "epoch": 0.7029804727646455, "grad_norm": 0.44532932025468364, "learning_rate": 3.980196881866143e-05, "loss": 0.4414, "step": 171 }, { "epoch": 0.7070914696813977, "grad_norm": 0.45179948654666446, "learning_rate": 3.9793817492772806e-05, "loss": 0.4509, "step": 172 }, { "epoch": 0.71120246659815, "grad_norm": 0.4699683428704349, "learning_rate": 3.9785502640179905e-05, "loss": 0.4278, "step": 173 }, { "epoch": 0.7153134635149023, "grad_norm": 0.5201025709025265, "learning_rate": 3.97770243295759e-05, "loss": 0.4335, "step": 174 }, { "epoch": 0.7194244604316546, "grad_norm": 0.47266696898911464, "learning_rate": 3.9768382631004405e-05, "loss": 0.4501, "step": 175 }, { "epoch": 0.723535457348407, "grad_norm": 0.4422465802779614, "learning_rate": 3.975957761585883e-05, "loss": 0.446, "step": 176 }, { "epoch": 0.7276464542651593, "grad_norm": 0.5391358485913682, "learning_rate": 3.9750609356881865e-05, "loss": 0.4512, "step": 177 }, { "epoch": 0.7317574511819116, "grad_norm": 0.46555411315299916, "learning_rate": 3.974147792816481e-05, "loss": 0.4374, "step": 178 }, { "epoch": 0.7358684480986639, "grad_norm": 0.5347959929577083, "learning_rate": 3.9732183405146984e-05, "loss": 0.4368, "step": 179 }, { "epoch": 0.7399794450154162, "grad_norm": 0.5543063250157177, "learning_rate": 3.9722725864615156e-05, "loss": 0.4468, "step": 180 }, { "epoch": 0.7440904419321686, "grad_norm": 0.39521704775907723, "learning_rate": 3.971310538470282e-05, "loss": 0.4338, "step": 181 }, { "epoch": 0.7482014388489209, "grad_norm": 0.47237629534672426, "learning_rate": 3.9703322044889605e-05, "loss": 0.4369, "step": 182 }, { "epoch": 0.7523124357656732, "grad_norm": 0.434146415819749, "learning_rate": 3.969337592600062e-05, "loss": 0.4458, "step": 183 }, { "epoch": 0.7564234326824255, "grad_norm": 0.38836391572812273, "learning_rate": 3.968326711020578e-05, "loss": 0.4546, "step": 184 }, { "epoch": 0.7605344295991778, "grad_norm": 0.34969919974995534, "learning_rate": 3.967299568101908e-05, "loss": 0.4459, "step": 185 }, { "epoch": 0.7646454265159301, "grad_norm": 0.41064464728289324, "learning_rate": 3.9662561723298e-05, "loss": 0.4326, "step": 186 }, { "epoch": 0.7687564234326825, "grad_norm": 0.4616126051202659, "learning_rate": 3.9651965323242704e-05, "loss": 0.4492, "step": 187 }, { "epoch": 0.7728674203494348, "grad_norm": 0.49195669527847435, "learning_rate": 3.964120656839541e-05, "loss": 0.4276, "step": 188 }, { "epoch": 0.7769784172661871, "grad_norm": 0.3807633073682157, "learning_rate": 3.963028554763961e-05, "loss": 0.4428, "step": 189 }, { "epoch": 0.7810894141829393, "grad_norm": 0.3811255626131261, "learning_rate": 3.9619202351199356e-05, "loss": 0.4337, "step": 190 }, { "epoch": 0.7852004110996916, "grad_norm": 0.3612639948436137, "learning_rate": 3.960795707063852e-05, "loss": 0.4363, "step": 191 }, { "epoch": 0.789311408016444, "grad_norm": 0.4353027404982674, "learning_rate": 3.959654979886005e-05, "loss": 0.4365, "step": 192 }, { "epoch": 0.7934224049331963, "grad_norm": 0.37923924344854587, "learning_rate": 3.958498063010516e-05, "loss": 0.4277, "step": 193 }, { "epoch": 0.7975334018499486, "grad_norm": 0.49016416134919827, "learning_rate": 3.957324965995257e-05, "loss": 0.4189, "step": 194 }, { "epoch": 0.8016443987667009, "grad_norm": 0.3808642318097945, "learning_rate": 3.956135698531777e-05, "loss": 0.428, "step": 195 }, { "epoch": 0.8057553956834532, "grad_norm": 0.4706420424359872, "learning_rate": 3.9549302704452104e-05, "loss": 0.4355, "step": 196 }, { "epoch": 0.8098663926002055, "grad_norm": 0.5558683512038307, "learning_rate": 3.953708691694208e-05, "loss": 0.4219, "step": 197 }, { "epoch": 0.8139773895169579, "grad_norm": 0.5188467581658631, "learning_rate": 3.952470972370848e-05, "loss": 0.4369, "step": 198 }, { "epoch": 0.8180883864337102, "grad_norm": 0.4485136574531589, "learning_rate": 3.951217122700554e-05, "loss": 0.4206, "step": 199 }, { "epoch": 0.8221993833504625, "grad_norm": 0.4872982826961068, "learning_rate": 3.9499471530420086e-05, "loss": 0.4434, "step": 200 }, { "epoch": 0.8263103802672148, "grad_norm": 0.5704413227159343, "learning_rate": 3.9486610738870726e-05, "loss": 0.4332, "step": 201 }, { "epoch": 0.8304213771839671, "grad_norm": 0.6576571504037381, "learning_rate": 3.947358895860693e-05, "loss": 0.4282, "step": 202 }, { "epoch": 0.8345323741007195, "grad_norm": 0.5236083635603117, "learning_rate": 3.9460406297208204e-05, "loss": 0.4418, "step": 203 }, { "epoch": 0.8386433710174718, "grad_norm": 0.4856398721711883, "learning_rate": 3.944706286358315e-05, "loss": 0.4446, "step": 204 }, { "epoch": 0.8427543679342241, "grad_norm": 0.553946219409764, "learning_rate": 3.94335587679686e-05, "loss": 0.4421, "step": 205 }, { "epoch": 0.8468653648509764, "grad_norm": 0.559411380541318, "learning_rate": 3.94198941219287e-05, "loss": 0.4628, "step": 206 }, { "epoch": 0.8509763617677287, "grad_norm": 0.4879763317857753, "learning_rate": 3.940606903835398e-05, "loss": 0.442, "step": 207 }, { "epoch": 0.8550873586844809, "grad_norm": 0.5054384831570833, "learning_rate": 3.939208363146041e-05, "loss": 0.4262, "step": 208 }, { "epoch": 0.8591983556012333, "grad_norm": 0.5553954849786898, "learning_rate": 3.937793801678851e-05, "loss": 0.427, "step": 209 }, { "epoch": 0.8633093525179856, "grad_norm": 0.5872415310529557, "learning_rate": 3.936363231120231e-05, "loss": 0.4413, "step": 210 }, { "epoch": 0.8674203494347379, "grad_norm": 0.5889656491169154, "learning_rate": 3.934916663288847e-05, "loss": 0.4374, "step": 211 }, { "epoch": 0.8715313463514902, "grad_norm": 0.5289928076892064, "learning_rate": 3.9334541101355244e-05, "loss": 0.4393, "step": 212 }, { "epoch": 0.8756423432682425, "grad_norm": 0.5133117516646354, "learning_rate": 3.931975583743152e-05, "loss": 0.4207, "step": 213 }, { "epoch": 0.8797533401849948, "grad_norm": 0.48037331045870174, "learning_rate": 3.930481096326583e-05, "loss": 0.4175, "step": 214 }, { "epoch": 0.8838643371017472, "grad_norm": 0.5410217736097758, "learning_rate": 3.92897066023253e-05, "loss": 0.431, "step": 215 }, { "epoch": 0.8879753340184995, "grad_norm": 0.41649001377169803, "learning_rate": 3.927444287939467e-05, "loss": 0.4484, "step": 216 }, { "epoch": 0.8920863309352518, "grad_norm": 0.45628332224884727, "learning_rate": 3.925901992057525e-05, "loss": 0.4305, "step": 217 }, { "epoch": 0.8961973278520041, "grad_norm": 0.5227314578776049, "learning_rate": 3.924343785328388e-05, "loss": 0.4393, "step": 218 }, { "epoch": 0.9003083247687564, "grad_norm": 0.4530459458277021, "learning_rate": 3.9227696806251875e-05, "loss": 0.4382, "step": 219 }, { "epoch": 0.9044193216855088, "grad_norm": 0.4488315318208515, "learning_rate": 3.9211796909523953e-05, "loss": 0.4209, "step": 220 }, { "epoch": 0.9085303186022611, "grad_norm": 0.4369045769060924, "learning_rate": 3.9195738294457186e-05, "loss": 0.4357, "step": 221 }, { "epoch": 0.9126413155190134, "grad_norm": 0.3980678441937295, "learning_rate": 3.9179521093719876e-05, "loss": 0.4142, "step": 222 }, { "epoch": 0.9167523124357657, "grad_norm": 0.5003747978502763, "learning_rate": 3.91631454412905e-05, "loss": 0.4484, "step": 223 }, { "epoch": 0.920863309352518, "grad_norm": 0.43942976248272747, "learning_rate": 3.914661147245657e-05, "loss": 0.434, "step": 224 }, { "epoch": 0.9249743062692704, "grad_norm": 0.4174753367400882, "learning_rate": 3.912991932381355e-05, "loss": 0.4282, "step": 225 }, { "epoch": 0.9290853031860226, "grad_norm": 0.36920457252907907, "learning_rate": 3.91130691332637e-05, "loss": 0.4347, "step": 226 }, { "epoch": 0.9331963001027749, "grad_norm": 0.45392411540078437, "learning_rate": 3.9096061040014914e-05, "loss": 0.4135, "step": 227 }, { "epoch": 0.9373072970195272, "grad_norm": 0.4203872157822759, "learning_rate": 3.907889518457964e-05, "loss": 0.4422, "step": 228 }, { "epoch": 0.9414182939362795, "grad_norm": 0.391547280290097, "learning_rate": 3.9061571708773656e-05, "loss": 0.428, "step": 229 }, { "epoch": 0.9455292908530318, "grad_norm": 0.5746907556468481, "learning_rate": 3.9044090755714935e-05, "loss": 0.4273, "step": 230 }, { "epoch": 0.9496402877697842, "grad_norm": 0.5021218433821051, "learning_rate": 3.9026452469822435e-05, "loss": 0.4318, "step": 231 }, { "epoch": 0.9537512846865365, "grad_norm": 0.5118619524543895, "learning_rate": 3.900865699681494e-05, "loss": 0.4565, "step": 232 }, { "epoch": 0.9578622816032888, "grad_norm": 0.4269764449835691, "learning_rate": 3.899070448370981e-05, "loss": 0.4242, "step": 233 }, { "epoch": 0.9619732785200411, "grad_norm": 0.544830400097823, "learning_rate": 3.897259507882181e-05, "loss": 0.4308, "step": 234 }, { "epoch": 0.9660842754367934, "grad_norm": 0.5029148596149111, "learning_rate": 3.895432893176186e-05, "loss": 0.4283, "step": 235 }, { "epoch": 0.9701952723535457, "grad_norm": 0.6013585416586662, "learning_rate": 3.8935906193435814e-05, "loss": 0.4231, "step": 236 }, { "epoch": 0.9743062692702981, "grad_norm": 0.501625593569375, "learning_rate": 3.89173270160432e-05, "loss": 0.4335, "step": 237 }, { "epoch": 0.9784172661870504, "grad_norm": 0.6586654174152249, "learning_rate": 3.889859155307596e-05, "loss": 0.4365, "step": 238 }, { "epoch": 0.9825282631038027, "grad_norm": 0.6491985191825143, "learning_rate": 3.8879699959317204e-05, "loss": 0.428, "step": 239 }, { "epoch": 0.986639260020555, "grad_norm": 0.36412537479982626, "learning_rate": 3.8860652390839915e-05, "loss": 0.4258, "step": 240 }, { "epoch": 0.9907502569373073, "grad_norm": 0.562496507066076, "learning_rate": 3.884144900500565e-05, "loss": 0.4352, "step": 241 }, { "epoch": 0.9948612538540597, "grad_norm": 0.4838097185277804, "learning_rate": 3.882208996046327e-05, "loss": 0.4422, "step": 242 }, { "epoch": 0.998972250770812, "grad_norm": 0.4422917967441169, "learning_rate": 3.880257541714759e-05, "loss": 0.4273, "step": 243 }, { "epoch": 1.0030832476875642, "grad_norm": 0.5334045773924255, "learning_rate": 3.878290553627809e-05, "loss": 0.3969, "step": 244 }, { "epoch": 1.0071942446043165, "grad_norm": 0.5378794632121926, "learning_rate": 3.876308048035758e-05, "loss": 0.3903, "step": 245 }, { "epoch": 1.0113052415210688, "grad_norm": 0.5944247982125659, "learning_rate": 3.874310041317084e-05, "loss": 0.3866, "step": 246 }, { "epoch": 1.0154162384378211, "grad_norm": 0.5659631885785738, "learning_rate": 3.8722965499783265e-05, "loss": 0.3859, "step": 247 }, { "epoch": 1.0195272353545735, "grad_norm": 0.6678922530928978, "learning_rate": 3.8702675906539536e-05, "loss": 0.3975, "step": 248 }, { "epoch": 1.0236382322713258, "grad_norm": 0.6092071387321932, "learning_rate": 3.868223180106221e-05, "loss": 0.3805, "step": 249 }, { "epoch": 1.027749229188078, "grad_norm": 0.48801873476109786, "learning_rate": 3.866163335225034e-05, "loss": 0.3924, "step": 250 }, { "epoch": 1.0318602261048304, "grad_norm": 0.5338205820825612, "learning_rate": 3.8640880730278105e-05, "loss": 0.4015, "step": 251 }, { "epoch": 1.0359712230215827, "grad_norm": 0.47770709705325853, "learning_rate": 3.8619974106593365e-05, "loss": 0.3979, "step": 252 }, { "epoch": 1.040082219938335, "grad_norm": 0.6103179105115757, "learning_rate": 3.859891365391628e-05, "loss": 0.388, "step": 253 }, { "epoch": 1.0441932168550874, "grad_norm": 0.5427245439232725, "learning_rate": 3.8577699546237886e-05, "loss": 0.3811, "step": 254 }, { "epoch": 1.0483042137718397, "grad_norm": 0.532814479999278, "learning_rate": 3.8556331958818596e-05, "loss": 0.3872, "step": 255 }, { "epoch": 1.052415210688592, "grad_norm": 0.5339131788688589, "learning_rate": 3.853481106818683e-05, "loss": 0.3914, "step": 256 }, { "epoch": 1.0565262076053443, "grad_norm": 0.5060398381577083, "learning_rate": 3.851313705213751e-05, "loss": 0.3876, "step": 257 }, { "epoch": 1.0606372045220966, "grad_norm": 0.562896010283109, "learning_rate": 3.8491310089730614e-05, "loss": 0.3946, "step": 258 }, { "epoch": 1.064748201438849, "grad_norm": 0.40638744368816154, "learning_rate": 3.846933036128968e-05, "loss": 0.3809, "step": 259 }, { "epoch": 1.0688591983556013, "grad_norm": 0.5821981657729004, "learning_rate": 3.8447198048400325e-05, "loss": 0.4041, "step": 260 }, { "epoch": 1.0729701952723536, "grad_norm": 0.5613586250111681, "learning_rate": 3.8424913333908744e-05, "loss": 0.3834, "step": 261 }, { "epoch": 1.077081192189106, "grad_norm": 0.5210705488884988, "learning_rate": 3.840247640192019e-05, "loss": 0.4053, "step": 262 }, { "epoch": 1.0811921891058582, "grad_norm": 0.5223026076271566, "learning_rate": 3.837988743779747e-05, "loss": 0.4057, "step": 263 }, { "epoch": 1.0853031860226106, "grad_norm": 0.40771864088972815, "learning_rate": 3.8357146628159415e-05, "loss": 0.3759, "step": 264 }, { "epoch": 1.0894141829393629, "grad_norm": 0.5106166010616134, "learning_rate": 3.8334254160879296e-05, "loss": 0.3927, "step": 265 }, { "epoch": 1.0935251798561152, "grad_norm": 0.39265167645057447, "learning_rate": 3.8311210225083347e-05, "loss": 0.3772, "step": 266 }, { "epoch": 1.0976361767728675, "grad_norm": 0.5406659317819649, "learning_rate": 3.8288015011149126e-05, "loss": 0.3877, "step": 267 }, { "epoch": 1.1017471736896198, "grad_norm": 0.4396566495996877, "learning_rate": 3.826466871070399e-05, "loss": 0.3919, "step": 268 }, { "epoch": 1.1058581706063721, "grad_norm": 0.4071828889239751, "learning_rate": 3.82411715166235e-05, "loss": 0.3929, "step": 269 }, { "epoch": 1.1099691675231242, "grad_norm": 0.4408302571199858, "learning_rate": 3.821752362302982e-05, "loss": 0.3984, "step": 270 }, { "epoch": 1.1140801644398768, "grad_norm": 0.3944864874139757, "learning_rate": 3.8193725225290105e-05, "loss": 0.3791, "step": 271 }, { "epoch": 1.1181911613566289, "grad_norm": 0.5086637552588018, "learning_rate": 3.8169776520014935e-05, "loss": 0.3981, "step": 272 }, { "epoch": 1.1223021582733812, "grad_norm": 0.37495702811326503, "learning_rate": 3.814567770505663e-05, "loss": 0.399, "step": 273 }, { "epoch": 1.1264131551901335, "grad_norm": 0.5606532197558952, "learning_rate": 3.812142897950765e-05, "loss": 0.3919, "step": 274 }, { "epoch": 1.1305241521068858, "grad_norm": 0.5021460420776965, "learning_rate": 3.809703054369893e-05, "loss": 0.3884, "step": 275 }, { "epoch": 1.1346351490236382, "grad_norm": 0.42349030253760284, "learning_rate": 3.807248259919826e-05, "loss": 0.3834, "step": 276 }, { "epoch": 1.1387461459403905, "grad_norm": 0.4379650832741319, "learning_rate": 3.804778534880858e-05, "loss": 0.3907, "step": 277 }, { "epoch": 1.1428571428571428, "grad_norm": 0.49289600771412606, "learning_rate": 3.802293899656632e-05, "loss": 0.3813, "step": 278 }, { "epoch": 1.1469681397738951, "grad_norm": 0.42490353928620167, "learning_rate": 3.7997943747739735e-05, "loss": 0.3853, "step": 279 }, { "epoch": 1.1510791366906474, "grad_norm": 0.518693269655709, "learning_rate": 3.797279980882716e-05, "loss": 0.3982, "step": 280 }, { "epoch": 1.1551901336073997, "grad_norm": 0.43881434866880253, "learning_rate": 3.794750738755536e-05, "loss": 0.3926, "step": 281 }, { "epoch": 1.159301130524152, "grad_norm": 0.43485781066948115, "learning_rate": 3.792206669287776e-05, "loss": 0.3922, "step": 282 }, { "epoch": 1.1634121274409044, "grad_norm": 0.45793373109478, "learning_rate": 3.789647793497279e-05, "loss": 0.3949, "step": 283 }, { "epoch": 1.1675231243576567, "grad_norm": 0.4344299254818994, "learning_rate": 3.787074132524206e-05, "loss": 0.3895, "step": 284 }, { "epoch": 1.171634121274409, "grad_norm": 0.44080712302633035, "learning_rate": 3.784485707630868e-05, "loss": 0.3914, "step": 285 }, { "epoch": 1.1757451181911613, "grad_norm": 0.37695184690701744, "learning_rate": 3.781882540201547e-05, "loss": 0.3875, "step": 286 }, { "epoch": 1.1798561151079137, "grad_norm": 0.45884161347743313, "learning_rate": 3.7792646517423236e-05, "loss": 0.3744, "step": 287 }, { "epoch": 1.183967112024666, "grad_norm": 0.4017320092584037, "learning_rate": 3.7766320638808924e-05, "loss": 0.3922, "step": 288 }, { "epoch": 1.1880781089414183, "grad_norm": 0.4280615463958759, "learning_rate": 3.773984798366389e-05, "loss": 0.4006, "step": 289 }, { "epoch": 1.1921891058581706, "grad_norm": 0.3513007234774324, "learning_rate": 3.7713228770692084e-05, "loss": 0.3819, "step": 290 }, { "epoch": 1.196300102774923, "grad_norm": 0.4230975793009198, "learning_rate": 3.768646321980824e-05, "loss": 0.3819, "step": 291 }, { "epoch": 1.2004110996916753, "grad_norm": 0.38816726480644864, "learning_rate": 3.765955155213607e-05, "loss": 0.391, "step": 292 }, { "epoch": 1.2045220966084276, "grad_norm": 0.41570862488108373, "learning_rate": 3.763249399000643e-05, "loss": 0.3879, "step": 293 }, { "epoch": 1.20863309352518, "grad_norm": 0.5386712701377521, "learning_rate": 3.7605290756955476e-05, "loss": 0.404, "step": 294 }, { "epoch": 1.2127440904419322, "grad_norm": 0.4142838391786987, "learning_rate": 3.757794207772283e-05, "loss": 0.394, "step": 295 }, { "epoch": 1.2168550873586845, "grad_norm": 0.3918702689073396, "learning_rate": 3.755044817824971e-05, "loss": 0.3833, "step": 296 }, { "epoch": 1.2209660842754368, "grad_norm": 0.4790045020269064, "learning_rate": 3.752280928567709e-05, "loss": 0.3827, "step": 297 }, { "epoch": 1.2250770811921892, "grad_norm": 0.4071852105252518, "learning_rate": 3.749502562834379e-05, "loss": 0.3972, "step": 298 }, { "epoch": 1.2291880781089415, "grad_norm": 0.433522303940447, "learning_rate": 3.746709743578462e-05, "loss": 0.3985, "step": 299 }, { "epoch": 1.2332990750256938, "grad_norm": 0.4222484903892338, "learning_rate": 3.7439024938728435e-05, "loss": 0.384, "step": 300 }, { "epoch": 1.2374100719424461, "grad_norm": 0.42821966368019687, "learning_rate": 3.74108083690963e-05, "loss": 0.3908, "step": 301 }, { "epoch": 1.2415210688591984, "grad_norm": 0.5269787553817297, "learning_rate": 3.7382447959999514e-05, "loss": 0.3869, "step": 302 }, { "epoch": 1.2456320657759508, "grad_norm": 0.4206960432187445, "learning_rate": 3.7353943945737716e-05, "loss": 0.3984, "step": 303 }, { "epoch": 1.249743062692703, "grad_norm": 0.3963715348953228, "learning_rate": 3.7325296561796936e-05, "loss": 0.3908, "step": 304 }, { "epoch": 1.2538540596094552, "grad_norm": 0.5197873707406762, "learning_rate": 3.729650604484766e-05, "loss": 0.3789, "step": 305 }, { "epoch": 1.2579650565262077, "grad_norm": 0.391975059464178, "learning_rate": 3.7267572632742846e-05, "loss": 0.39, "step": 306 }, { "epoch": 1.2620760534429598, "grad_norm": 0.4297553917483092, "learning_rate": 3.7238496564516006e-05, "loss": 0.398, "step": 307 }, { "epoch": 1.2661870503597124, "grad_norm": 0.32415884735671224, "learning_rate": 3.720927808037921e-05, "loss": 0.385, "step": 308 }, { "epoch": 1.2702980472764644, "grad_norm": 0.4050462187721075, "learning_rate": 3.717991742172106e-05, "loss": 0.3801, "step": 309 }, { "epoch": 1.274409044193217, "grad_norm": 0.44040991415716113, "learning_rate": 3.7150414831104765e-05, "loss": 0.3936, "step": 310 }, { "epoch": 1.278520041109969, "grad_norm": 0.4117947843277416, "learning_rate": 3.712077055226611e-05, "loss": 0.3966, "step": 311 }, { "epoch": 1.2826310380267214, "grad_norm": 0.4039039643321521, "learning_rate": 3.7090984830111415e-05, "loss": 0.3863, "step": 312 }, { "epoch": 1.2867420349434737, "grad_norm": 0.39088426091872597, "learning_rate": 3.7061057910715546e-05, "loss": 0.4019, "step": 313 }, { "epoch": 1.290853031860226, "grad_norm": 0.3364663722128402, "learning_rate": 3.703099004131988e-05, "loss": 0.389, "step": 314 }, { "epoch": 1.2949640287769784, "grad_norm": 0.396387529395801, "learning_rate": 3.700078147033023e-05, "loss": 0.3826, "step": 315 }, { "epoch": 1.2990750256937307, "grad_norm": 0.4034497950317108, "learning_rate": 3.697043244731484e-05, "loss": 0.387, "step": 316 }, { "epoch": 1.303186022610483, "grad_norm": 0.45567545271356036, "learning_rate": 3.693994322300228e-05, "loss": 0.3903, "step": 317 }, { "epoch": 1.3072970195272353, "grad_norm": 0.36949826512347733, "learning_rate": 3.69093140492794e-05, "loss": 0.3907, "step": 318 }, { "epoch": 1.3114080164439876, "grad_norm": 0.3907383409192243, "learning_rate": 3.687854517918926e-05, "loss": 0.3884, "step": 319 }, { "epoch": 1.31551901336074, "grad_norm": 0.400771927655429, "learning_rate": 3.684763686692898e-05, "loss": 0.3897, "step": 320 }, { "epoch": 1.3196300102774923, "grad_norm": 0.28349821982969425, "learning_rate": 3.681658936784773e-05, "loss": 0.3819, "step": 321 }, { "epoch": 1.3237410071942446, "grad_norm": 0.3707057575475429, "learning_rate": 3.678540293844455e-05, "loss": 0.4029, "step": 322 }, { "epoch": 1.327852004110997, "grad_norm": 0.3571877683162145, "learning_rate": 3.675407783636624e-05, "loss": 0.3888, "step": 323 }, { "epoch": 1.3319630010277492, "grad_norm": 0.35550987334717343, "learning_rate": 3.672261432040527e-05, "loss": 0.388, "step": 324 }, { "epoch": 1.3360739979445015, "grad_norm": 0.33342159219384704, "learning_rate": 3.6691012650497605e-05, "loss": 0.3949, "step": 325 }, { "epoch": 1.3401849948612539, "grad_norm": 0.33685885775370095, "learning_rate": 3.665927308772057e-05, "loss": 0.3801, "step": 326 }, { "epoch": 1.3442959917780062, "grad_norm": 0.3507256755448898, "learning_rate": 3.6627395894290685e-05, "loss": 0.4011, "step": 327 }, { "epoch": 1.3484069886947585, "grad_norm": 0.3729058607264493, "learning_rate": 3.659538133356153e-05, "loss": 0.3841, "step": 328 }, { "epoch": 1.3525179856115108, "grad_norm": 0.35654106083716275, "learning_rate": 3.656322967002151e-05, "loss": 0.3798, "step": 329 }, { "epoch": 1.3566289825282631, "grad_norm": 0.34899761497768883, "learning_rate": 3.6530941169291744e-05, "loss": 0.3769, "step": 330 }, { "epoch": 1.3607399794450155, "grad_norm": 0.3500913238620904, "learning_rate": 3.649851609812379e-05, "loss": 0.4005, "step": 331 }, { "epoch": 1.3648509763617678, "grad_norm": 0.4370742910901644, "learning_rate": 3.646595472439753e-05, "loss": 0.3812, "step": 332 }, { "epoch": 1.36896197327852, "grad_norm": 0.4531455394409143, "learning_rate": 3.643325731711888e-05, "loss": 0.3949, "step": 333 }, { "epoch": 1.3730729701952724, "grad_norm": 0.3382908051688983, "learning_rate": 3.6400424146417604e-05, "loss": 0.3951, "step": 334 }, { "epoch": 1.3771839671120247, "grad_norm": 0.48124284386355537, "learning_rate": 3.6367455483545066e-05, "loss": 0.3886, "step": 335 }, { "epoch": 1.381294964028777, "grad_norm": 0.4073484304811201, "learning_rate": 3.633435160087202e-05, "loss": 0.3833, "step": 336 }, { "epoch": 1.3854059609455294, "grad_norm": 0.37602122616857575, "learning_rate": 3.6301112771886315e-05, "loss": 0.3947, "step": 337 }, { "epoch": 1.3895169578622815, "grad_norm": 0.35827821253734476, "learning_rate": 3.62677392711907e-05, "loss": 0.39, "step": 338 }, { "epoch": 1.393627954779034, "grad_norm": 0.40220244427058716, "learning_rate": 3.623423137450046e-05, "loss": 0.3912, "step": 339 }, { "epoch": 1.397738951695786, "grad_norm": 0.3520064901329717, "learning_rate": 3.620058935864123e-05, "loss": 0.3902, "step": 340 }, { "epoch": 1.4018499486125386, "grad_norm": 0.3470206706484027, "learning_rate": 3.616681350154666e-05, "loss": 0.3817, "step": 341 }, { "epoch": 1.4059609455292907, "grad_norm": 0.3684394787845421, "learning_rate": 3.613290408225615e-05, "loss": 0.3827, "step": 342 }, { "epoch": 1.4100719424460433, "grad_norm": 0.34749273169540446, "learning_rate": 3.609886138091247e-05, "loss": 0.3874, "step": 343 }, { "epoch": 1.4141829393627954, "grad_norm": 0.36675012503855753, "learning_rate": 3.606468567875957e-05, "loss": 0.3863, "step": 344 }, { "epoch": 1.418293936279548, "grad_norm": 0.3087501681001265, "learning_rate": 3.603037725814014e-05, "loss": 0.3878, "step": 345 }, { "epoch": 1.4224049331963, "grad_norm": 0.44389782737034467, "learning_rate": 3.599593640249334e-05, "loss": 0.3775, "step": 346 }, { "epoch": 1.4265159301130523, "grad_norm": 0.40184525645257135, "learning_rate": 3.5961363396352435e-05, "loss": 0.3878, "step": 347 }, { "epoch": 1.4306269270298047, "grad_norm": 0.3739195862038827, "learning_rate": 3.592665852534246e-05, "loss": 0.3882, "step": 348 }, { "epoch": 1.434737923946557, "grad_norm": 0.4667442608373335, "learning_rate": 3.589182207617785e-05, "loss": 0.3848, "step": 349 }, { "epoch": 1.4388489208633093, "grad_norm": 0.2954516082957151, "learning_rate": 3.5856854336660075e-05, "loss": 0.3737, "step": 350 }, { "epoch": 1.4429599177800616, "grad_norm": 0.3807234664590424, "learning_rate": 3.582175559567524e-05, "loss": 0.394, "step": 351 }, { "epoch": 1.447070914696814, "grad_norm": 0.37991638726226773, "learning_rate": 3.578652614319177e-05, "loss": 0.3924, "step": 352 }, { "epoch": 1.4511819116135662, "grad_norm": 0.4605647523732803, "learning_rate": 3.575116627025791e-05, "loss": 0.3895, "step": 353 }, { "epoch": 1.4552929085303186, "grad_norm": 0.34460237531655397, "learning_rate": 3.571567626899939e-05, "loss": 0.3979, "step": 354 }, { "epoch": 1.4594039054470709, "grad_norm": 0.37502366115502783, "learning_rate": 3.568005643261701e-05, "loss": 0.3865, "step": 355 }, { "epoch": 1.4635149023638232, "grad_norm": 0.361240868061172, "learning_rate": 3.5644307055384204e-05, "loss": 0.3927, "step": 356 }, { "epoch": 1.4676258992805755, "grad_norm": 0.36549527451613106, "learning_rate": 3.5608428432644574e-05, "loss": 0.3906, "step": 357 }, { "epoch": 1.4717368961973278, "grad_norm": 0.3841131971215696, "learning_rate": 3.557242086080953e-05, "loss": 0.3882, "step": 358 }, { "epoch": 1.4758478931140802, "grad_norm": 0.3053174499069298, "learning_rate": 3.5536284637355766e-05, "loss": 0.3882, "step": 359 }, { "epoch": 1.4799588900308325, "grad_norm": 0.42930198135043723, "learning_rate": 3.5500020060822844e-05, "loss": 0.39, "step": 360 }, { "epoch": 1.4840698869475848, "grad_norm": 0.3646026910744666, "learning_rate": 3.54636274308107e-05, "loss": 0.3919, "step": 361 }, { "epoch": 1.4881808838643371, "grad_norm": 0.4584181730800767, "learning_rate": 3.542710704797721e-05, "loss": 0.3841, "step": 362 }, { "epoch": 1.4922918807810894, "grad_norm": 0.3912766247821292, "learning_rate": 3.539045921403566e-05, "loss": 0.375, "step": 363 }, { "epoch": 1.4964028776978417, "grad_norm": 0.39561662555483357, "learning_rate": 3.5353684231752276e-05, "loss": 0.3884, "step": 364 }, { "epoch": 1.500513874614594, "grad_norm": 0.33669597693884484, "learning_rate": 3.531678240494373e-05, "loss": 0.3953, "step": 365 }, { "epoch": 1.5046248715313464, "grad_norm": 0.4156836645972758, "learning_rate": 3.5279754038474616e-05, "loss": 0.3864, "step": 366 }, { "epoch": 1.5087358684480987, "grad_norm": 0.3888603103920021, "learning_rate": 3.524259943825493e-05, "loss": 0.3864, "step": 367 }, { "epoch": 1.512846865364851, "grad_norm": 0.34153109888601435, "learning_rate": 3.5205318911237566e-05, "loss": 0.3829, "step": 368 }, { "epoch": 1.5169578622816033, "grad_norm": 0.4203599723923179, "learning_rate": 3.516791276541574e-05, "loss": 0.391, "step": 369 }, { "epoch": 1.5210688591983557, "grad_norm": 0.39707036421576897, "learning_rate": 3.5130381309820474e-05, "loss": 0.3852, "step": 370 }, { "epoch": 1.5251798561151078, "grad_norm": 0.35484540902249145, "learning_rate": 3.509272485451806e-05, "loss": 0.3813, "step": 371 }, { "epoch": 1.5292908530318603, "grad_norm": 0.35726960151965814, "learning_rate": 3.5054943710607435e-05, "loss": 0.3943, "step": 372 }, { "epoch": 1.5334018499486124, "grad_norm": 0.34918237917940137, "learning_rate": 3.50170381902177e-05, "loss": 0.3813, "step": 373 }, { "epoch": 1.537512846865365, "grad_norm": 0.3225637816337971, "learning_rate": 3.497900860650545e-05, "loss": 0.3818, "step": 374 }, { "epoch": 1.541623843782117, "grad_norm": 0.3243987867777615, "learning_rate": 3.494085527365224e-05, "loss": 0.3759, "step": 375 }, { "epoch": 1.5457348406988696, "grad_norm": 0.3158935559652955, "learning_rate": 3.4902578506861995e-05, "loss": 0.3893, "step": 376 }, { "epoch": 1.5498458376156217, "grad_norm": 0.3815644429337655, "learning_rate": 3.486417862235839e-05, "loss": 0.3905, "step": 377 }, { "epoch": 1.5539568345323742, "grad_norm": 0.3118180182058997, "learning_rate": 3.4825655937382216e-05, "loss": 0.3865, "step": 378 }, { "epoch": 1.5580678314491263, "grad_norm": 0.3841430312682266, "learning_rate": 3.4787010770188795e-05, "loss": 0.3932, "step": 379 }, { "epoch": 1.5621788283658788, "grad_norm": 0.39242889501386036, "learning_rate": 3.474824344004534e-05, "loss": 0.3906, "step": 380 }, { "epoch": 1.566289825282631, "grad_norm": 0.3632635332732287, "learning_rate": 3.4709354267228294e-05, "loss": 0.3783, "step": 381 }, { "epoch": 1.5704008221993835, "grad_norm": 0.4216314417617418, "learning_rate": 3.467034357302073e-05, "loss": 0.3816, "step": 382 }, { "epoch": 1.5745118191161356, "grad_norm": 0.410057885099804, "learning_rate": 3.463121167970966e-05, "loss": 0.3843, "step": 383 }, { "epoch": 1.5786228160328881, "grad_norm": 0.30821430114214227, "learning_rate": 3.4591958910583365e-05, "loss": 0.3871, "step": 384 }, { "epoch": 1.5827338129496402, "grad_norm": 0.3935547600639123, "learning_rate": 3.455258558992877e-05, "loss": 0.379, "step": 385 }, { "epoch": 1.5868448098663928, "grad_norm": 0.3250409563547552, "learning_rate": 3.451309204302873e-05, "loss": 0.3801, "step": 386 }, { "epoch": 1.5909558067831449, "grad_norm": 0.356548846946166, "learning_rate": 3.447347859615933e-05, "loss": 0.379, "step": 387 }, { "epoch": 1.5950668036998972, "grad_norm": 0.361163416612143, "learning_rate": 3.443374557658723e-05, "loss": 0.3745, "step": 388 }, { "epoch": 1.5991778006166495, "grad_norm": 0.3446242948127641, "learning_rate": 3.439389331256694e-05, "loss": 0.3807, "step": 389 }, { "epoch": 1.6032887975334018, "grad_norm": 0.33333480141647187, "learning_rate": 3.435392213333809e-05, "loss": 0.3832, "step": 390 }, { "epoch": 1.6073997944501541, "grad_norm": 0.38066181381339836, "learning_rate": 3.431383236912275e-05, "loss": 0.3692, "step": 391 }, { "epoch": 1.6115107913669064, "grad_norm": 0.3255044440323713, "learning_rate": 3.427362435112268e-05, "loss": 0.3728, "step": 392 }, { "epoch": 1.6156217882836588, "grad_norm": 0.41417790723734144, "learning_rate": 3.423329841151656e-05, "loss": 0.3868, "step": 393 }, { "epoch": 1.619732785200411, "grad_norm": 0.32170041659499554, "learning_rate": 3.4192854883457326e-05, "loss": 0.3724, "step": 394 }, { "epoch": 1.6238437821171634, "grad_norm": 0.3522389078445349, "learning_rate": 3.4152294101069345e-05, "loss": 0.3755, "step": 395 }, { "epoch": 1.6279547790339157, "grad_norm": 0.3154196575435205, "learning_rate": 3.411161639944568e-05, "loss": 0.3866, "step": 396 }, { "epoch": 1.632065775950668, "grad_norm": 0.3883625817054837, "learning_rate": 3.407082211464534e-05, "loss": 0.3842, "step": 397 }, { "epoch": 1.6361767728674204, "grad_norm": 0.32478029230772587, "learning_rate": 3.402991158369047e-05, "loss": 0.3856, "step": 398 }, { "epoch": 1.6402877697841727, "grad_norm": 0.33777536538509645, "learning_rate": 3.39888851445636e-05, "loss": 0.3738, "step": 399 }, { "epoch": 1.644398766700925, "grad_norm": 0.3645535574440166, "learning_rate": 3.394774313620481e-05, "loss": 0.3768, "step": 400 }, { "epoch": 1.6485097636176773, "grad_norm": 0.33553965225554366, "learning_rate": 3.390648589850897e-05, "loss": 0.3854, "step": 401 }, { "epoch": 1.6526207605344296, "grad_norm": 0.35131544263569836, "learning_rate": 3.386511377232293e-05, "loss": 0.383, "step": 402 }, { "epoch": 1.656731757451182, "grad_norm": 0.3083698384899604, "learning_rate": 3.382362709944268e-05, "loss": 0.3913, "step": 403 }, { "epoch": 1.6608427543679343, "grad_norm": 0.3444920510980315, "learning_rate": 3.3782026222610525e-05, "loss": 0.3912, "step": 404 }, { "epoch": 1.6649537512846866, "grad_norm": 0.3109066824781155, "learning_rate": 3.374031148551229e-05, "loss": 0.3785, "step": 405 }, { "epoch": 1.6690647482014387, "grad_norm": 0.393332877111885, "learning_rate": 3.3698483232774435e-05, "loss": 0.3811, "step": 406 }, { "epoch": 1.6731757451181912, "grad_norm": 0.35010985881480106, "learning_rate": 3.365654180996126e-05, "loss": 0.3765, "step": 407 }, { "epoch": 1.6772867420349433, "grad_norm": 0.3994860261819717, "learning_rate": 3.361448756357199e-05, "loss": 0.3855, "step": 408 }, { "epoch": 1.6813977389516959, "grad_norm": 0.4026873313554007, "learning_rate": 3.3572320841037945e-05, "loss": 0.3776, "step": 409 }, { "epoch": 1.685508735868448, "grad_norm": 0.3888166731552757, "learning_rate": 3.353004199071969e-05, "loss": 0.389, "step": 410 }, { "epoch": 1.6896197327852005, "grad_norm": 0.4229642214250034, "learning_rate": 3.348765136190412e-05, "loss": 0.3844, "step": 411 }, { "epoch": 1.6937307297019526, "grad_norm": 0.3719493753316055, "learning_rate": 3.344514930480158e-05, "loss": 0.3718, "step": 412 }, { "epoch": 1.6978417266187051, "grad_norm": 0.3750792470447336, "learning_rate": 3.3402536170542985e-05, "loss": 0.4017, "step": 413 }, { "epoch": 1.7019527235354572, "grad_norm": 0.37953623181883855, "learning_rate": 3.335981231117694e-05, "loss": 0.3786, "step": 414 }, { "epoch": 1.7060637204522098, "grad_norm": 0.42228613250314784, "learning_rate": 3.331697807966676e-05, "loss": 0.3902, "step": 415 }, { "epoch": 1.7101747173689619, "grad_norm": 0.33605301616513616, "learning_rate": 3.327403382988764e-05, "loss": 0.382, "step": 416 }, { "epoch": 1.7142857142857144, "grad_norm": 0.915407319860973, "learning_rate": 3.3230979916623667e-05, "loss": 0.3868, "step": 417 }, { "epoch": 1.7183967112024665, "grad_norm": 0.4415883382317921, "learning_rate": 3.318781669556493e-05, "loss": 0.4025, "step": 418 }, { "epoch": 1.722507708119219, "grad_norm": 0.29764556226533273, "learning_rate": 3.3144544523304545e-05, "loss": 0.3868, "step": 419 }, { "epoch": 1.7266187050359711, "grad_norm": 0.3864981881512229, "learning_rate": 3.310116375733575e-05, "loss": 0.3848, "step": 420 }, { "epoch": 1.7307297019527237, "grad_norm": 0.4237534589835872, "learning_rate": 3.3057674756048906e-05, "loss": 0.3884, "step": 421 }, { "epoch": 1.7348406988694758, "grad_norm": 0.30622506767945284, "learning_rate": 3.30140778787286e-05, "loss": 0.3962, "step": 422 }, { "epoch": 1.7389516957862283, "grad_norm": 0.3715870543554042, "learning_rate": 3.297037348555059e-05, "loss": 0.3804, "step": 423 }, { "epoch": 1.7430626927029804, "grad_norm": 0.3158873451974222, "learning_rate": 3.292656193757891e-05, "loss": 0.3808, "step": 424 }, { "epoch": 1.7471736896197327, "grad_norm": 3.5993500626700534, "learning_rate": 3.2882643596762847e-05, "loss": 0.3766, "step": 425 }, { "epoch": 1.751284686536485, "grad_norm": 0.41799220024756045, "learning_rate": 3.283861882593394e-05, "loss": 0.3629, "step": 426 }, { "epoch": 1.7553956834532374, "grad_norm": 0.4028165918419239, "learning_rate": 3.2794487988803024e-05, "loss": 0.3946, "step": 427 }, { "epoch": 1.7595066803699897, "grad_norm": 0.45312099756724705, "learning_rate": 3.275025144995719e-05, "loss": 0.3826, "step": 428 }, { "epoch": 1.763617677286742, "grad_norm": 0.3682320829470106, "learning_rate": 3.270590957485678e-05, "loss": 0.3822, "step": 429 }, { "epoch": 1.7677286742034943, "grad_norm": 0.432471521500914, "learning_rate": 3.266146272983238e-05, "loss": 0.379, "step": 430 }, { "epoch": 1.7718396711202467, "grad_norm": 0.49258814180632715, "learning_rate": 3.261691128208178e-05, "loss": 0.3781, "step": 431 }, { "epoch": 1.775950668036999, "grad_norm": 0.42039354140050533, "learning_rate": 3.2572255599666946e-05, "loss": 0.3858, "step": 432 }, { "epoch": 1.7800616649537513, "grad_norm": 0.4266657139962505, "learning_rate": 3.252749605151099e-05, "loss": 0.3889, "step": 433 }, { "epoch": 1.7841726618705036, "grad_norm": 0.4111540760053901, "learning_rate": 3.24826330073951e-05, "loss": 0.3828, "step": 434 }, { "epoch": 1.788283658787256, "grad_norm": 0.40054562650751135, "learning_rate": 3.2437666837955495e-05, "loss": 0.3821, "step": 435 }, { "epoch": 1.7923946557040082, "grad_norm": 0.4049883565747011, "learning_rate": 3.239259791468037e-05, "loss": 0.3782, "step": 436 }, { "epoch": 1.7965056526207606, "grad_norm": 0.3211989179680821, "learning_rate": 3.234742660990681e-05, "loss": 0.3886, "step": 437 }, { "epoch": 1.8006166495375129, "grad_norm": 0.3415159428416263, "learning_rate": 3.230215329681775e-05, "loss": 0.3865, "step": 438 }, { "epoch": 1.8047276464542652, "grad_norm": 0.30795596054473745, "learning_rate": 3.225677834943884e-05, "loss": 0.3798, "step": 439 }, { "epoch": 1.8088386433710175, "grad_norm": 0.3527630027822489, "learning_rate": 3.22113021426354e-05, "loss": 0.371, "step": 440 }, { "epoch": 1.8129496402877698, "grad_norm": 0.38597884530280835, "learning_rate": 3.216572505210929e-05, "loss": 0.386, "step": 441 }, { "epoch": 1.8170606372045222, "grad_norm": 0.35477892953521534, "learning_rate": 3.2120047454395845e-05, "loss": 0.3837, "step": 442 }, { "epoch": 1.8211716341212743, "grad_norm": 0.34702546052353167, "learning_rate": 3.207426972686071e-05, "loss": 0.3892, "step": 443 }, { "epoch": 1.8252826310380268, "grad_norm": 0.30619045437996395, "learning_rate": 3.202839224769678e-05, "loss": 0.3911, "step": 444 }, { "epoch": 1.829393627954779, "grad_norm": 0.308117763052393, "learning_rate": 3.198241539592103e-05, "loss": 0.388, "step": 445 }, { "epoch": 1.8335046248715314, "grad_norm": 0.3813900684937835, "learning_rate": 3.1936339551371416e-05, "loss": 0.3733, "step": 446 }, { "epoch": 1.8376156217882835, "grad_norm": 0.37451609765152405, "learning_rate": 3.1890165094703704e-05, "loss": 0.382, "step": 447 }, { "epoch": 1.841726618705036, "grad_norm": 0.3343641229801653, "learning_rate": 3.184389240738838e-05, "loss": 0.3843, "step": 448 }, { "epoch": 1.8458376156217882, "grad_norm": 0.3671589935937082, "learning_rate": 3.179752187170741e-05, "loss": 0.3914, "step": 449 }, { "epoch": 1.8499486125385407, "grad_norm": 0.3997665963907156, "learning_rate": 3.1751053870751184e-05, "loss": 0.3843, "step": 450 }, { "epoch": 1.8540596094552928, "grad_norm": 0.3253860699538578, "learning_rate": 3.1704488788415274e-05, "loss": 0.3855, "step": 451 }, { "epoch": 1.8581706063720453, "grad_norm": 0.3876573196918091, "learning_rate": 3.16578270093973e-05, "loss": 0.386, "step": 452 }, { "epoch": 1.8622816032887974, "grad_norm": 0.38773352168091224, "learning_rate": 3.1611068919193756e-05, "loss": 0.3783, "step": 453 }, { "epoch": 1.86639260020555, "grad_norm": 0.31178745493034576, "learning_rate": 3.1564214904096774e-05, "loss": 0.385, "step": 454 }, { "epoch": 1.870503597122302, "grad_norm": 0.5723565190756046, "learning_rate": 3.1517265351191e-05, "loss": 0.3841, "step": 455 }, { "epoch": 1.8746145940390546, "grad_norm": 0.46777702023370726, "learning_rate": 3.147022064835036e-05, "loss": 0.385, "step": 456 }, { "epoch": 1.8787255909558067, "grad_norm": 0.48665108942706403, "learning_rate": 3.142308118423485e-05, "loss": 0.3808, "step": 457 }, { "epoch": 1.8828365878725593, "grad_norm": 0.5712910353884142, "learning_rate": 3.1375847348287365e-05, "loss": 0.3898, "step": 458 }, { "epoch": 1.8869475847893113, "grad_norm": 0.4140740468743388, "learning_rate": 3.132851953073041e-05, "loss": 0.3823, "step": 459 }, { "epoch": 1.8910585817060637, "grad_norm": 0.4981734090282241, "learning_rate": 3.128109812256296e-05, "loss": 0.379, "step": 460 }, { "epoch": 1.895169578622816, "grad_norm": 0.3901725689405749, "learning_rate": 3.1233583515557166e-05, "loss": 0.3802, "step": 461 }, { "epoch": 1.8992805755395683, "grad_norm": 0.39288710655716796, "learning_rate": 3.118597610225514e-05, "loss": 0.3648, "step": 462 }, { "epoch": 1.9033915724563206, "grad_norm": 0.3943553998150945, "learning_rate": 3.113827627596575e-05, "loss": 0.3845, "step": 463 }, { "epoch": 1.907502569373073, "grad_norm": 0.3815649604071033, "learning_rate": 3.1090484430761275e-05, "loss": 0.3968, "step": 464 }, { "epoch": 1.9116135662898253, "grad_norm": 0.4341716993023021, "learning_rate": 3.104260096147426e-05, "loss": 0.3825, "step": 465 }, { "epoch": 1.9157245632065776, "grad_norm": 0.2885815389134767, "learning_rate": 3.099462626369418e-05, "loss": 0.379, "step": 466 }, { "epoch": 1.91983556012333, "grad_norm": 0.3706179172517124, "learning_rate": 3.094656073376419e-05, "loss": 0.3882, "step": 467 }, { "epoch": 1.9239465570400822, "grad_norm": 0.3443004696246589, "learning_rate": 3.0898404768777863e-05, "loss": 0.3855, "step": 468 }, { "epoch": 1.9280575539568345, "grad_norm": 0.29420490623628953, "learning_rate": 3.0850158766575907e-05, "loss": 0.3843, "step": 469 }, { "epoch": 1.9321685508735869, "grad_norm": 0.3876924984247156, "learning_rate": 3.080182312574286e-05, "loss": 0.3746, "step": 470 }, { "epoch": 1.9362795477903392, "grad_norm": 0.28747642038559285, "learning_rate": 3.075339824560382e-05, "loss": 0.3718, "step": 471 }, { "epoch": 1.9403905447070915, "grad_norm": 0.32380146376848085, "learning_rate": 3.070488452622113e-05, "loss": 0.3934, "step": 472 }, { "epoch": 1.9445015416238438, "grad_norm": 0.32465438985148803, "learning_rate": 3.0656282368391086e-05, "loss": 0.3729, "step": 473 }, { "epoch": 1.9486125385405961, "grad_norm": 0.35563518327266175, "learning_rate": 3.0607592173640615e-05, "loss": 0.3795, "step": 474 }, { "epoch": 1.9527235354573484, "grad_norm": 0.331866902145928, "learning_rate": 3.055881434422395e-05, "loss": 0.3981, "step": 475 }, { "epoch": 1.9568345323741008, "grad_norm": 0.3342210067868538, "learning_rate": 3.0509949283119348e-05, "loss": 0.3717, "step": 476 }, { "epoch": 1.960945529290853, "grad_norm": 0.33585492528175326, "learning_rate": 3.0460997394025694e-05, "loss": 0.3993, "step": 477 }, { "epoch": 1.9650565262076052, "grad_norm": 0.31011270781830746, "learning_rate": 3.0411959081359223e-05, "loss": 0.3865, "step": 478 }, { "epoch": 1.9691675231243577, "grad_norm": 0.35530598698818877, "learning_rate": 3.036283475025016e-05, "loss": 0.3784, "step": 479 }, { "epoch": 1.9732785200411098, "grad_norm": 0.3734052740131826, "learning_rate": 3.031362480653937e-05, "loss": 0.3762, "step": 480 }, { "epoch": 1.9773895169578624, "grad_norm": 0.3014940676108034, "learning_rate": 3.0264329656775e-05, "loss": 0.3757, "step": 481 }, { "epoch": 1.9815005138746145, "grad_norm": 0.3512117145148321, "learning_rate": 3.021494970820912e-05, "loss": 0.3827, "step": 482 }, { "epoch": 1.985611510791367, "grad_norm": 0.37355802443996994, "learning_rate": 3.01654853687944e-05, "loss": 0.3642, "step": 483 }, { "epoch": 1.989722507708119, "grad_norm": 0.2861615252457176, "learning_rate": 3.011593704718067e-05, "loss": 0.3963, "step": 484 }, { "epoch": 1.9938335046248716, "grad_norm": 0.3745753953644458, "learning_rate": 3.0066305152711598e-05, "loss": 0.3878, "step": 485 }, { "epoch": 1.9979445015416237, "grad_norm": 0.26150625589651816, "learning_rate": 3.0016590095421273e-05, "loss": 0.3721, "step": 486 }, { "epoch": 2.0020554984583763, "grad_norm": 0.3918210766291012, "learning_rate": 2.9966792286030853e-05, "loss": 0.3396, "step": 487 }, { "epoch": 2.0061664953751284, "grad_norm": 0.315886174265335, "learning_rate": 2.9916912135945147e-05, "loss": 0.3326, "step": 488 }, { "epoch": 2.010277492291881, "grad_norm": 0.5139005301093035, "learning_rate": 2.986695005724921e-05, "loss": 0.3331, "step": 489 }, { "epoch": 2.014388489208633, "grad_norm": 0.4039956111942429, "learning_rate": 2.9816906462704963e-05, "loss": 0.3318, "step": 490 }, { "epoch": 2.0184994861253855, "grad_norm": 0.3643964107370674, "learning_rate": 2.9766781765747775e-05, "loss": 0.331, "step": 491 }, { "epoch": 2.0226104830421376, "grad_norm": 0.36816703584916016, "learning_rate": 2.971657638048302e-05, "loss": 0.3318, "step": 492 }, { "epoch": 2.02672147995889, "grad_norm": 0.42271803167213406, "learning_rate": 2.966629072168271e-05, "loss": 0.3344, "step": 493 }, { "epoch": 2.0308324768756423, "grad_norm": 0.3930653905455099, "learning_rate": 2.9615925204782006e-05, "loss": 0.3177, "step": 494 }, { "epoch": 2.034943473792395, "grad_norm": 0.40048080993718765, "learning_rate": 2.9565480245875845e-05, "loss": 0.3358, "step": 495 }, { "epoch": 2.039054470709147, "grad_norm": 0.3484759664627585, "learning_rate": 2.9514956261715458e-05, "loss": 0.3303, "step": 496 }, { "epoch": 2.0431654676258995, "grad_norm": 1.2952724176128951, "learning_rate": 2.9464353669704943e-05, "loss": 0.3683, "step": 497 }, { "epoch": 2.0472764645426516, "grad_norm": 0.3554075988337682, "learning_rate": 2.9413672887897828e-05, "loss": 0.34, "step": 498 }, { "epoch": 2.051387461459404, "grad_norm": 0.5002880458311052, "learning_rate": 2.936291433499359e-05, "loss": 0.3304, "step": 499 }, { "epoch": 2.055498458376156, "grad_norm": 0.37407564917246083, "learning_rate": 2.9312078430334228e-05, "loss": 0.3358, "step": 500 }, { "epoch": 2.0596094552929087, "grad_norm": 0.4077464439245042, "learning_rate": 2.926116559390078e-05, "loss": 0.3232, "step": 501 }, { "epoch": 2.063720452209661, "grad_norm": 0.3809714773736691, "learning_rate": 2.921017624630984e-05, "loss": 0.323, "step": 502 }, { "epoch": 2.0678314491264134, "grad_norm": 0.311417329278231, "learning_rate": 2.9159110808810125e-05, "loss": 0.3192, "step": 503 }, { "epoch": 2.0719424460431655, "grad_norm": 0.44030503896413653, "learning_rate": 2.9107969703278952e-05, "loss": 0.3354, "step": 504 }, { "epoch": 2.0760534429599176, "grad_norm": 0.3036316440870094, "learning_rate": 2.905675335221877e-05, "loss": 0.3299, "step": 505 }, { "epoch": 2.08016443987667, "grad_norm": 0.4247647928977746, "learning_rate": 2.900546217875368e-05, "loss": 0.3288, "step": 506 }, { "epoch": 2.084275436793422, "grad_norm": 0.3390034407042314, "learning_rate": 2.895409660662592e-05, "loss": 0.3328, "step": 507 }, { "epoch": 2.0883864337101747, "grad_norm": 0.4990365827984489, "learning_rate": 2.8902657060192366e-05, "loss": 0.3376, "step": 508 }, { "epoch": 2.092497430626927, "grad_norm": 0.5173705351976455, "learning_rate": 2.8851143964421048e-05, "loss": 0.3356, "step": 509 }, { "epoch": 2.0966084275436794, "grad_norm": 0.5151102205707064, "learning_rate": 2.879955774488762e-05, "loss": 0.332, "step": 510 }, { "epoch": 2.1007194244604315, "grad_norm": 0.43417520836094964, "learning_rate": 2.8747898827771846e-05, "loss": 0.3389, "step": 511 }, { "epoch": 2.104830421377184, "grad_norm": 0.5355654606933186, "learning_rate": 2.8696167639854073e-05, "loss": 0.341, "step": 512 }, { "epoch": 2.108941418293936, "grad_norm": 0.4367393823993611, "learning_rate": 2.864436460851173e-05, "loss": 0.3299, "step": 513 }, { "epoch": 2.1130524152106887, "grad_norm": 0.45783141095235763, "learning_rate": 2.8592490161715768e-05, "loss": 0.3191, "step": 514 }, { "epoch": 2.1171634121274407, "grad_norm": 0.42545114058633565, "learning_rate": 2.8540544728027145e-05, "loss": 0.3145, "step": 515 }, { "epoch": 2.1212744090441933, "grad_norm": 0.3661488589187853, "learning_rate": 2.8488528736593278e-05, "loss": 0.3275, "step": 516 }, { "epoch": 2.1253854059609454, "grad_norm": 0.5060616601470208, "learning_rate": 2.843644261714448e-05, "loss": 0.3384, "step": 517 }, { "epoch": 2.129496402877698, "grad_norm": 0.3580510131662911, "learning_rate": 2.8384286799990452e-05, "loss": 0.3296, "step": 518 }, { "epoch": 2.13360739979445, "grad_norm": 0.45075270681673163, "learning_rate": 2.8332061716016692e-05, "loss": 0.32, "step": 519 }, { "epoch": 2.1377183967112026, "grad_norm": 0.4708082264494772, "learning_rate": 2.8279767796680934e-05, "loss": 0.3332, "step": 520 }, { "epoch": 2.1418293936279547, "grad_norm": 0.35417572710043976, "learning_rate": 2.8227405474009616e-05, "loss": 0.325, "step": 521 }, { "epoch": 2.145940390544707, "grad_norm": 0.5178072041280041, "learning_rate": 2.817497518059428e-05, "loss": 0.3286, "step": 522 }, { "epoch": 2.1500513874614593, "grad_norm": 0.333153745006992, "learning_rate": 2.8122477349588005e-05, "loss": 0.3247, "step": 523 }, { "epoch": 2.154162384378212, "grad_norm": 0.5499040672396817, "learning_rate": 2.8069912414701842e-05, "loss": 0.3338, "step": 524 }, { "epoch": 2.158273381294964, "grad_norm": 0.29956362280088755, "learning_rate": 2.8017280810201213e-05, "loss": 0.3307, "step": 525 }, { "epoch": 2.1623843782117165, "grad_norm": 0.4276269537060341, "learning_rate": 2.7964582970902338e-05, "loss": 0.3263, "step": 526 }, { "epoch": 2.1664953751284686, "grad_norm": 0.3311312720633184, "learning_rate": 2.7911819332168627e-05, "loss": 0.3302, "step": 527 }, { "epoch": 2.170606372045221, "grad_norm": 0.32361165480350135, "learning_rate": 2.78589903299071e-05, "loss": 0.3307, "step": 528 }, { "epoch": 2.174717368961973, "grad_norm": 0.33128441736832326, "learning_rate": 2.7806096400564775e-05, "loss": 0.3234, "step": 529 }, { "epoch": 2.1788283658787257, "grad_norm": 0.2945513597575282, "learning_rate": 2.7753137981125068e-05, "loss": 0.3354, "step": 530 }, { "epoch": 2.182939362795478, "grad_norm": 0.35769877925150756, "learning_rate": 2.7700115509104176e-05, "loss": 0.336, "step": 531 }, { "epoch": 2.1870503597122304, "grad_norm": 0.3065613778661335, "learning_rate": 2.7647029422547465e-05, "loss": 0.3326, "step": 532 }, { "epoch": 2.1911613566289825, "grad_norm": 0.31377341167653106, "learning_rate": 2.7593880160025864e-05, "loss": 0.3354, "step": 533 }, { "epoch": 2.195272353545735, "grad_norm": 0.28252520009349796, "learning_rate": 2.754066816063222e-05, "loss": 0.3194, "step": 534 }, { "epoch": 2.199383350462487, "grad_norm": 0.2792714767775337, "learning_rate": 2.7487393863977687e-05, "loss": 0.3369, "step": 535 }, { "epoch": 2.2034943473792397, "grad_norm": 0.28232252447629436, "learning_rate": 2.7434057710188077e-05, "loss": 0.3157, "step": 536 }, { "epoch": 2.2076053442959918, "grad_norm": 0.25752143372328223, "learning_rate": 2.738066013990025e-05, "loss": 0.3153, "step": 537 }, { "epoch": 2.2117163412127443, "grad_norm": 0.297593757050134, "learning_rate": 2.732720159425845e-05, "loss": 0.3296, "step": 538 }, { "epoch": 2.2158273381294964, "grad_norm": 0.2758026361391992, "learning_rate": 2.7273682514910668e-05, "loss": 0.3247, "step": 539 }, { "epoch": 2.2199383350462485, "grad_norm": 0.3422530970797541, "learning_rate": 2.7220103344004995e-05, "loss": 0.3293, "step": 540 }, { "epoch": 2.224049331963001, "grad_norm": 0.3668216989996492, "learning_rate": 2.7166464524185977e-05, "loss": 0.3419, "step": 541 }, { "epoch": 2.2281603288797536, "grad_norm": 0.3353880633821636, "learning_rate": 2.7112766498590944e-05, "loss": 0.3277, "step": 542 }, { "epoch": 2.2322713257965057, "grad_norm": 0.3990592979092236, "learning_rate": 2.705900971084635e-05, "loss": 0.3352, "step": 543 }, { "epoch": 2.2363823227132578, "grad_norm": 0.34649020190108354, "learning_rate": 2.7005194605064122e-05, "loss": 0.3334, "step": 544 }, { "epoch": 2.2404933196300103, "grad_norm": 0.30730771299144677, "learning_rate": 2.6951321625837975e-05, "loss": 0.3299, "step": 545 }, { "epoch": 2.2446043165467624, "grad_norm": 0.3875753398426506, "learning_rate": 2.6897391218239746e-05, "loss": 0.3338, "step": 546 }, { "epoch": 2.248715313463515, "grad_norm": 0.27365792996452604, "learning_rate": 2.6843403827815714e-05, "loss": 0.3353, "step": 547 }, { "epoch": 2.252826310380267, "grad_norm": 0.3913041787492654, "learning_rate": 2.6789359900582935e-05, "loss": 0.3274, "step": 548 }, { "epoch": 2.2569373072970196, "grad_norm": 0.25296864218831433, "learning_rate": 2.673525988302553e-05, "loss": 0.344, "step": 549 }, { "epoch": 2.2610483042137717, "grad_norm": 0.3931054005221806, "learning_rate": 2.6681104222091018e-05, "loss": 0.3387, "step": 550 }, { "epoch": 2.265159301130524, "grad_norm": 0.25058187158942646, "learning_rate": 2.662689336518661e-05, "loss": 0.3306, "step": 551 }, { "epoch": 2.2692702980472763, "grad_norm": 0.34466530037047466, "learning_rate": 2.6572627760175523e-05, "loss": 0.334, "step": 552 }, { "epoch": 2.273381294964029, "grad_norm": 0.27034275974079125, "learning_rate": 2.6518307855373276e-05, "loss": 0.3245, "step": 553 }, { "epoch": 2.277492291880781, "grad_norm": 0.32305382508070213, "learning_rate": 2.6463934099543992e-05, "loss": 0.3337, "step": 554 }, { "epoch": 2.2816032887975335, "grad_norm": 0.2943172520547782, "learning_rate": 2.6409506941896665e-05, "loss": 0.336, "step": 555 }, { "epoch": 2.2857142857142856, "grad_norm": 0.3589575171976915, "learning_rate": 2.6355026832081493e-05, "loss": 0.331, "step": 556 }, { "epoch": 2.289825282631038, "grad_norm": 0.32434981120796447, "learning_rate": 2.6300494220186113e-05, "loss": 0.3318, "step": 557 }, { "epoch": 2.2939362795477902, "grad_norm": 0.3207602935494296, "learning_rate": 2.6245909556731937e-05, "loss": 0.3244, "step": 558 }, { "epoch": 2.2980472764645428, "grad_norm": 0.2994113594865251, "learning_rate": 2.6191273292670372e-05, "loss": 0.3342, "step": 559 }, { "epoch": 2.302158273381295, "grad_norm": 0.3188506407975691, "learning_rate": 2.6136585879379145e-05, "loss": 0.3394, "step": 560 }, { "epoch": 2.3062692702980474, "grad_norm": 0.3098768532791999, "learning_rate": 2.608184776865854e-05, "loss": 0.3289, "step": 561 }, { "epoch": 2.3103802672147995, "grad_norm": 0.30655390743691074, "learning_rate": 2.602705941272769e-05, "loss": 0.322, "step": 562 }, { "epoch": 2.314491264131552, "grad_norm": 0.3063020702615511, "learning_rate": 2.597222126422081e-05, "loss": 0.3332, "step": 563 }, { "epoch": 2.318602261048304, "grad_norm": 0.3156706802866886, "learning_rate": 2.5917333776183503e-05, "loss": 0.3228, "step": 564 }, { "epoch": 2.3227132579650567, "grad_norm": 0.28723121088598647, "learning_rate": 2.586239740206897e-05, "loss": 0.3197, "step": 565 }, { "epoch": 2.3268242548818088, "grad_norm": 0.30433476866497944, "learning_rate": 2.5807412595734283e-05, "loss": 0.3279, "step": 566 }, { "epoch": 2.3309352517985613, "grad_norm": 0.25756394575456126, "learning_rate": 2.5752379811436655e-05, "loss": 0.324, "step": 567 }, { "epoch": 2.3350462487153134, "grad_norm": 0.3098697810639567, "learning_rate": 2.5697299503829657e-05, "loss": 0.3347, "step": 568 }, { "epoch": 2.339157245632066, "grad_norm": 0.30837147852538255, "learning_rate": 2.5642172127959475e-05, "loss": 0.3292, "step": 569 }, { "epoch": 2.343268242548818, "grad_norm": 0.3069265524451522, "learning_rate": 2.558699813926115e-05, "loss": 0.3323, "step": 570 }, { "epoch": 2.3473792394655706, "grad_norm": 0.32407359000068336, "learning_rate": 2.5531777993554813e-05, "loss": 0.3317, "step": 571 }, { "epoch": 2.3514902363823227, "grad_norm": 0.3118278827236543, "learning_rate": 2.5476512147041926e-05, "loss": 0.3428, "step": 572 }, { "epoch": 2.3556012332990752, "grad_norm": 0.3342461379651357, "learning_rate": 2.5421201056301507e-05, "loss": 0.3284, "step": 573 }, { "epoch": 2.3597122302158273, "grad_norm": 0.29958642203118996, "learning_rate": 2.5365845178286358e-05, "loss": 0.3275, "step": 574 }, { "epoch": 2.3638232271325794, "grad_norm": 0.3328808710382115, "learning_rate": 2.5310444970319292e-05, "loss": 0.3301, "step": 575 }, { "epoch": 2.367934224049332, "grad_norm": 0.3782109254880134, "learning_rate": 2.525500089008936e-05, "loss": 0.3375, "step": 576 }, { "epoch": 2.3720452209660845, "grad_norm": 0.3451832289715049, "learning_rate": 2.5199513395648047e-05, "loss": 0.3207, "step": 577 }, { "epoch": 2.3761562178828366, "grad_norm": 0.34430102536898843, "learning_rate": 2.5143982945405527e-05, "loss": 0.3335, "step": 578 }, { "epoch": 2.3802672147995887, "grad_norm": 0.3480033297872511, "learning_rate": 2.5088409998126827e-05, "loss": 0.3364, "step": 579 }, { "epoch": 2.3843782117163412, "grad_norm": 0.2864218297613634, "learning_rate": 2.5032795012928093e-05, "loss": 0.3296, "step": 580 }, { "epoch": 2.3884892086330938, "grad_norm": 0.27656367755763744, "learning_rate": 2.4977138449272746e-05, "loss": 0.3252, "step": 581 }, { "epoch": 2.392600205549846, "grad_norm": 0.3210933436925842, "learning_rate": 2.4921440766967718e-05, "loss": 0.3292, "step": 582 }, { "epoch": 2.396711202466598, "grad_norm": 0.2695136624951651, "learning_rate": 2.4865702426159633e-05, "loss": 0.3345, "step": 583 }, { "epoch": 2.4008221993833505, "grad_norm": 0.2853367762196653, "learning_rate": 2.4809923887331028e-05, "loss": 0.3272, "step": 584 }, { "epoch": 2.4049331963001026, "grad_norm": 0.26413255067697416, "learning_rate": 2.4754105611296534e-05, "loss": 0.3244, "step": 585 }, { "epoch": 2.409044193216855, "grad_norm": 0.2788852049644498, "learning_rate": 2.4698248059199056e-05, "loss": 0.3211, "step": 586 }, { "epoch": 2.4131551901336072, "grad_norm": 0.2839043038975584, "learning_rate": 2.4642351692505998e-05, "loss": 0.3227, "step": 587 }, { "epoch": 2.41726618705036, "grad_norm": 0.26795507769344473, "learning_rate": 2.4586416973005414e-05, "loss": 0.3255, "step": 588 }, { "epoch": 2.421377183967112, "grad_norm": 0.28061689093119546, "learning_rate": 2.453044436280223e-05, "loss": 0.3297, "step": 589 }, { "epoch": 2.4254881808838644, "grad_norm": 0.2768666834694627, "learning_rate": 2.4474434324314388e-05, "loss": 0.3351, "step": 590 }, { "epoch": 2.4295991778006165, "grad_norm": 0.2992581902997908, "learning_rate": 2.4418387320269047e-05, "loss": 0.3185, "step": 591 }, { "epoch": 2.433710174717369, "grad_norm": 0.2582002464494716, "learning_rate": 2.4362303813698766e-05, "loss": 0.3262, "step": 592 }, { "epoch": 2.437821171634121, "grad_norm": 0.288633230170238, "learning_rate": 2.4306184267937654e-05, "loss": 0.3317, "step": 593 }, { "epoch": 2.4419321685508737, "grad_norm": 0.2642364711177551, "learning_rate": 2.425002914661758e-05, "loss": 0.3325, "step": 594 }, { "epoch": 2.446043165467626, "grad_norm": 0.28079190224987655, "learning_rate": 2.419383891366431e-05, "loss": 0.3186, "step": 595 }, { "epoch": 2.4501541623843783, "grad_norm": 0.2517139036005103, "learning_rate": 2.4137614033293676e-05, "loss": 0.3325, "step": 596 }, { "epoch": 2.4542651593011304, "grad_norm": 0.3048176393508488, "learning_rate": 2.408135497000776e-05, "loss": 0.3258, "step": 597 }, { "epoch": 2.458376156217883, "grad_norm": 0.2553179117187841, "learning_rate": 2.4025062188591046e-05, "loss": 0.3286, "step": 598 }, { "epoch": 2.462487153134635, "grad_norm": 0.297522330860201, "learning_rate": 2.3968736154106574e-05, "loss": 0.3257, "step": 599 }, { "epoch": 2.4665981500513876, "grad_norm": 0.26227969960383657, "learning_rate": 2.3912377331892112e-05, "loss": 0.3348, "step": 600 }, { "epoch": 2.4707091469681397, "grad_norm": 0.2678339048494993, "learning_rate": 2.3855986187556295e-05, "loss": 0.3247, "step": 601 }, { "epoch": 2.4748201438848922, "grad_norm": 0.2782462750099432, "learning_rate": 2.3799563186974802e-05, "loss": 0.3288, "step": 602 }, { "epoch": 2.4789311408016443, "grad_norm": 0.26942196233507953, "learning_rate": 2.374310879628647e-05, "loss": 0.3343, "step": 603 }, { "epoch": 2.483042137718397, "grad_norm": 0.25192857481609987, "learning_rate": 2.3686623481889496e-05, "loss": 0.3355, "step": 604 }, { "epoch": 2.487153134635149, "grad_norm": 0.27024432726841424, "learning_rate": 2.3630107710437526e-05, "loss": 0.3296, "step": 605 }, { "epoch": 2.4912641315519015, "grad_norm": 0.2491507090752715, "learning_rate": 2.3573561948835836e-05, "loss": 0.3421, "step": 606 }, { "epoch": 2.4953751284686536, "grad_norm": 0.3126612318343971, "learning_rate": 2.3516986664237474e-05, "loss": 0.3254, "step": 607 }, { "epoch": 2.499486125385406, "grad_norm": 0.2643267374371664, "learning_rate": 2.3460382324039377e-05, "loss": 0.3272, "step": 608 }, { "epoch": 2.5035971223021583, "grad_norm": 0.2786020179741824, "learning_rate": 2.3403749395878542e-05, "loss": 0.3292, "step": 609 }, { "epoch": 2.5077081192189103, "grad_norm": 0.2873861028514028, "learning_rate": 2.3347088347628128e-05, "loss": 0.3307, "step": 610 }, { "epoch": 2.511819116135663, "grad_norm": 0.2772071301023664, "learning_rate": 2.3290399647393628e-05, "loss": 0.324, "step": 611 }, { "epoch": 2.5159301130524154, "grad_norm": 0.27537902676408144, "learning_rate": 2.3233683763508957e-05, "loss": 0.3343, "step": 612 }, { "epoch": 2.5200411099691675, "grad_norm": 0.2859207355422494, "learning_rate": 2.317694116453263e-05, "loss": 0.34, "step": 613 }, { "epoch": 2.5241521068859196, "grad_norm": 0.2743835098944321, "learning_rate": 2.3120172319243864e-05, "loss": 0.3338, "step": 614 }, { "epoch": 2.528263103802672, "grad_norm": 0.27595779721707764, "learning_rate": 2.3063377696638707e-05, "loss": 0.3311, "step": 615 }, { "epoch": 2.5323741007194247, "grad_norm": 0.26843945933414415, "learning_rate": 2.300655776592616e-05, "loss": 0.3335, "step": 616 }, { "epoch": 2.536485097636177, "grad_norm": 0.25648610540979605, "learning_rate": 2.294971299652432e-05, "loss": 0.3235, "step": 617 }, { "epoch": 2.540596094552929, "grad_norm": 0.3013681074148862, "learning_rate": 2.2892843858056474e-05, "loss": 0.3321, "step": 618 }, { "epoch": 2.5447070914696814, "grad_norm": 0.24919313896376655, "learning_rate": 2.283595082034725e-05, "loss": 0.3167, "step": 619 }, { "epoch": 2.548818088386434, "grad_norm": 0.2688625414735968, "learning_rate": 2.2779034353418707e-05, "loss": 0.3324, "step": 620 }, { "epoch": 2.552929085303186, "grad_norm": 0.26263149016674175, "learning_rate": 2.2722094927486472e-05, "loss": 0.3286, "step": 621 }, { "epoch": 2.557040082219938, "grad_norm": 0.2823135658125824, "learning_rate": 2.2665133012955844e-05, "loss": 0.3383, "step": 622 }, { "epoch": 2.5611510791366907, "grad_norm": 0.276217133090313, "learning_rate": 2.2608149080417913e-05, "loss": 0.3289, "step": 623 }, { "epoch": 2.565262076053443, "grad_norm": 0.2850565964189695, "learning_rate": 2.2551143600645672e-05, "loss": 0.3244, "step": 624 }, { "epoch": 2.5693730729701953, "grad_norm": 0.26362750208519725, "learning_rate": 2.249411704459013e-05, "loss": 0.3361, "step": 625 }, { "epoch": 2.5734840698869474, "grad_norm": 0.2798643173147311, "learning_rate": 2.2437069883376404e-05, "loss": 0.3142, "step": 626 }, { "epoch": 2.5775950668037, "grad_norm": 0.2563288966482464, "learning_rate": 2.238000258829986e-05, "loss": 0.324, "step": 627 }, { "epoch": 2.581706063720452, "grad_norm": 0.30508227611107025, "learning_rate": 2.2322915630822184e-05, "loss": 0.3226, "step": 628 }, { "epoch": 2.5858170606372046, "grad_norm": 0.25091522041815256, "learning_rate": 2.226580948256751e-05, "loss": 0.3315, "step": 629 }, { "epoch": 2.5899280575539567, "grad_norm": 0.32219820646356984, "learning_rate": 2.2208684615318515e-05, "loss": 0.3291, "step": 630 }, { "epoch": 2.5940390544707093, "grad_norm": 0.2642592817496665, "learning_rate": 2.2151541501012526e-05, "loss": 0.3348, "step": 631 }, { "epoch": 2.5981500513874614, "grad_norm": 0.2696044140880529, "learning_rate": 2.2094380611737615e-05, "loss": 0.336, "step": 632 }, { "epoch": 2.602261048304214, "grad_norm": 0.2778886911936094, "learning_rate": 2.20372024197287e-05, "loss": 0.3221, "step": 633 }, { "epoch": 2.606372045220966, "grad_norm": 0.26840999460258913, "learning_rate": 2.1980007397363653e-05, "loss": 0.3283, "step": 634 }, { "epoch": 2.6104830421377185, "grad_norm": 0.30677147336816346, "learning_rate": 2.1922796017159382e-05, "loss": 0.3391, "step": 635 }, { "epoch": 2.6145940390544706, "grad_norm": 0.26204192651719005, "learning_rate": 2.186556875176794e-05, "loss": 0.3181, "step": 636 }, { "epoch": 2.618705035971223, "grad_norm": 0.29330464889106106, "learning_rate": 2.1808326073972618e-05, "loss": 0.3334, "step": 637 }, { "epoch": 2.6228160328879753, "grad_norm": 0.3611585390826276, "learning_rate": 2.1751068456684026e-05, "loss": 0.3328, "step": 638 }, { "epoch": 2.626927029804728, "grad_norm": 0.32081188768018193, "learning_rate": 2.1693796372936207e-05, "loss": 0.3348, "step": 639 }, { "epoch": 2.63103802672148, "grad_norm": 0.3466957904122417, "learning_rate": 2.1636510295882723e-05, "loss": 0.3287, "step": 640 }, { "epoch": 2.635149023638232, "grad_norm": 0.3325190809270464, "learning_rate": 2.1579210698792724e-05, "loss": 0.3357, "step": 641 }, { "epoch": 2.6392600205549845, "grad_norm": 0.323613824705376, "learning_rate": 2.1521898055047065e-05, "loss": 0.3254, "step": 642 }, { "epoch": 2.643371017471737, "grad_norm": 0.31700835111056935, "learning_rate": 2.1464572838134393e-05, "loss": 0.3405, "step": 643 }, { "epoch": 2.647482014388489, "grad_norm": 0.31194064100135144, "learning_rate": 2.1407235521647216e-05, "loss": 0.3337, "step": 644 }, { "epoch": 2.6515930113052413, "grad_norm": 0.291054868309333, "learning_rate": 2.134988657927802e-05, "loss": 0.3223, "step": 645 }, { "epoch": 2.655704008221994, "grad_norm": 0.28960930247219024, "learning_rate": 2.129252648481532e-05, "loss": 0.3399, "step": 646 }, { "epoch": 2.6598150051387464, "grad_norm": 0.262272292175284, "learning_rate": 2.123515571213977e-05, "loss": 0.3199, "step": 647 }, { "epoch": 2.6639260020554985, "grad_norm": 0.3430422990168527, "learning_rate": 2.1177774735220246e-05, "loss": 0.3211, "step": 648 }, { "epoch": 2.6680369989722506, "grad_norm": 0.24490577578554293, "learning_rate": 2.1120384028109928e-05, "loss": 0.3347, "step": 649 }, { "epoch": 2.672147995889003, "grad_norm": 0.3135561697948168, "learning_rate": 2.106298406494237e-05, "loss": 0.337, "step": 650 }, { "epoch": 2.6762589928057556, "grad_norm": 0.2536708220913538, "learning_rate": 2.1005575319927606e-05, "loss": 0.3286, "step": 651 }, { "epoch": 2.6803699897225077, "grad_norm": 0.2905534330712754, "learning_rate": 2.094815826734822e-05, "loss": 0.3344, "step": 652 }, { "epoch": 2.68448098663926, "grad_norm": 0.255577529722107, "learning_rate": 2.089073338155542e-05, "loss": 0.3347, "step": 653 }, { "epoch": 2.6885919835560124, "grad_norm": 0.3169225043435795, "learning_rate": 2.0833301136965138e-05, "loss": 0.3368, "step": 654 }, { "epoch": 2.692702980472765, "grad_norm": 0.24523301662966585, "learning_rate": 2.0775862008054102e-05, "loss": 0.3317, "step": 655 }, { "epoch": 2.696813977389517, "grad_norm": 0.3377639592657221, "learning_rate": 2.0718416469355917e-05, "loss": 0.3327, "step": 656 }, { "epoch": 2.700924974306269, "grad_norm": 0.2760670088077706, "learning_rate": 2.066096499545712e-05, "loss": 0.3254, "step": 657 }, { "epoch": 2.7050359712230216, "grad_norm": 0.33508069878850794, "learning_rate": 2.0603508060993306e-05, "loss": 0.3324, "step": 658 }, { "epoch": 2.7091469681397737, "grad_norm": 0.2888367467526053, "learning_rate": 2.0546046140645178e-05, "loss": 0.33, "step": 659 }, { "epoch": 2.7132579650565263, "grad_norm": 0.2851449912230599, "learning_rate": 2.0488579709134623e-05, "loss": 0.3375, "step": 660 }, { "epoch": 2.7173689619732784, "grad_norm": 0.28857625298935113, "learning_rate": 2.04311092412208e-05, "loss": 0.3324, "step": 661 }, { "epoch": 2.721479958890031, "grad_norm": 0.3099583754195003, "learning_rate": 2.0373635211696214e-05, "loss": 0.331, "step": 662 }, { "epoch": 2.725590955806783, "grad_norm": 0.3033491487781276, "learning_rate": 2.0316158095382797e-05, "loss": 0.3354, "step": 663 }, { "epoch": 2.7297019527235356, "grad_norm": 0.2619645136131476, "learning_rate": 2.0258678367127972e-05, "loss": 0.3236, "step": 664 }, { "epoch": 2.7338129496402876, "grad_norm": 0.30087165164939694, "learning_rate": 2.0201196501800768e-05, "loss": 0.3232, "step": 665 }, { "epoch": 2.73792394655704, "grad_norm": 0.2738871408955049, "learning_rate": 2.0143712974287838e-05, "loss": 0.3277, "step": 666 }, { "epoch": 2.7420349434737923, "grad_norm": 0.29720770878473823, "learning_rate": 2.0086228259489578e-05, "loss": 0.3419, "step": 667 }, { "epoch": 2.746145940390545, "grad_norm": 0.24745189549975016, "learning_rate": 2.0028742832316202e-05, "loss": 0.3241, "step": 668 }, { "epoch": 2.750256937307297, "grad_norm": 0.26535109416407787, "learning_rate": 1.99712571676838e-05, "loss": 0.3206, "step": 669 }, { "epoch": 2.7543679342240495, "grad_norm": 0.28264530686991374, "learning_rate": 1.9913771740510426e-05, "loss": 0.3441, "step": 670 }, { "epoch": 2.7584789311408016, "grad_norm": 0.2439511505193991, "learning_rate": 1.9856287025712172e-05, "loss": 0.3327, "step": 671 }, { "epoch": 2.762589928057554, "grad_norm": 0.25559437564690174, "learning_rate": 1.979880349819924e-05, "loss": 0.3325, "step": 672 }, { "epoch": 2.766700924974306, "grad_norm": 0.22699680175413017, "learning_rate": 1.974132163287203e-05, "loss": 0.329, "step": 673 }, { "epoch": 2.7708119218910587, "grad_norm": 0.2582116792070818, "learning_rate": 1.9683841904617217e-05, "loss": 0.3319, "step": 674 }, { "epoch": 2.774922918807811, "grad_norm": 0.24489677360477968, "learning_rate": 1.9626364788303796e-05, "loss": 0.3313, "step": 675 }, { "epoch": 2.779033915724563, "grad_norm": 0.29973564906267575, "learning_rate": 1.956889075877921e-05, "loss": 0.3359, "step": 676 }, { "epoch": 2.7831449126413155, "grad_norm": 0.24725710379682117, "learning_rate": 1.9511420290865387e-05, "loss": 0.3269, "step": 677 }, { "epoch": 2.787255909558068, "grad_norm": 0.254114192213977, "learning_rate": 1.945395385935483e-05, "loss": 0.3315, "step": 678 }, { "epoch": 2.79136690647482, "grad_norm": 0.2500901168277256, "learning_rate": 1.9396491939006693e-05, "loss": 0.317, "step": 679 }, { "epoch": 2.795477903391572, "grad_norm": 0.2506907960447071, "learning_rate": 1.9339035004542883e-05, "loss": 0.3355, "step": 680 }, { "epoch": 2.7995889003083247, "grad_norm": 0.254060820338398, "learning_rate": 1.9281583530644087e-05, "loss": 0.3274, "step": 681 }, { "epoch": 2.8036998972250773, "grad_norm": 0.22470917674479732, "learning_rate": 1.9224137991945898e-05, "loss": 0.3161, "step": 682 }, { "epoch": 2.8078108941418294, "grad_norm": 0.25617294354628883, "learning_rate": 1.9166698863034865e-05, "loss": 0.3326, "step": 683 }, { "epoch": 2.8119218910585815, "grad_norm": 0.24637096854415516, "learning_rate": 1.910926661844459e-05, "loss": 0.3306, "step": 684 }, { "epoch": 2.816032887975334, "grad_norm": 0.23065590461427085, "learning_rate": 1.905184173265179e-05, "loss": 0.3285, "step": 685 }, { "epoch": 2.8201438848920866, "grad_norm": 0.25223738900179504, "learning_rate": 1.89944246800724e-05, "loss": 0.3315, "step": 686 }, { "epoch": 2.8242548818088387, "grad_norm": 0.2813788401987118, "learning_rate": 1.8937015935057637e-05, "loss": 0.343, "step": 687 }, { "epoch": 2.8283658787255908, "grad_norm": 0.23658155464390826, "learning_rate": 1.887961597189008e-05, "loss": 0.3361, "step": 688 }, { "epoch": 2.8324768756423433, "grad_norm": 0.2560263043866784, "learning_rate": 1.8822225264779757e-05, "loss": 0.336, "step": 689 }, { "epoch": 2.836587872559096, "grad_norm": 0.24171456841261904, "learning_rate": 1.8764844287860235e-05, "loss": 0.3155, "step": 690 }, { "epoch": 2.840698869475848, "grad_norm": 0.2709130278349106, "learning_rate": 1.8707473515184686e-05, "loss": 0.3347, "step": 691 }, { "epoch": 2.8448098663926, "grad_norm": 0.2389464904458257, "learning_rate": 1.8650113420721985e-05, "loss": 0.3261, "step": 692 }, { "epoch": 2.8489208633093526, "grad_norm": 0.23853438478287736, "learning_rate": 1.8592764478352788e-05, "loss": 0.3269, "step": 693 }, { "epoch": 2.8530318602261047, "grad_norm": 0.24002347978417551, "learning_rate": 1.8535427161865617e-05, "loss": 0.3273, "step": 694 }, { "epoch": 2.857142857142857, "grad_norm": 0.24648736679259559, "learning_rate": 1.8478101944952946e-05, "loss": 0.3336, "step": 695 }, { "epoch": 2.8612538540596093, "grad_norm": 0.2539305109029615, "learning_rate": 1.842078930120729e-05, "loss": 0.3285, "step": 696 }, { "epoch": 2.865364850976362, "grad_norm": 0.25402455868598073, "learning_rate": 1.836348970411729e-05, "loss": 0.3415, "step": 697 }, { "epoch": 2.869475847893114, "grad_norm": 0.2557721072489133, "learning_rate": 1.8306203627063803e-05, "loss": 0.3324, "step": 698 }, { "epoch": 2.8735868448098665, "grad_norm": 0.24149579536024937, "learning_rate": 1.8248931543315974e-05, "loss": 0.3327, "step": 699 }, { "epoch": 2.8776978417266186, "grad_norm": 0.2618118707433648, "learning_rate": 1.8191673926027386e-05, "loss": 0.336, "step": 700 }, { "epoch": 2.881808838643371, "grad_norm": 0.22607821223026145, "learning_rate": 1.813443124823206e-05, "loss": 0.3366, "step": 701 }, { "epoch": 2.885919835560123, "grad_norm": 0.2939881318908373, "learning_rate": 1.807720398284062e-05, "loss": 0.3279, "step": 702 }, { "epoch": 2.8900308324768758, "grad_norm": 0.2260123918569214, "learning_rate": 1.801999260263635e-05, "loss": 0.3337, "step": 703 }, { "epoch": 2.894141829393628, "grad_norm": 0.24275524474075025, "learning_rate": 1.7962797580271303e-05, "loss": 0.3244, "step": 704 }, { "epoch": 2.8982528263103804, "grad_norm": 0.2665250980386111, "learning_rate": 1.790561938826239e-05, "loss": 0.3341, "step": 705 }, { "epoch": 2.9023638232271325, "grad_norm": 0.2439607446787407, "learning_rate": 1.784845849898748e-05, "loss": 0.3177, "step": 706 }, { "epoch": 2.906474820143885, "grad_norm": 0.22414006570173825, "learning_rate": 1.7791315384681488e-05, "loss": 0.3199, "step": 707 }, { "epoch": 2.910585817060637, "grad_norm": 0.29428160100150474, "learning_rate": 1.7734190517432498e-05, "loss": 0.3276, "step": 708 }, { "epoch": 2.9146968139773897, "grad_norm": 0.23652581393109087, "learning_rate": 1.7677084369177823e-05, "loss": 0.3252, "step": 709 }, { "epoch": 2.9188078108941418, "grad_norm": 0.2638103273242896, "learning_rate": 1.7619997411700146e-05, "loss": 0.3269, "step": 710 }, { "epoch": 2.9229188078108943, "grad_norm": 0.23905327112200858, "learning_rate": 1.7562930116623602e-05, "loss": 0.3221, "step": 711 }, { "epoch": 2.9270298047276464, "grad_norm": 0.24681045434883284, "learning_rate": 1.750588295540988e-05, "loss": 0.3265, "step": 712 }, { "epoch": 2.931140801644399, "grad_norm": 0.22345235272171315, "learning_rate": 1.7448856399354335e-05, "loss": 0.331, "step": 713 }, { "epoch": 2.935251798561151, "grad_norm": 0.2620096260034816, "learning_rate": 1.7391850919582097e-05, "loss": 0.3133, "step": 714 }, { "epoch": 2.939362795477903, "grad_norm": 0.24255411379013975, "learning_rate": 1.733486698704417e-05, "loss": 0.3345, "step": 715 }, { "epoch": 2.9434737923946557, "grad_norm": 0.2677764953102307, "learning_rate": 1.7277905072513538e-05, "loss": 0.3125, "step": 716 }, { "epoch": 2.947584789311408, "grad_norm": 0.23338472896647094, "learning_rate": 1.7220965646581304e-05, "loss": 0.3329, "step": 717 }, { "epoch": 2.9516957862281603, "grad_norm": 0.256626369930556, "learning_rate": 1.7164049179652762e-05, "loss": 0.342, "step": 718 }, { "epoch": 2.9558067831449124, "grad_norm": 0.24204486389961907, "learning_rate": 1.7107156141943536e-05, "loss": 0.3317, "step": 719 }, { "epoch": 2.959917780061665, "grad_norm": 0.23154113899503126, "learning_rate": 1.7050287003475684e-05, "loss": 0.338, "step": 720 }, { "epoch": 2.9640287769784175, "grad_norm": 0.2515108652826035, "learning_rate": 1.699344223407384e-05, "loss": 0.3221, "step": 721 }, { "epoch": 2.9681397738951696, "grad_norm": 0.2508115805000718, "learning_rate": 1.6936622303361292e-05, "loss": 0.3272, "step": 722 }, { "epoch": 2.9722507708119217, "grad_norm": 0.24166143586067093, "learning_rate": 1.6879827680756132e-05, "loss": 0.3283, "step": 723 }, { "epoch": 2.9763617677286742, "grad_norm": 0.25924430328001846, "learning_rate": 1.682305883546737e-05, "loss": 0.3297, "step": 724 }, { "epoch": 2.9804727646454268, "grad_norm": 0.26555305739163787, "learning_rate": 1.6766316236491046e-05, "loss": 0.3314, "step": 725 }, { "epoch": 2.984583761562179, "grad_norm": 0.24257798147799245, "learning_rate": 1.6709600352606382e-05, "loss": 0.3238, "step": 726 }, { "epoch": 2.988694758478931, "grad_norm": 0.2772710049751061, "learning_rate": 1.665291165237188e-05, "loss": 0.3274, "step": 727 }, { "epoch": 2.9928057553956835, "grad_norm": 0.22542030051008188, "learning_rate": 1.6596250604121468e-05, "loss": 0.3328, "step": 728 }, { "epoch": 2.996916752312436, "grad_norm": 0.27500500626350044, "learning_rate": 1.653961767596063e-05, "loss": 0.3335, "step": 729 }, { "epoch": 3.001027749229188, "grad_norm": 0.270309023452525, "learning_rate": 1.6483013335762536e-05, "loss": 0.2946, "step": 730 }, { "epoch": 3.0051387461459402, "grad_norm": 0.31524306413172215, "learning_rate": 1.6426438051164168e-05, "loss": 0.2781, "step": 731 }, { "epoch": 3.0092497430626928, "grad_norm": 0.5781264673297727, "learning_rate": 1.636989228956248e-05, "loss": 0.2843, "step": 732 }, { "epoch": 3.013360739979445, "grad_norm": 0.3024860387001426, "learning_rate": 1.631337651811051e-05, "loss": 0.2747, "step": 733 }, { "epoch": 3.0174717368961974, "grad_norm": 0.3934556205957313, "learning_rate": 1.6256891203713533e-05, "loss": 0.2728, "step": 734 }, { "epoch": 3.0215827338129495, "grad_norm": 0.3129040186390879, "learning_rate": 1.6200436813025208e-05, "loss": 0.2736, "step": 735 }, { "epoch": 3.025693730729702, "grad_norm": 0.35042448311035457, "learning_rate": 1.6144013812443712e-05, "loss": 0.288, "step": 736 }, { "epoch": 3.029804727646454, "grad_norm": 0.32431756468327383, "learning_rate": 1.60876226681079e-05, "loss": 0.2675, "step": 737 }, { "epoch": 3.0339157245632067, "grad_norm": 0.3106864020345642, "learning_rate": 1.6031263845893436e-05, "loss": 0.2696, "step": 738 }, { "epoch": 3.038026721479959, "grad_norm": 0.2918561704720298, "learning_rate": 1.5974937811408964e-05, "loss": 0.2806, "step": 739 }, { "epoch": 3.0421377183967113, "grad_norm": 0.2824200303583143, "learning_rate": 1.5918645029992237e-05, "loss": 0.2669, "step": 740 }, { "epoch": 3.0462487153134634, "grad_norm": 0.2751071529215866, "learning_rate": 1.5862385966706324e-05, "loss": 0.2827, "step": 741 }, { "epoch": 3.050359712230216, "grad_norm": 0.2659741638733503, "learning_rate": 1.580616108633569e-05, "loss": 0.2772, "step": 742 }, { "epoch": 3.054470709146968, "grad_norm": 0.2705999112071291, "learning_rate": 1.5749970853382416e-05, "loss": 0.2813, "step": 743 }, { "epoch": 3.0585817060637206, "grad_norm": 0.2678804686149958, "learning_rate": 1.5693815732062346e-05, "loss": 0.2786, "step": 744 }, { "epoch": 3.0626927029804727, "grad_norm": 0.28891827034365974, "learning_rate": 1.563769618630124e-05, "loss": 0.2781, "step": 745 }, { "epoch": 3.0668036998972252, "grad_norm": 0.2608278243848426, "learning_rate": 1.558161267973096e-05, "loss": 0.2811, "step": 746 }, { "epoch": 3.0709146968139773, "grad_norm": 0.28779231459872495, "learning_rate": 1.552556567568562e-05, "loss": 0.2731, "step": 747 }, { "epoch": 3.07502569373073, "grad_norm": 0.27172400888062603, "learning_rate": 1.5469555637197775e-05, "loss": 0.273, "step": 748 }, { "epoch": 3.079136690647482, "grad_norm": 0.284714544394066, "learning_rate": 1.541358302699459e-05, "loss": 0.2737, "step": 749 }, { "epoch": 3.0832476875642345, "grad_norm": 0.27108204101689876, "learning_rate": 1.535764830749401e-05, "loss": 0.2719, "step": 750 }, { "epoch": 3.0873586844809866, "grad_norm": 0.26218248343837663, "learning_rate": 1.5301751940800947e-05, "loss": 0.2702, "step": 751 }, { "epoch": 3.091469681397739, "grad_norm": 0.27396785993274086, "learning_rate": 1.5245894388703473e-05, "loss": 0.2746, "step": 752 }, { "epoch": 3.0955806783144912, "grad_norm": 0.26457645017367387, "learning_rate": 1.5190076112668975e-05, "loss": 0.2741, "step": 753 }, { "epoch": 3.099691675231244, "grad_norm": 0.2669417946440861, "learning_rate": 1.5134297573840373e-05, "loss": 0.2609, "step": 754 }, { "epoch": 3.103802672147996, "grad_norm": 0.24350309961263825, "learning_rate": 1.507855923303229e-05, "loss": 0.2683, "step": 755 }, { "epoch": 3.1079136690647484, "grad_norm": 0.24218499055629, "learning_rate": 1.5022861550727261e-05, "loss": 0.2753, "step": 756 }, { "epoch": 3.1120246659815005, "grad_norm": 0.23884760385788692, "learning_rate": 1.4967204987071916e-05, "loss": 0.2674, "step": 757 }, { "epoch": 3.1161356628982526, "grad_norm": 0.24427494625864407, "learning_rate": 1.491159000187318e-05, "loss": 0.2766, "step": 758 }, { "epoch": 3.120246659815005, "grad_norm": 0.22462887698775066, "learning_rate": 1.4856017054594487e-05, "loss": 0.2817, "step": 759 }, { "epoch": 3.1243576567317572, "grad_norm": 0.22935229598945833, "learning_rate": 1.4800486604351953e-05, "loss": 0.2692, "step": 760 }, { "epoch": 3.12846865364851, "grad_norm": 0.2283641464528615, "learning_rate": 1.4744999109910642e-05, "loss": 0.2881, "step": 761 }, { "epoch": 3.132579650565262, "grad_norm": 0.2402319884401938, "learning_rate": 1.4689555029680706e-05, "loss": 0.2811, "step": 762 }, { "epoch": 3.1366906474820144, "grad_norm": 0.24689907618158027, "learning_rate": 1.4634154821713642e-05, "loss": 0.2748, "step": 763 }, { "epoch": 3.1408016443987665, "grad_norm": 0.21908840749036268, "learning_rate": 1.4578798943698495e-05, "loss": 0.2775, "step": 764 }, { "epoch": 3.144912641315519, "grad_norm": 0.2706839692520901, "learning_rate": 1.4523487852958078e-05, "loss": 0.274, "step": 765 }, { "epoch": 3.149023638232271, "grad_norm": 0.21829989516446477, "learning_rate": 1.4468222006445194e-05, "loss": 0.2846, "step": 766 }, { "epoch": 3.1531346351490237, "grad_norm": 0.254462615428386, "learning_rate": 1.4413001860738857e-05, "loss": 0.2751, "step": 767 }, { "epoch": 3.157245632065776, "grad_norm": 0.22996784925457855, "learning_rate": 1.4357827872040533e-05, "loss": 0.2763, "step": 768 }, { "epoch": 3.1613566289825283, "grad_norm": 0.24916315195392996, "learning_rate": 1.4302700496170348e-05, "loss": 0.273, "step": 769 }, { "epoch": 3.1654676258992804, "grad_norm": 0.2394151755505642, "learning_rate": 1.424762018856335e-05, "loss": 0.2733, "step": 770 }, { "epoch": 3.169578622816033, "grad_norm": 0.24554198081740938, "learning_rate": 1.4192587404265723e-05, "loss": 0.2739, "step": 771 }, { "epoch": 3.173689619732785, "grad_norm": 0.23659011712793626, "learning_rate": 1.4137602597931039e-05, "loss": 0.2819, "step": 772 }, { "epoch": 3.1778006166495376, "grad_norm": 0.22564337444058383, "learning_rate": 1.4082666223816503e-05, "loss": 0.2792, "step": 773 }, { "epoch": 3.1819116135662897, "grad_norm": 0.2317146590014487, "learning_rate": 1.4027778735779194e-05, "loss": 0.2777, "step": 774 }, { "epoch": 3.1860226104830422, "grad_norm": 0.2150695765539657, "learning_rate": 1.397294058727232e-05, "loss": 0.2765, "step": 775 }, { "epoch": 3.1901336073997943, "grad_norm": 0.23401579346325868, "learning_rate": 1.3918152231341466e-05, "loss": 0.2859, "step": 776 }, { "epoch": 3.194244604316547, "grad_norm": 0.22190869981693315, "learning_rate": 1.3863414120620866e-05, "loss": 0.2739, "step": 777 }, { "epoch": 3.198355601233299, "grad_norm": 0.24505629782483931, "learning_rate": 1.3808726707329636e-05, "loss": 0.2854, "step": 778 }, { "epoch": 3.2024665981500515, "grad_norm": 0.23527894624102066, "learning_rate": 1.3754090443268073e-05, "loss": 0.2739, "step": 779 }, { "epoch": 3.2065775950668036, "grad_norm": 0.250451185169838, "learning_rate": 1.3699505779813885e-05, "loss": 0.2779, "step": 780 }, { "epoch": 3.210688591983556, "grad_norm": 0.24199976474098944, "learning_rate": 1.3644973167918509e-05, "loss": 0.2819, "step": 781 }, { "epoch": 3.2147995889003083, "grad_norm": 0.2295279753606739, "learning_rate": 1.3590493058103334e-05, "loss": 0.2912, "step": 782 }, { "epoch": 3.218910585817061, "grad_norm": 0.24479637435880175, "learning_rate": 1.353606590045601e-05, "loss": 0.2625, "step": 783 }, { "epoch": 3.223021582733813, "grad_norm": 0.22366855597040158, "learning_rate": 1.3481692144626723e-05, "loss": 0.2716, "step": 784 }, { "epoch": 3.2271325796505654, "grad_norm": 0.23386804780243653, "learning_rate": 1.3427372239824478e-05, "loss": 0.2833, "step": 785 }, { "epoch": 3.2312435765673175, "grad_norm": 0.21138476617701588, "learning_rate": 1.3373106634813395e-05, "loss": 0.2815, "step": 786 }, { "epoch": 3.23535457348407, "grad_norm": 0.21868921231797736, "learning_rate": 1.3318895777908989e-05, "loss": 0.2737, "step": 787 }, { "epoch": 3.239465570400822, "grad_norm": 0.22091301457511603, "learning_rate": 1.3264740116974477e-05, "loss": 0.2784, "step": 788 }, { "epoch": 3.2435765673175747, "grad_norm": 0.21177976498652176, "learning_rate": 1.3210640099417071e-05, "loss": 0.2677, "step": 789 }, { "epoch": 3.247687564234327, "grad_norm": 0.22075747796505304, "learning_rate": 1.3156596172184291e-05, "loss": 0.2843, "step": 790 }, { "epoch": 3.2517985611510793, "grad_norm": 0.2093957071643158, "learning_rate": 1.3102608781760262e-05, "loss": 0.2783, "step": 791 }, { "epoch": 3.2559095580678314, "grad_norm": 0.2502487297475507, "learning_rate": 1.3048678374162033e-05, "loss": 0.2764, "step": 792 }, { "epoch": 3.2600205549845835, "grad_norm": 0.21828424241121014, "learning_rate": 1.2994805394935883e-05, "loss": 0.2783, "step": 793 }, { "epoch": 3.264131551901336, "grad_norm": 0.2225059796962467, "learning_rate": 1.2940990289153654e-05, "loss": 0.2818, "step": 794 }, { "epoch": 3.2682425488180886, "grad_norm": 0.2384933128418085, "learning_rate": 1.2887233501409062e-05, "loss": 0.2715, "step": 795 }, { "epoch": 3.2723535457348407, "grad_norm": 0.23881527823885554, "learning_rate": 1.283353547581403e-05, "loss": 0.2815, "step": 796 }, { "epoch": 3.276464542651593, "grad_norm": 0.21990598524463273, "learning_rate": 1.2779896655995012e-05, "loss": 0.2649, "step": 797 }, { "epoch": 3.2805755395683454, "grad_norm": 0.22014058980246703, "learning_rate": 1.2726317485089345e-05, "loss": 0.2857, "step": 798 }, { "epoch": 3.2846865364850975, "grad_norm": 0.2128741761301097, "learning_rate": 1.2672798405741565e-05, "loss": 0.2744, "step": 799 }, { "epoch": 3.28879753340185, "grad_norm": 0.2093902141995586, "learning_rate": 1.261933986009976e-05, "loss": 0.2714, "step": 800 }, { "epoch": 3.292908530318602, "grad_norm": 0.2317393277270657, "learning_rate": 1.2565942289811926e-05, "loss": 0.2821, "step": 801 }, { "epoch": 3.2970195272353546, "grad_norm": 0.2124614991624517, "learning_rate": 1.2512606136022316e-05, "loss": 0.2684, "step": 802 }, { "epoch": 3.3011305241521067, "grad_norm": 0.2237964393839327, "learning_rate": 1.245933183936778e-05, "loss": 0.28, "step": 803 }, { "epoch": 3.3052415210688593, "grad_norm": 0.20179137845865386, "learning_rate": 1.2406119839974137e-05, "loss": 0.2791, "step": 804 }, { "epoch": 3.3093525179856114, "grad_norm": 0.21344599872935055, "learning_rate": 1.2352970577452536e-05, "loss": 0.282, "step": 805 }, { "epoch": 3.313463514902364, "grad_norm": 0.21405309956045562, "learning_rate": 1.2299884490895829e-05, "loss": 0.2705, "step": 806 }, { "epoch": 3.317574511819116, "grad_norm": 0.20836540998453448, "learning_rate": 1.2246862018874937e-05, "loss": 0.2675, "step": 807 }, { "epoch": 3.3216855087358685, "grad_norm": 0.21917814502090704, "learning_rate": 1.2193903599435229e-05, "loss": 0.2867, "step": 808 }, { "epoch": 3.3257965056526206, "grad_norm": 0.21478503443145303, "learning_rate": 1.2141009670092905e-05, "loss": 0.263, "step": 809 }, { "epoch": 3.329907502569373, "grad_norm": 0.24017325608140172, "learning_rate": 1.2088180667831378e-05, "loss": 0.285, "step": 810 }, { "epoch": 3.3340184994861253, "grad_norm": 0.21263315635103802, "learning_rate": 1.2035417029097669e-05, "loss": 0.2794, "step": 811 }, { "epoch": 3.338129496402878, "grad_norm": 0.2208436673519513, "learning_rate": 1.198271918979879e-05, "loss": 0.2661, "step": 812 }, { "epoch": 3.34224049331963, "grad_norm": 0.21410801362761014, "learning_rate": 1.1930087585298163e-05, "loss": 0.2691, "step": 813 }, { "epoch": 3.3463514902363825, "grad_norm": 0.2189540505149734, "learning_rate": 1.1877522650412002e-05, "loss": 0.2777, "step": 814 }, { "epoch": 3.3504624871531345, "grad_norm": 0.2235412920660751, "learning_rate": 1.1825024819405728e-05, "loss": 0.2829, "step": 815 }, { "epoch": 3.354573484069887, "grad_norm": 0.22891833469755685, "learning_rate": 1.177259452599039e-05, "loss": 0.2883, "step": 816 }, { "epoch": 3.358684480986639, "grad_norm": 0.20951091444066108, "learning_rate": 1.1720232203319072e-05, "loss": 0.2703, "step": 817 }, { "epoch": 3.3627954779033917, "grad_norm": 0.2291000642315933, "learning_rate": 1.1667938283983318e-05, "loss": 0.2818, "step": 818 }, { "epoch": 3.366906474820144, "grad_norm": 0.24820675241373585, "learning_rate": 1.1615713200009555e-05, "loss": 0.2894, "step": 819 }, { "epoch": 3.3710174717368964, "grad_norm": 0.2112186174561992, "learning_rate": 1.1563557382855527e-05, "loss": 0.2765, "step": 820 }, { "epoch": 3.3751284686536485, "grad_norm": 0.23516315367694957, "learning_rate": 1.1511471263406727e-05, "loss": 0.2783, "step": 821 }, { "epoch": 3.379239465570401, "grad_norm": 0.20429288664608256, "learning_rate": 1.1459455271972855e-05, "loss": 0.2826, "step": 822 }, { "epoch": 3.383350462487153, "grad_norm": 0.22120456160119745, "learning_rate": 1.1407509838284234e-05, "loss": 0.2702, "step": 823 }, { "epoch": 3.3874614594039056, "grad_norm": 0.22196158784290934, "learning_rate": 1.1355635391488273e-05, "loss": 0.2816, "step": 824 }, { "epoch": 3.3915724563206577, "grad_norm": 0.23198563181248005, "learning_rate": 1.130383236014593e-05, "loss": 0.2807, "step": 825 }, { "epoch": 3.3956834532374103, "grad_norm": 0.21328959797566183, "learning_rate": 1.1252101172228161e-05, "loss": 0.2812, "step": 826 }, { "epoch": 3.3997944501541624, "grad_norm": 0.20829955768200162, "learning_rate": 1.1200442255112382e-05, "loss": 0.2781, "step": 827 }, { "epoch": 3.4039054470709145, "grad_norm": 0.210021918847506, "learning_rate": 1.1148856035578954e-05, "loss": 0.2793, "step": 828 }, { "epoch": 3.408016443987667, "grad_norm": 0.21953053255099053, "learning_rate": 1.1097342939807639e-05, "loss": 0.2826, "step": 829 }, { "epoch": 3.4121274409044196, "grad_norm": 0.21197251533168365, "learning_rate": 1.1045903393374088e-05, "loss": 0.2678, "step": 830 }, { "epoch": 3.4162384378211716, "grad_norm": 0.22402861818250405, "learning_rate": 1.0994537821246322e-05, "loss": 0.2768, "step": 831 }, { "epoch": 3.4203494347379237, "grad_norm": 0.20866347607213415, "learning_rate": 1.0943246647781231e-05, "loss": 0.2822, "step": 832 }, { "epoch": 3.4244604316546763, "grad_norm": 0.20588546575745492, "learning_rate": 1.0892030296721053e-05, "loss": 0.274, "step": 833 }, { "epoch": 3.4285714285714284, "grad_norm": 0.21933896518445742, "learning_rate": 1.0840889191189881e-05, "loss": 0.2815, "step": 834 }, { "epoch": 3.432682425488181, "grad_norm": 0.21986260521948456, "learning_rate": 1.0789823753690165e-05, "loss": 0.265, "step": 835 }, { "epoch": 3.436793422404933, "grad_norm": 0.2472526595417136, "learning_rate": 1.073883440609923e-05, "loss": 0.2819, "step": 836 }, { "epoch": 3.4409044193216856, "grad_norm": 0.23044008878105163, "learning_rate": 1.0687921569665778e-05, "loss": 0.2743, "step": 837 }, { "epoch": 3.4450154162384377, "grad_norm": 0.2127401189830073, "learning_rate": 1.0637085665006416e-05, "loss": 0.2757, "step": 838 }, { "epoch": 3.44912641315519, "grad_norm": 0.23011524871297998, "learning_rate": 1.058632711210218e-05, "loss": 0.2867, "step": 839 }, { "epoch": 3.4532374100719423, "grad_norm": 0.2143448487687264, "learning_rate": 1.0535646330295064e-05, "loss": 0.2775, "step": 840 }, { "epoch": 3.457348406988695, "grad_norm": 0.2157327739595805, "learning_rate": 1.0485043738284543e-05, "loss": 0.2772, "step": 841 }, { "epoch": 3.461459403905447, "grad_norm": 0.21901388123050422, "learning_rate": 1.0434519754124155e-05, "loss": 0.2883, "step": 842 }, { "epoch": 3.4655704008221995, "grad_norm": 0.20706260357694797, "learning_rate": 1.0384074795217995e-05, "loss": 0.2729, "step": 843 }, { "epoch": 3.4696813977389516, "grad_norm": 0.20562057118619545, "learning_rate": 1.0333709278317295e-05, "loss": 0.2794, "step": 844 }, { "epoch": 3.473792394655704, "grad_norm": 0.2053621886084836, "learning_rate": 1.0283423619516984e-05, "loss": 0.2831, "step": 845 }, { "epoch": 3.477903391572456, "grad_norm": 0.21559967106224392, "learning_rate": 1.0233218234252233e-05, "loss": 0.2798, "step": 846 }, { "epoch": 3.4820143884892087, "grad_norm": 0.20693716417643127, "learning_rate": 1.0183093537295038e-05, "loss": 0.2834, "step": 847 }, { "epoch": 3.486125385405961, "grad_norm": 0.196045141198551, "learning_rate": 1.0133049942750794e-05, "loss": 0.2815, "step": 848 }, { "epoch": 3.4902363823227134, "grad_norm": 0.22184037078133786, "learning_rate": 1.0083087864054862e-05, "loss": 0.2782, "step": 849 }, { "epoch": 3.4943473792394655, "grad_norm": 0.21680925373572774, "learning_rate": 1.0033207713969152e-05, "loss": 0.2668, "step": 850 }, { "epoch": 3.498458376156218, "grad_norm": 0.20929159215700033, "learning_rate": 9.983409904578732e-06, "loss": 0.2771, "step": 851 }, { "epoch": 3.50256937307297, "grad_norm": 0.20085747960075442, "learning_rate": 9.93369484728841e-06, "loss": 0.2769, "step": 852 }, { "epoch": 3.5066803699897227, "grad_norm": 0.21740453796251422, "learning_rate": 9.884062952819336e-06, "loss": 0.2809, "step": 853 }, { "epoch": 3.5107913669064748, "grad_norm": 0.19826634602771384, "learning_rate": 9.834514631205607e-06, "loss": 0.2826, "step": 854 }, { "epoch": 3.5149023638232273, "grad_norm": 0.19929040918628962, "learning_rate": 9.785050291790886e-06, "loss": 0.27, "step": 855 }, { "epoch": 3.5190133607399794, "grad_norm": 0.20544687803262818, "learning_rate": 9.735670343225015e-06, "loss": 0.2759, "step": 856 }, { "epoch": 3.523124357656732, "grad_norm": 0.20100075338402584, "learning_rate": 9.68637519346064e-06, "loss": 0.2842, "step": 857 }, { "epoch": 3.527235354573484, "grad_norm": 0.19998157251828666, "learning_rate": 9.637165249749847e-06, "loss": 0.2677, "step": 858 }, { "epoch": 3.531346351490236, "grad_norm": 0.20946212814759255, "learning_rate": 9.588040918640784e-06, "loss": 0.2819, "step": 859 }, { "epoch": 3.5354573484069887, "grad_norm": 0.19305869769870324, "learning_rate": 9.539002605974315e-06, "loss": 0.2762, "step": 860 }, { "epoch": 3.539568345323741, "grad_norm": 0.22246584009743214, "learning_rate": 9.490050716880652e-06, "loss": 0.2761, "step": 861 }, { "epoch": 3.5436793422404933, "grad_norm": 0.2106791686837925, "learning_rate": 9.441185655776044e-06, "loss": 0.2836, "step": 862 }, { "epoch": 3.5477903391572454, "grad_norm": 0.20735417375234855, "learning_rate": 9.392407826359386e-06, "loss": 0.2797, "step": 863 }, { "epoch": 3.551901336073998, "grad_norm": 0.22319834142117814, "learning_rate": 9.343717631608913e-06, "loss": 0.2805, "step": 864 }, { "epoch": 3.5560123329907505, "grad_norm": 0.21387661139677305, "learning_rate": 9.295115473778871e-06, "loss": 0.2737, "step": 865 }, { "epoch": 3.5601233299075026, "grad_norm": 0.19614268534753534, "learning_rate": 9.246601754396184e-06, "loss": 0.2775, "step": 866 }, { "epoch": 3.5642343268242547, "grad_norm": 0.21426258046660832, "learning_rate": 9.198176874257147e-06, "loss": 0.2801, "step": 867 }, { "epoch": 3.568345323741007, "grad_norm": 0.20833350511079968, "learning_rate": 9.149841233424102e-06, "loss": 0.2903, "step": 868 }, { "epoch": 3.5724563206577598, "grad_norm": 0.2054636105867438, "learning_rate": 9.101595231222142e-06, "loss": 0.2714, "step": 869 }, { "epoch": 3.576567317574512, "grad_norm": 0.21677433378750463, "learning_rate": 9.053439266235817e-06, "loss": 0.2747, "step": 870 }, { "epoch": 3.580678314491264, "grad_norm": 0.21258620908795176, "learning_rate": 9.005373736305827e-06, "loss": 0.2866, "step": 871 }, { "epoch": 3.5847893114080165, "grad_norm": 0.21696485235415786, "learning_rate": 8.957399038525742e-06, "loss": 0.2768, "step": 872 }, { "epoch": 3.588900308324769, "grad_norm": 0.2048645071808934, "learning_rate": 8.909515569238727e-06, "loss": 0.2805, "step": 873 }, { "epoch": 3.593011305241521, "grad_norm": 0.2041872125610518, "learning_rate": 8.861723724034256e-06, "loss": 0.281, "step": 874 }, { "epoch": 3.597122302158273, "grad_norm": 0.22105486900940344, "learning_rate": 8.814023897744861e-06, "loss": 0.2722, "step": 875 }, { "epoch": 3.6012332990750258, "grad_norm": 0.20870597023983126, "learning_rate": 8.766416484442845e-06, "loss": 0.288, "step": 876 }, { "epoch": 3.605344295991778, "grad_norm": 0.22305369395665908, "learning_rate": 8.71890187743705e-06, "loss": 0.2833, "step": 877 }, { "epoch": 3.6094552929085304, "grad_norm": 0.20984704638631244, "learning_rate": 8.6714804692696e-06, "loss": 0.2815, "step": 878 }, { "epoch": 3.6135662898252825, "grad_norm": 0.205661449222605, "learning_rate": 8.624152651712647e-06, "loss": 0.2796, "step": 879 }, { "epoch": 3.617677286742035, "grad_norm": 0.21670437077691945, "learning_rate": 8.576918815765155e-06, "loss": 0.276, "step": 880 }, { "epoch": 3.621788283658787, "grad_norm": 0.21657223103082457, "learning_rate": 8.52977935164965e-06, "loss": 0.2793, "step": 881 }, { "epoch": 3.6258992805755397, "grad_norm": 0.19481374974543536, "learning_rate": 8.482734648808998e-06, "loss": 0.2828, "step": 882 }, { "epoch": 3.6300102774922918, "grad_norm": 0.22945439732292053, "learning_rate": 8.435785095903226e-06, "loss": 0.2767, "step": 883 }, { "epoch": 3.6341212744090443, "grad_norm": 0.21026587275904124, "learning_rate": 8.388931080806244e-06, "loss": 0.277, "step": 884 }, { "epoch": 3.6382322713257964, "grad_norm": 0.1905115883548281, "learning_rate": 8.342172990602692e-06, "loss": 0.2743, "step": 885 }, { "epoch": 3.642343268242549, "grad_norm": 0.21168502442048126, "learning_rate": 8.295511211584726e-06, "loss": 0.2684, "step": 886 }, { "epoch": 3.646454265159301, "grad_norm": 0.21498006850055293, "learning_rate": 8.248946129248821e-06, "loss": 0.2762, "step": 887 }, { "epoch": 3.6505652620760536, "grad_norm": 0.19489762757982362, "learning_rate": 8.202478128292594e-06, "loss": 0.279, "step": 888 }, { "epoch": 3.6546762589928057, "grad_norm": 0.21734478601458554, "learning_rate": 8.15610759261163e-06, "loss": 0.2743, "step": 889 }, { "epoch": 3.6587872559095582, "grad_norm": 0.2067502254964237, "learning_rate": 8.109834905296296e-06, "loss": 0.2687, "step": 890 }, { "epoch": 3.6628982528263103, "grad_norm": 0.20837277018256964, "learning_rate": 8.06366044862859e-06, "loss": 0.2776, "step": 891 }, { "epoch": 3.667009249743063, "grad_norm": 0.2024301743577271, "learning_rate": 8.017584604078974e-06, "loss": 0.2801, "step": 892 }, { "epoch": 3.671120246659815, "grad_norm": 0.21530744818182257, "learning_rate": 7.971607752303226e-06, "loss": 0.28, "step": 893 }, { "epoch": 3.675231243576567, "grad_norm": 0.2139811361890938, "learning_rate": 7.925730273139294e-06, "loss": 0.2712, "step": 894 }, { "epoch": 3.6793422404933196, "grad_norm": 0.20799988041239068, "learning_rate": 7.879952545604163e-06, "loss": 0.2926, "step": 895 }, { "epoch": 3.683453237410072, "grad_norm": 0.20418864938595824, "learning_rate": 7.834274947890715e-06, "loss": 0.2798, "step": 896 }, { "epoch": 3.6875642343268242, "grad_norm": 0.20416263025450562, "learning_rate": 7.78869785736461e-06, "loss": 0.2694, "step": 897 }, { "epoch": 3.6916752312435763, "grad_norm": 0.19066134679044647, "learning_rate": 7.74322165056117e-06, "loss": 0.2667, "step": 898 }, { "epoch": 3.695786228160329, "grad_norm": 0.2121667278903765, "learning_rate": 7.697846703182262e-06, "loss": 0.2784, "step": 899 }, { "epoch": 3.6998972250770814, "grad_norm": 0.2071705071263635, "learning_rate": 7.652573390093199e-06, "loss": 0.285, "step": 900 }, { "epoch": 3.7040082219938335, "grad_norm": 0.2035973761053005, "learning_rate": 7.607402085319644e-06, "loss": 0.2759, "step": 901 }, { "epoch": 3.7081192189105856, "grad_norm": 0.2083326930999411, "learning_rate": 7.562333162044508e-06, "loss": 0.2775, "step": 902 }, { "epoch": 3.712230215827338, "grad_norm": 0.21563075052521988, "learning_rate": 7.517366992604902e-06, "loss": 0.2767, "step": 903 }, { "epoch": 3.7163412127440907, "grad_norm": 0.20432779262539, "learning_rate": 7.4725039484890094e-06, "loss": 0.2874, "step": 904 }, { "epoch": 3.720452209660843, "grad_norm": 0.2047844251053815, "learning_rate": 7.427744400333053e-06, "loss": 0.2789, "step": 905 }, { "epoch": 3.724563206577595, "grad_norm": 0.2055231569256932, "learning_rate": 7.383088717918223e-06, "loss": 0.2748, "step": 906 }, { "epoch": 3.7286742034943474, "grad_norm": 0.20467879963763858, "learning_rate": 7.338537270167625e-06, "loss": 0.277, "step": 907 }, { "epoch": 3.7327852004111, "grad_norm": 0.21544746620927177, "learning_rate": 7.294090425143225e-06, "loss": 0.273, "step": 908 }, { "epoch": 3.736896197327852, "grad_norm": 0.2000666684512926, "learning_rate": 7.249748550042817e-06, "loss": 0.2806, "step": 909 }, { "epoch": 3.741007194244604, "grad_norm": 0.20770589378766816, "learning_rate": 7.20551201119698e-06, "loss": 0.2705, "step": 910 }, { "epoch": 3.7451181911613567, "grad_norm": 0.20437780757014407, "learning_rate": 7.161381174066065e-06, "loss": 0.2829, "step": 911 }, { "epoch": 3.749229188078109, "grad_norm": 0.19567720371080252, "learning_rate": 7.117356403237161e-06, "loss": 0.2813, "step": 912 }, { "epoch": 3.7533401849948613, "grad_norm": 0.19171574936304334, "learning_rate": 7.073438062421094e-06, "loss": 0.2782, "step": 913 }, { "epoch": 3.7574511819116134, "grad_norm": 0.20924848866916773, "learning_rate": 7.029626514449414e-06, "loss": 0.27, "step": 914 }, { "epoch": 3.761562178828366, "grad_norm": 0.20438696705099926, "learning_rate": 6.985922121271409e-06, "loss": 0.2728, "step": 915 }, { "epoch": 3.765673175745118, "grad_norm": 0.2084495335702813, "learning_rate": 6.942325243951098e-06, "loss": 0.2824, "step": 916 }, { "epoch": 3.7697841726618706, "grad_norm": 0.1993990523612008, "learning_rate": 6.898836242664262e-06, "loss": 0.282, "step": 917 }, { "epoch": 3.7738951695786227, "grad_norm": 0.19347775656849484, "learning_rate": 6.855455476695465e-06, "loss": 0.2706, "step": 918 }, { "epoch": 3.7780061664953752, "grad_norm": 0.20109622486576145, "learning_rate": 6.812183304435083e-06, "loss": 0.2801, "step": 919 }, { "epoch": 3.7821171634121273, "grad_norm": 0.18886838722656143, "learning_rate": 6.769020083376341e-06, "loss": 0.2721, "step": 920 }, { "epoch": 3.78622816032888, "grad_norm": 0.208430820513582, "learning_rate": 6.725966170112368e-06, "loss": 0.2686, "step": 921 }, { "epoch": 3.790339157245632, "grad_norm": 0.1967578418393911, "learning_rate": 6.6830219203332415e-06, "loss": 0.2721, "step": 922 }, { "epoch": 3.7944501541623845, "grad_norm": 0.2015892872246403, "learning_rate": 6.640187688823065e-06, "loss": 0.2792, "step": 923 }, { "epoch": 3.7985611510791366, "grad_norm": 0.1938822600108583, "learning_rate": 6.597463829457014e-06, "loss": 0.2799, "step": 924 }, { "epoch": 3.802672147995889, "grad_norm": 0.2023587740694427, "learning_rate": 6.554850695198427e-06, "loss": 0.2695, "step": 925 }, { "epoch": 3.8067831449126412, "grad_norm": 0.19570583847216003, "learning_rate": 6.512348638095887e-06, "loss": 0.2858, "step": 926 }, { "epoch": 3.810894141829394, "grad_norm": 0.19738903231975544, "learning_rate": 6.469958009280315e-06, "loss": 0.2681, "step": 927 }, { "epoch": 3.815005138746146, "grad_norm": 0.20083483818328293, "learning_rate": 6.4276791589620595e-06, "loss": 0.2852, "step": 928 }, { "epoch": 3.819116135662898, "grad_norm": 0.19273874331489446, "learning_rate": 6.385512436428021e-06, "loss": 0.2864, "step": 929 }, { "epoch": 3.8232271325796505, "grad_norm": 0.1869845010972472, "learning_rate": 6.343458190038747e-06, "loss": 0.2727, "step": 930 }, { "epoch": 3.827338129496403, "grad_norm": 0.19346715289339741, "learning_rate": 6.301516767225568e-06, "loss": 0.2739, "step": 931 }, { "epoch": 3.831449126413155, "grad_norm": 0.19227603993401987, "learning_rate": 6.259688514487718e-06, "loss": 0.2758, "step": 932 }, { "epoch": 3.8355601233299073, "grad_norm": 0.20411187735886127, "learning_rate": 6.217973777389483e-06, "loss": 0.2761, "step": 933 }, { "epoch": 3.83967112024666, "grad_norm": 0.18675711473098772, "learning_rate": 6.1763729005573284e-06, "loss": 0.2829, "step": 934 }, { "epoch": 3.8437821171634123, "grad_norm": 0.2123802835671684, "learning_rate": 6.134886227677073e-06, "loss": 0.2922, "step": 935 }, { "epoch": 3.8478931140801644, "grad_norm": 0.18956127541911397, "learning_rate": 6.093514101491034e-06, "loss": 0.2763, "step": 936 }, { "epoch": 3.8520041109969165, "grad_norm": 0.18788309236848885, "learning_rate": 6.052256863795198e-06, "loss": 0.2711, "step": 937 }, { "epoch": 3.856115107913669, "grad_norm": 0.19828249178491697, "learning_rate": 6.0111148554364084e-06, "loss": 0.2799, "step": 938 }, { "epoch": 3.8602261048304216, "grad_norm": 0.18431610567167325, "learning_rate": 5.970088416309532e-06, "loss": 0.2689, "step": 939 }, { "epoch": 3.8643371017471737, "grad_norm": 0.21004802063561837, "learning_rate": 5.929177885354665e-06, "loss": 0.279, "step": 940 }, { "epoch": 3.868448098663926, "grad_norm": 0.18145712447424242, "learning_rate": 5.888383600554326e-06, "loss": 0.2769, "step": 941 }, { "epoch": 3.8725590955806783, "grad_norm": 0.1998489072868665, "learning_rate": 5.8477058989306605e-06, "loss": 0.2902, "step": 942 }, { "epoch": 3.876670092497431, "grad_norm": 0.19349791063075825, "learning_rate": 5.807145116542678e-06, "loss": 0.2772, "step": 943 }, { "epoch": 3.880781089414183, "grad_norm": 0.20224120336775228, "learning_rate": 5.766701588483443e-06, "loss": 0.2766, "step": 944 }, { "epoch": 3.884892086330935, "grad_norm": 0.20201558369754713, "learning_rate": 5.726375648877329e-06, "loss": 0.2711, "step": 945 }, { "epoch": 3.8890030832476876, "grad_norm": 0.186362787594006, "learning_rate": 5.68616763087725e-06, "loss": 0.2637, "step": 946 }, { "epoch": 3.8931140801644397, "grad_norm": 0.18827723220330278, "learning_rate": 5.646077866661912e-06, "loss": 0.2728, "step": 947 }, { "epoch": 3.8972250770811923, "grad_norm": 0.20621122766057245, "learning_rate": 5.606106687433066e-06, "loss": 0.277, "step": 948 }, { "epoch": 3.9013360739979444, "grad_norm": 0.1997165387359167, "learning_rate": 5.5662544234127735e-06, "loss": 0.2852, "step": 949 }, { "epoch": 3.905447070914697, "grad_norm": 0.1986176597393475, "learning_rate": 5.526521403840677e-06, "loss": 0.2724, "step": 950 }, { "epoch": 3.909558067831449, "grad_norm": 0.19315083170854766, "learning_rate": 5.486907956971277e-06, "loss": 0.2654, "step": 951 }, { "epoch": 3.9136690647482015, "grad_norm": 0.19208269826966257, "learning_rate": 5.447414410071232e-06, "loss": 0.28, "step": 952 }, { "epoch": 3.9177800616649536, "grad_norm": 0.1986061425594109, "learning_rate": 5.40804108941664e-06, "loss": 0.2809, "step": 953 }, { "epoch": 3.921891058581706, "grad_norm": 0.18060496237659096, "learning_rate": 5.36878832029035e-06, "loss": 0.2753, "step": 954 }, { "epoch": 3.9260020554984583, "grad_norm": 0.19007144815119342, "learning_rate": 5.329656426979275e-06, "loss": 0.2844, "step": 955 }, { "epoch": 3.930113052415211, "grad_norm": 0.18228170358892676, "learning_rate": 5.290645732771711e-06, "loss": 0.2776, "step": 956 }, { "epoch": 3.934224049331963, "grad_norm": 0.20611317253574513, "learning_rate": 5.251756559954668e-06, "loss": 0.2752, "step": 957 }, { "epoch": 3.9383350462487154, "grad_norm": 0.19496510102086326, "learning_rate": 5.212989229811209e-06, "loss": 0.2703, "step": 958 }, { "epoch": 3.9424460431654675, "grad_norm": 0.18813827312165923, "learning_rate": 5.174344062617789e-06, "loss": 0.2817, "step": 959 }, { "epoch": 3.94655704008222, "grad_norm": 0.19091427031439173, "learning_rate": 5.135821377641616e-06, "loss": 0.2787, "step": 960 }, { "epoch": 3.950668036998972, "grad_norm": 0.1901592123123516, "learning_rate": 5.097421493138008e-06, "loss": 0.2766, "step": 961 }, { "epoch": 3.9547790339157247, "grad_norm": 0.1870098631363826, "learning_rate": 5.059144726347765e-06, "loss": 0.2728, "step": 962 }, { "epoch": 3.958890030832477, "grad_norm": 0.17796954931972553, "learning_rate": 5.020991393494558e-06, "loss": 0.2867, "step": 963 }, { "epoch": 3.963001027749229, "grad_norm": 0.19046713852280395, "learning_rate": 4.9829618097823055e-06, "loss": 0.2675, "step": 964 }, { "epoch": 3.9671120246659815, "grad_norm": 0.19367792434634498, "learning_rate": 4.945056289392565e-06, "loss": 0.2765, "step": 965 }, { "epoch": 3.971223021582734, "grad_norm": 0.18974765427392373, "learning_rate": 4.907275145481947e-06, "loss": 0.2731, "step": 966 }, { "epoch": 3.975334018499486, "grad_norm": 0.18889755922787974, "learning_rate": 4.8696186901795275e-06, "loss": 0.2817, "step": 967 }, { "epoch": 3.979445015416238, "grad_norm": 0.19028199023394596, "learning_rate": 4.832087234584266e-06, "loss": 0.2783, "step": 968 }, { "epoch": 3.9835560123329907, "grad_norm": 0.1964825876876656, "learning_rate": 4.794681088762438e-06, "loss": 0.2744, "step": 969 }, { "epoch": 3.9876670092497433, "grad_norm": 0.17957398832039587, "learning_rate": 4.757400561745069e-06, "loss": 0.2762, "step": 970 }, { "epoch": 3.9917780061664954, "grad_norm": 0.20987505932024647, "learning_rate": 4.720245961525387e-06, "loss": 0.2949, "step": 971 }, { "epoch": 3.9958890030832475, "grad_norm": 0.18879687589648914, "learning_rate": 4.683217595056275e-06, "loss": 0.2746, "step": 972 }, { "epoch": 4.0, "grad_norm": 1.707156689602904, "learning_rate": 4.646315768247731e-06, "loss": 0.2868, "step": 973 }, { "epoch": 4.0041109969167525, "grad_norm": 0.3673275720964706, "learning_rate": 4.609540785964348e-06, "loss": 0.2379, "step": 974 }, { "epoch": 4.008221993833504, "grad_norm": 0.26013071708722996, "learning_rate": 4.572892952022796e-06, "loss": 0.2495, "step": 975 }, { "epoch": 4.012332990750257, "grad_norm": 0.30039166221512403, "learning_rate": 4.5363725691893045e-06, "loss": 0.2434, "step": 976 }, { "epoch": 4.016443987667009, "grad_norm": 0.40331206801802966, "learning_rate": 4.499979939177164e-06, "loss": 0.2413, "step": 977 }, { "epoch": 4.020554984583762, "grad_norm": 0.2653915725640132, "learning_rate": 4.463715362644239e-06, "loss": 0.2415, "step": 978 }, { "epoch": 4.0246659815005135, "grad_norm": 0.2706794398843468, "learning_rate": 4.427579139190474e-06, "loss": 0.2353, "step": 979 }, { "epoch": 4.028776978417266, "grad_norm": 0.33800513453404296, "learning_rate": 4.391571567355428e-06, "loss": 0.244, "step": 980 }, { "epoch": 4.0328879753340185, "grad_norm": 0.2848868937309266, "learning_rate": 4.355692944615806e-06, "loss": 0.2446, "step": 981 }, { "epoch": 4.036998972250771, "grad_norm": 0.213052312700043, "learning_rate": 4.319943567382991e-06, "loss": 0.2446, "step": 982 }, { "epoch": 4.041109969167523, "grad_norm": 0.24448300665475436, "learning_rate": 4.28432373100061e-06, "loss": 0.2383, "step": 983 }, { "epoch": 4.045220966084275, "grad_norm": 0.28289541109409083, "learning_rate": 4.248833729742095e-06, "loss": 0.2335, "step": 984 }, { "epoch": 4.049331963001028, "grad_norm": 0.27075279957678594, "learning_rate": 4.2134738568082325e-06, "loss": 0.2388, "step": 985 }, { "epoch": 4.05344295991778, "grad_norm": 0.2271083193598205, "learning_rate": 4.1782444043247565e-06, "loss": 0.2386, "step": 986 }, { "epoch": 4.057553956834532, "grad_norm": 0.22324730717439883, "learning_rate": 4.143145663339932e-06, "loss": 0.2447, "step": 987 }, { "epoch": 4.061664953751285, "grad_norm": 0.26100760343340185, "learning_rate": 4.108177923822154e-06, "loss": 0.2426, "step": 988 }, { "epoch": 4.065775950668037, "grad_norm": 0.23257567018511596, "learning_rate": 4.073341474657544e-06, "loss": 0.2482, "step": 989 }, { "epoch": 4.06988694758479, "grad_norm": 0.1994071326027501, "learning_rate": 4.03863660364757e-06, "loss": 0.2389, "step": 990 }, { "epoch": 4.073997944501541, "grad_norm": 0.21371643270197568, "learning_rate": 4.004063597506664e-06, "loss": 0.2337, "step": 991 }, { "epoch": 4.078108941418294, "grad_norm": 0.24512669596399653, "learning_rate": 3.969622741859862e-06, "loss": 0.2477, "step": 992 }, { "epoch": 4.082219938335046, "grad_norm": 0.21744045295237915, "learning_rate": 3.935314321240433e-06, "loss": 0.2405, "step": 993 }, { "epoch": 4.086330935251799, "grad_norm": 0.20192278557379797, "learning_rate": 3.90113861908753e-06, "loss": 0.2394, "step": 994 }, { "epoch": 4.090441932168551, "grad_norm": 0.2027471703666848, "learning_rate": 3.867095917743862e-06, "loss": 0.2326, "step": 995 }, { "epoch": 4.094552929085303, "grad_norm": 0.20882580151186148, "learning_rate": 3.8331864984533404e-06, "loss": 0.2362, "step": 996 }, { "epoch": 4.098663926002056, "grad_norm": 0.1930471416017011, "learning_rate": 3.799410641358776e-06, "loss": 0.2462, "step": 997 }, { "epoch": 4.102774922918808, "grad_norm": 0.19859635746881463, "learning_rate": 3.7657686254995483e-06, "loss": 0.2404, "step": 998 }, { "epoch": 4.10688591983556, "grad_norm": 0.1983957254871405, "learning_rate": 3.7322607288093117e-06, "loss": 0.2398, "step": 999 }, { "epoch": 4.110996916752312, "grad_norm": 0.22293857279886048, "learning_rate": 3.6988872281136855e-06, "loss": 0.2363, "step": 1000 }, { "epoch": 4.115107913669065, "grad_norm": 0.20443840443106004, "learning_rate": 3.66564839912799e-06, "loss": 0.2318, "step": 1001 }, { "epoch": 4.1192189105858175, "grad_norm": 0.17966769630726293, "learning_rate": 3.632544516454941e-06, "loss": 0.2359, "step": 1002 }, { "epoch": 4.123329907502569, "grad_norm": 0.19432549741475053, "learning_rate": 3.5995758535823997e-06, "loss": 0.2316, "step": 1003 }, { "epoch": 4.127440904419322, "grad_norm": 0.18881014978005276, "learning_rate": 3.566742682881119e-06, "loss": 0.2608, "step": 1004 }, { "epoch": 4.131551901336074, "grad_norm": 0.19088807670118796, "learning_rate": 3.534045275602467e-06, "loss": 0.242, "step": 1005 }, { "epoch": 4.135662898252827, "grad_norm": 0.1816637262264018, "learning_rate": 3.501483901876208e-06, "loss": 0.244, "step": 1006 }, { "epoch": 4.139773895169578, "grad_norm": 0.19010713069523394, "learning_rate": 3.469058830708263e-06, "loss": 0.2324, "step": 1007 }, { "epoch": 4.143884892086331, "grad_norm": 0.19620537155899534, "learning_rate": 3.436770329978494e-06, "loss": 0.2481, "step": 1008 }, { "epoch": 4.1479958890030835, "grad_norm": 0.18566900979279455, "learning_rate": 3.4046186664384795e-06, "loss": 0.2432, "step": 1009 }, { "epoch": 4.152106885919835, "grad_norm": 0.1755700170371331, "learning_rate": 3.3726041057093186e-06, "loss": 0.2386, "step": 1010 }, { "epoch": 4.156217882836588, "grad_norm": 0.18096902328410783, "learning_rate": 3.3407269122794373e-06, "loss": 0.2487, "step": 1011 }, { "epoch": 4.16032887975334, "grad_norm": 0.192754387487128, "learning_rate": 3.3089873495023995e-06, "loss": 0.234, "step": 1012 }, { "epoch": 4.164439876670093, "grad_norm": 0.19892387100550088, "learning_rate": 3.2773856795947336e-06, "loss": 0.2339, "step": 1013 }, { "epoch": 4.168550873586844, "grad_norm": 0.18465157283491226, "learning_rate": 3.2459221636337633e-06, "loss": 0.2379, "step": 1014 }, { "epoch": 4.172661870503597, "grad_norm": 0.1899662552430034, "learning_rate": 3.214597061555458e-06, "loss": 0.2292, "step": 1015 }, { "epoch": 4.1767728674203495, "grad_norm": 0.18665807494909734, "learning_rate": 3.1834106321522727e-06, "loss": 0.2371, "step": 1016 }, { "epoch": 4.180883864337102, "grad_norm": 0.1854509036542964, "learning_rate": 3.152363133071024e-06, "loss": 0.2433, "step": 1017 }, { "epoch": 4.184994861253854, "grad_norm": 0.20338354606609246, "learning_rate": 3.12145482081075e-06, "loss": 0.2373, "step": 1018 }, { "epoch": 4.189105858170606, "grad_norm": 0.1823424640205926, "learning_rate": 3.0906859507206044e-06, "loss": 0.2425, "step": 1019 }, { "epoch": 4.193216855087359, "grad_norm": 0.18646817047228667, "learning_rate": 3.0600567769977286e-06, "loss": 0.2388, "step": 1020 }, { "epoch": 4.197327852004111, "grad_norm": 0.19248044840190429, "learning_rate": 3.0295675526851686e-06, "loss": 0.2327, "step": 1021 }, { "epoch": 4.201438848920863, "grad_norm": 0.1895682953006883, "learning_rate": 2.9992185296697763e-06, "loss": 0.2494, "step": 1022 }, { "epoch": 4.2055498458376155, "grad_norm": 0.1775774161260345, "learning_rate": 2.9690099586801223e-06, "loss": 0.2431, "step": 1023 }, { "epoch": 4.209660842754368, "grad_norm": 0.18688744320331976, "learning_rate": 2.938942089284453e-06, "loss": 0.2243, "step": 1024 }, { "epoch": 4.213771839671121, "grad_norm": 0.18321913204838605, "learning_rate": 2.909015169888587e-06, "loss": 0.2361, "step": 1025 }, { "epoch": 4.217882836587872, "grad_norm": 0.18558364928419416, "learning_rate": 2.879229447733893e-06, "loss": 0.2438, "step": 1026 }, { "epoch": 4.221993833504625, "grad_norm": 0.18370056819501662, "learning_rate": 2.849585168895237e-06, "loss": 0.2372, "step": 1027 }, { "epoch": 4.226104830421377, "grad_norm": 0.17922623411257754, "learning_rate": 2.8200825782789466e-06, "loss": 0.2389, "step": 1028 }, { "epoch": 4.23021582733813, "grad_norm": 0.1814704060047799, "learning_rate": 2.790721919620798e-06, "loss": 0.2299, "step": 1029 }, { "epoch": 4.2343268242548815, "grad_norm": 0.18999808738781843, "learning_rate": 2.7615034354839942e-06, "loss": 0.2346, "step": 1030 }, { "epoch": 4.238437821171634, "grad_norm": 0.18017749013937312, "learning_rate": 2.7324273672571577e-06, "loss": 0.2337, "step": 1031 }, { "epoch": 4.242548818088387, "grad_norm": 0.1799591693551389, "learning_rate": 2.7034939551523476e-06, "loss": 0.2439, "step": 1032 }, { "epoch": 4.246659815005139, "grad_norm": 0.18327330613798448, "learning_rate": 2.6747034382030655e-06, "loss": 0.2445, "step": 1033 }, { "epoch": 4.250770811921891, "grad_norm": 0.17889375387571904, "learning_rate": 2.646056054262287e-06, "loss": 0.2467, "step": 1034 }, { "epoch": 4.254881808838643, "grad_norm": 0.17679323974968908, "learning_rate": 2.6175520400004907e-06, "loss": 0.2405, "step": 1035 }, { "epoch": 4.258992805755396, "grad_norm": 0.17336200096095578, "learning_rate": 2.5891916309037046e-06, "loss": 0.2367, "step": 1036 }, { "epoch": 4.263103802672148, "grad_norm": 0.1862342732350899, "learning_rate": 2.560975061271569e-06, "loss": 0.2294, "step": 1037 }, { "epoch": 4.2672147995889, "grad_norm": 0.1761467998629582, "learning_rate": 2.5329025642153873e-06, "loss": 0.2448, "step": 1038 }, { "epoch": 4.271325796505653, "grad_norm": 0.17762679763602063, "learning_rate": 2.5049743716562104e-06, "loss": 0.2459, "step": 1039 }, { "epoch": 4.275436793422405, "grad_norm": 0.17679704716813474, "learning_rate": 2.4771907143229124e-06, "loss": 0.2366, "step": 1040 }, { "epoch": 4.279547790339157, "grad_norm": 0.18512587191088023, "learning_rate": 2.4495518217502936e-06, "loss": 0.2334, "step": 1041 }, { "epoch": 4.283658787255909, "grad_norm": 0.17098883894517244, "learning_rate": 2.422057922277179e-06, "loss": 0.2366, "step": 1042 }, { "epoch": 4.287769784172662, "grad_norm": 0.19038355517722344, "learning_rate": 2.3947092430445284e-06, "loss": 0.2361, "step": 1043 }, { "epoch": 4.291880781089414, "grad_norm": 0.1807683215327849, "learning_rate": 2.367506009993572e-06, "loss": 0.2314, "step": 1044 }, { "epoch": 4.295991778006167, "grad_norm": 0.18180874713294695, "learning_rate": 2.34044844786393e-06, "loss": 0.2385, "step": 1045 }, { "epoch": 4.300102774922919, "grad_norm": 0.18147750075878016, "learning_rate": 2.313536780191763e-06, "loss": 0.2336, "step": 1046 }, { "epoch": 4.304213771839671, "grad_norm": 0.1782373533284217, "learning_rate": 2.2867712293079223e-06, "loss": 0.2356, "step": 1047 }, { "epoch": 4.308324768756424, "grad_norm": 0.17802709783230702, "learning_rate": 2.2601520163361166e-06, "loss": 0.2445, "step": 1048 }, { "epoch": 4.312435765673175, "grad_norm": 0.17602254086438468, "learning_rate": 2.233679361191081e-06, "loss": 0.2296, "step": 1049 }, { "epoch": 4.316546762589928, "grad_norm": 0.17604437821882946, "learning_rate": 2.2073534825767683e-06, "loss": 0.2493, "step": 1050 }, { "epoch": 4.32065775950668, "grad_norm": 0.18364670883928147, "learning_rate": 2.18117459798453e-06, "loss": 0.2332, "step": 1051 }, { "epoch": 4.324768756423433, "grad_norm": 0.17647874008223446, "learning_rate": 2.155142923691329e-06, "loss": 0.2434, "step": 1052 }, { "epoch": 4.328879753340185, "grad_norm": 0.1821284298329628, "learning_rate": 2.129258674757948e-06, "loss": 0.2405, "step": 1053 }, { "epoch": 4.332990750256937, "grad_norm": 0.17780536510155415, "learning_rate": 2.103522065027217e-06, "loss": 0.2352, "step": 1054 }, { "epoch": 4.33710174717369, "grad_norm": 0.17826171239681762, "learning_rate": 2.07793330712224e-06, "loss": 0.2389, "step": 1055 }, { "epoch": 4.341212744090442, "grad_norm": 0.17939747251527152, "learning_rate": 2.0524926124446497e-06, "loss": 0.2419, "step": 1056 }, { "epoch": 4.345323741007194, "grad_norm": 0.18203279406090278, "learning_rate": 2.0272001911728466e-06, "loss": 0.237, "step": 1057 }, { "epoch": 4.349434737923946, "grad_norm": 0.1797008020492476, "learning_rate": 2.0020562522602716e-06, "loss": 0.2341, "step": 1058 }, { "epoch": 4.353545734840699, "grad_norm": 0.1739746708895779, "learning_rate": 1.9770610034336823e-06, "loss": 0.2391, "step": 1059 }, { "epoch": 4.3576567317574515, "grad_norm": 0.18058019470972567, "learning_rate": 1.9522146511914265e-06, "loss": 0.2322, "step": 1060 }, { "epoch": 4.361767728674203, "grad_norm": 0.17826352479236454, "learning_rate": 1.927517400801746e-06, "loss": 0.2422, "step": 1061 }, { "epoch": 4.365878725590956, "grad_norm": 0.16893750635969274, "learning_rate": 1.902969456301076e-06, "loss": 0.2332, "step": 1062 }, { "epoch": 4.369989722507708, "grad_norm": 0.1782753702044134, "learning_rate": 1.8785710204923612e-06, "loss": 0.2385, "step": 1063 }, { "epoch": 4.374100719424461, "grad_norm": 0.18141238302838078, "learning_rate": 1.8543222949433736e-06, "loss": 0.2463, "step": 1064 }, { "epoch": 4.378211716341212, "grad_norm": 0.17681464500466545, "learning_rate": 1.8302234799850671e-06, "loss": 0.2441, "step": 1065 }, { "epoch": 4.382322713257965, "grad_norm": 0.17491562407701997, "learning_rate": 1.8062747747098974e-06, "loss": 0.2359, "step": 1066 }, { "epoch": 4.3864337101747175, "grad_norm": 0.17582489650428018, "learning_rate": 1.782476376970188e-06, "loss": 0.2518, "step": 1067 }, { "epoch": 4.39054470709147, "grad_norm": 0.179309298657522, "learning_rate": 1.7588284833765024e-06, "loss": 0.2509, "step": 1068 }, { "epoch": 4.394655704008222, "grad_norm": 0.17355362198390345, "learning_rate": 1.7353312892960095e-06, "loss": 0.2396, "step": 1069 }, { "epoch": 4.398766700924974, "grad_norm": 0.20669657827730264, "learning_rate": 1.7119849888508766e-06, "loss": 0.2401, "step": 1070 }, { "epoch": 4.402877697841727, "grad_norm": 0.1759615590766294, "learning_rate": 1.6887897749166548e-06, "loss": 0.239, "step": 1071 }, { "epoch": 4.406988694758479, "grad_norm": 0.1867129692924266, "learning_rate": 1.6657458391207049e-06, "loss": 0.24, "step": 1072 }, { "epoch": 4.411099691675231, "grad_norm": 0.17724290686658428, "learning_rate": 1.6428533718405914e-06, "loss": 0.2485, "step": 1073 }, { "epoch": 4.4152106885919835, "grad_norm": 0.1769255669225784, "learning_rate": 1.6201125622025315e-06, "loss": 0.2343, "step": 1074 }, { "epoch": 4.419321685508736, "grad_norm": 0.17424978161900648, "learning_rate": 1.5975235980798153e-06, "loss": 0.2299, "step": 1075 }, { "epoch": 4.423432682425489, "grad_norm": 0.17407143372977305, "learning_rate": 1.5750866660912634e-06, "loss": 0.2294, "step": 1076 }, { "epoch": 4.42754367934224, "grad_norm": 0.17791660590457703, "learning_rate": 1.5528019515996783e-06, "loss": 0.2425, "step": 1077 }, { "epoch": 4.431654676258993, "grad_norm": 0.18301382705782807, "learning_rate": 1.5306696387103227e-06, "loss": 0.2343, "step": 1078 }, { "epoch": 4.435765673175745, "grad_norm": 0.17589070591387826, "learning_rate": 1.5086899102693875e-06, "loss": 0.2469, "step": 1079 }, { "epoch": 4.439876670092497, "grad_norm": 0.17198527500762634, "learning_rate": 1.486862947862493e-06, "loss": 0.2463, "step": 1080 }, { "epoch": 4.4439876670092495, "grad_norm": 0.17792281862140422, "learning_rate": 1.465188931813175e-06, "loss": 0.2301, "step": 1081 }, { "epoch": 4.448098663926002, "grad_norm": 0.17628369792032114, "learning_rate": 1.4436680411814097e-06, "loss": 0.2399, "step": 1082 }, { "epoch": 4.452209660842755, "grad_norm": 0.17439136560526375, "learning_rate": 1.42230045376212e-06, "loss": 0.237, "step": 1083 }, { "epoch": 4.456320657759507, "grad_norm": 0.17943067929919523, "learning_rate": 1.4010863460837132e-06, "loss": 0.2405, "step": 1084 }, { "epoch": 4.460431654676259, "grad_norm": 0.17235533535415476, "learning_rate": 1.380025893406638e-06, "loss": 0.2397, "step": 1085 }, { "epoch": 4.464542651593011, "grad_norm": 0.17886870223554543, "learning_rate": 1.3591192697219003e-06, "loss": 0.2409, "step": 1086 }, { "epoch": 4.468653648509764, "grad_norm": 0.16738079998494204, "learning_rate": 1.3383666477496627e-06, "loss": 0.2387, "step": 1087 }, { "epoch": 4.4727646454265155, "grad_norm": 0.16758787660813548, "learning_rate": 1.3177681989377944e-06, "loss": 0.2417, "step": 1088 }, { "epoch": 4.476875642343268, "grad_norm": 0.1752202184059869, "learning_rate": 1.2973240934604658e-06, "loss": 0.2274, "step": 1089 }, { "epoch": 4.480986639260021, "grad_norm": 0.17463210365794904, "learning_rate": 1.277034500216736e-06, "loss": 0.226, "step": 1090 }, { "epoch": 4.485097636176773, "grad_norm": 0.17601812963083546, "learning_rate": 1.2568995868291656e-06, "loss": 0.2491, "step": 1091 }, { "epoch": 4.489208633093525, "grad_norm": 0.1775954820916016, "learning_rate": 1.236919519642421e-06, "loss": 0.2432, "step": 1092 }, { "epoch": 4.493319630010277, "grad_norm": 0.17531188232428954, "learning_rate": 1.2170944637219106e-06, "loss": 0.2417, "step": 1093 }, { "epoch": 4.49743062692703, "grad_norm": 0.1752476486120662, "learning_rate": 1.1974245828524156e-06, "loss": 0.2274, "step": 1094 }, { "epoch": 4.501541623843782, "grad_norm": 0.18283725978641932, "learning_rate": 1.177910039536736e-06, "loss": 0.2408, "step": 1095 }, { "epoch": 4.505652620760534, "grad_norm": 0.17390337086901564, "learning_rate": 1.1585509949943518e-06, "loss": 0.2374, "step": 1096 }, { "epoch": 4.509763617677287, "grad_norm": 0.1780578101655513, "learning_rate": 1.1393476091600886e-06, "loss": 0.2473, "step": 1097 }, { "epoch": 4.513874614594039, "grad_norm": 0.17965538239208087, "learning_rate": 1.120300040682798e-06, "loss": 0.244, "step": 1098 }, { "epoch": 4.517985611510792, "grad_norm": 0.17589641804084827, "learning_rate": 1.1014084469240461e-06, "loss": 0.2435, "step": 1099 }, { "epoch": 4.522096608427543, "grad_norm": 0.17442573382633822, "learning_rate": 1.0826729839568073e-06, "loss": 0.2417, "step": 1100 }, { "epoch": 4.526207605344296, "grad_norm": 0.1807965837015226, "learning_rate": 1.0640938065641926e-06, "loss": 0.2424, "step": 1101 }, { "epoch": 4.530318602261048, "grad_norm": 0.17785902254626473, "learning_rate": 1.0456710682381455e-06, "loss": 0.2546, "step": 1102 }, { "epoch": 4.534429599177801, "grad_norm": 0.17258906030475882, "learning_rate": 1.0274049211781967e-06, "loss": 0.2422, "step": 1103 }, { "epoch": 4.538540596094553, "grad_norm": 1.2728190292585875, "learning_rate": 1.009295516290194e-06, "loss": 0.2608, "step": 1104 }, { "epoch": 4.542651593011305, "grad_norm": 0.17125359621163755, "learning_rate": 9.913430031850635e-07, "loss": 0.2356, "step": 1105 }, { "epoch": 4.546762589928058, "grad_norm": 0.17651600362640116, "learning_rate": 9.735475301775632e-07, "loss": 0.246, "step": 1106 }, { "epoch": 4.55087358684481, "grad_norm": 0.16931432337364227, "learning_rate": 9.559092442850671e-07, "loss": 0.2289, "step": 1107 }, { "epoch": 4.554984583761562, "grad_norm": 0.17727932117256406, "learning_rate": 9.384282912263475e-07, "loss": 0.2334, "step": 1108 }, { "epoch": 4.559095580678314, "grad_norm": 0.17674518726323318, "learning_rate": 9.211048154203661e-07, "loss": 0.2512, "step": 1109 }, { "epoch": 4.563206577595067, "grad_norm": 0.17332686615916262, "learning_rate": 9.039389599850912e-07, "loss": 0.2339, "step": 1110 }, { "epoch": 4.567317574511819, "grad_norm": 0.1651967177583125, "learning_rate": 8.869308667363063e-07, "loss": 0.241, "step": 1111 }, { "epoch": 4.571428571428571, "grad_norm": 0.17416950220257496, "learning_rate": 8.700806761864466e-07, "loss": 0.2329, "step": 1112 }, { "epoch": 4.575539568345324, "grad_norm": 0.16574888125550336, "learning_rate": 8.533885275434283e-07, "loss": 0.2429, "step": 1113 }, { "epoch": 4.579650565262076, "grad_norm": 0.1731537234968332, "learning_rate": 8.368545587095056e-07, "loss": 0.2414, "step": 1114 }, { "epoch": 4.583761562178829, "grad_norm": 0.17116481210634382, "learning_rate": 8.20478906280131e-07, "loss": 0.2405, "step": 1115 }, { "epoch": 4.5878725590955804, "grad_norm": 0.17008303524989962, "learning_rate": 8.042617055428215e-07, "loss": 0.2313, "step": 1116 }, { "epoch": 4.591983556012333, "grad_norm": 0.1724744667796713, "learning_rate": 7.882030904760518e-07, "loss": 0.238, "step": 1117 }, { "epoch": 4.5960945529290855, "grad_norm": 0.17588735295940247, "learning_rate": 7.723031937481318e-07, "loss": 0.2497, "step": 1118 }, { "epoch": 4.600205549845837, "grad_norm": 0.17565531889855016, "learning_rate": 7.565621467161244e-07, "loss": 0.2563, "step": 1119 }, { "epoch": 4.60431654676259, "grad_norm": 0.1730755101477369, "learning_rate": 7.409800794247557e-07, "loss": 0.2337, "step": 1120 }, { "epoch": 4.608427543679342, "grad_norm": 0.16854464186311885, "learning_rate": 7.25557120605338e-07, "loss": 0.2445, "step": 1121 }, { "epoch": 4.612538540596095, "grad_norm": 0.17181736526067778, "learning_rate": 7.102933976747084e-07, "loss": 0.2356, "step": 1122 }, { "epoch": 4.616649537512847, "grad_norm": 0.17224841346483227, "learning_rate": 6.951890367341763e-07, "loss": 0.2404, "step": 1123 }, { "epoch": 4.620760534429599, "grad_norm": 0.174217100719342, "learning_rate": 6.802441625684774e-07, "loss": 0.2505, "step": 1124 }, { "epoch": 4.6248715313463515, "grad_norm": 0.172147168668161, "learning_rate": 6.654588986447597e-07, "loss": 0.2387, "step": 1125 }, { "epoch": 4.628982528263104, "grad_norm": 0.17025566837994996, "learning_rate": 6.508333671115341e-07, "loss": 0.2445, "step": 1126 }, { "epoch": 4.633093525179856, "grad_norm": 0.1791903634128583, "learning_rate": 6.363676887976944e-07, "loss": 0.2458, "step": 1127 }, { "epoch": 4.637204522096608, "grad_norm": 0.17050764877819538, "learning_rate": 6.220619832114971e-07, "loss": 0.2504, "step": 1128 }, { "epoch": 4.641315519013361, "grad_norm": 0.17379476155710213, "learning_rate": 6.079163685395917e-07, "loss": 0.2426, "step": 1129 }, { "epoch": 4.645426515930113, "grad_norm": 0.17731694259336384, "learning_rate": 5.939309616460276e-07, "loss": 0.2356, "step": 1130 }, { "epoch": 4.649537512846865, "grad_norm": 0.17342657076328494, "learning_rate": 5.801058780713021e-07, "loss": 0.2454, "step": 1131 }, { "epoch": 4.6536485097636175, "grad_norm": 0.17506716139804732, "learning_rate": 5.664412320314027e-07, "loss": 0.2466, "step": 1132 }, { "epoch": 4.65775950668037, "grad_norm": 0.1760318065249804, "learning_rate": 5.529371364168535e-07, "loss": 0.2298, "step": 1133 }, { "epoch": 4.661870503597123, "grad_norm": 0.17418792165950514, "learning_rate": 5.395937027918008e-07, "loss": 0.2352, "step": 1134 }, { "epoch": 4.665981500513874, "grad_norm": 0.1706417249640066, "learning_rate": 5.264110413930735e-07, "loss": 0.2398, "step": 1135 }, { "epoch": 4.670092497430627, "grad_norm": 0.17721194204664026, "learning_rate": 5.133892611292846e-07, "loss": 0.2378, "step": 1136 }, { "epoch": 4.674203494347379, "grad_norm": 0.18022478809124595, "learning_rate": 5.005284695799217e-07, "loss": 0.2491, "step": 1137 }, { "epoch": 4.678314491264132, "grad_norm": 0.17519368559887136, "learning_rate": 4.878287729944697e-07, "loss": 0.2438, "step": 1138 }, { "epoch": 4.6824254881808836, "grad_norm": 0.1777158927515907, "learning_rate": 4.7529027629152234e-07, "loss": 0.2364, "step": 1139 }, { "epoch": 4.686536485097636, "grad_norm": 0.1707755427569748, "learning_rate": 4.6291308305792315e-07, "loss": 0.2453, "step": 1140 }, { "epoch": 4.690647482014389, "grad_norm": 0.16668258759849372, "learning_rate": 4.5069729554790386e-07, "loss": 0.2402, "step": 1141 }, { "epoch": 4.694758478931141, "grad_norm": 0.16459971581816116, "learning_rate": 4.386430146822429e-07, "loss": 0.2483, "step": 1142 }, { "epoch": 4.698869475847893, "grad_norm": 0.17875849392738621, "learning_rate": 4.2675034004743045e-07, "loss": 0.241, "step": 1143 }, { "epoch": 4.702980472764645, "grad_norm": 0.17296505254178202, "learning_rate": 4.150193698948468e-07, "loss": 0.2465, "step": 1144 }, { "epoch": 4.707091469681398, "grad_norm": 0.17224757284909492, "learning_rate": 4.034502011399499e-07, "loss": 0.2385, "step": 1145 }, { "epoch": 4.7112024665981505, "grad_norm": 0.17411551470001055, "learning_rate": 3.92042929361478e-07, "loss": 0.2362, "step": 1146 }, { "epoch": 4.715313463514902, "grad_norm": 0.17065619759470402, "learning_rate": 3.8079764880064817e-07, "loss": 0.2367, "step": 1147 }, { "epoch": 4.719424460431655, "grad_norm": 0.1692629166872625, "learning_rate": 3.6971445236039685e-07, "loss": 0.2441, "step": 1148 }, { "epoch": 4.723535457348407, "grad_norm": 0.1727903716287266, "learning_rate": 3.587934316045938e-07, "loss": 0.2332, "step": 1149 }, { "epoch": 4.727646454265159, "grad_norm": 0.16765805252498014, "learning_rate": 3.4803467675729843e-07, "loss": 0.2436, "step": 1150 }, { "epoch": 4.731757451181911, "grad_norm": 0.1685165614846259, "learning_rate": 3.374382767020068e-07, "loss": 0.2462, "step": 1151 }, { "epoch": 4.735868448098664, "grad_norm": 0.17176323835480908, "learning_rate": 3.270043189809213e-07, "loss": 0.2475, "step": 1152 }, { "epoch": 4.7399794450154165, "grad_norm": 0.17054983226303438, "learning_rate": 3.167328897942268e-07, "loss": 0.2396, "step": 1153 }, { "epoch": 4.744090441932169, "grad_norm": 0.17377255003391895, "learning_rate": 3.0662407399937757e-07, "loss": 0.2414, "step": 1154 }, { "epoch": 4.748201438848921, "grad_norm": 0.1707561407227221, "learning_rate": 2.96677955110396e-07, "loss": 0.2374, "step": 1155 }, { "epoch": 4.752312435765673, "grad_norm": 0.1720410967594104, "learning_rate": 2.8689461529718634e-07, "loss": 0.2439, "step": 1156 }, { "epoch": 4.756423432682426, "grad_norm": 0.18296516134186272, "learning_rate": 2.7727413538484625e-07, "loss": 0.2361, "step": 1157 }, { "epoch": 4.760534429599177, "grad_norm": 0.17896247286039071, "learning_rate": 2.678165948530143e-07, "loss": 0.2356, "step": 1158 }, { "epoch": 4.76464542651593, "grad_norm": 0.17407845197351318, "learning_rate": 2.5852207183519885e-07, "loss": 0.2251, "step": 1159 }, { "epoch": 4.7687564234326825, "grad_norm": 0.17062892002775074, "learning_rate": 2.493906431181392e-07, "loss": 0.2438, "step": 1160 }, { "epoch": 4.772867420349435, "grad_norm": 0.1672404807221335, "learning_rate": 2.4042238414117016e-07, "loss": 0.2261, "step": 1161 }, { "epoch": 4.7769784172661875, "grad_norm": 0.17275226054969903, "learning_rate": 2.3161736899560249e-07, "loss": 0.2394, "step": 1162 }, { "epoch": 4.781089414182939, "grad_norm": 0.17356630567530312, "learning_rate": 2.2297567042410372e-07, "loss": 0.2345, "step": 1163 }, { "epoch": 4.785200411099692, "grad_norm": 0.17592520932271608, "learning_rate": 2.1449735982010278e-07, "loss": 0.2431, "step": 1164 }, { "epoch": 4.789311408016444, "grad_norm": 0.17039915938031028, "learning_rate": 2.0618250722719501e-07, "loss": 0.2431, "step": 1165 }, { "epoch": 4.793422404933196, "grad_norm": 0.1730081549716055, "learning_rate": 1.9803118133857157e-07, "loss": 0.2486, "step": 1166 }, { "epoch": 4.7975334018499485, "grad_norm": 0.17038132685762578, "learning_rate": 1.9004344949644425e-07, "loss": 0.2409, "step": 1167 }, { "epoch": 4.801644398766701, "grad_norm": 0.1743699418678854, "learning_rate": 1.8221937769149045e-07, "loss": 0.2365, "step": 1168 }, { "epoch": 4.805755395683454, "grad_norm": 0.16868706789230756, "learning_rate": 1.745590305623157e-07, "loss": 0.2415, "step": 1169 }, { "epoch": 4.809866392600205, "grad_norm": 0.17209405727243507, "learning_rate": 1.6706247139490318e-07, "loss": 0.2434, "step": 1170 }, { "epoch": 4.813977389516958, "grad_norm": 0.17768294930126322, "learning_rate": 1.5972976212211388e-07, "loss": 0.2333, "step": 1171 }, { "epoch": 4.81808838643371, "grad_norm": 0.1692714430601308, "learning_rate": 1.525609633231495e-07, "loss": 0.247, "step": 1172 }, { "epoch": 4.822199383350463, "grad_norm": 0.16825815560990823, "learning_rate": 1.455561342230749e-07, "loss": 0.249, "step": 1173 }, { "epoch": 4.8263103802672145, "grad_norm": 0.17141290386405647, "learning_rate": 1.3871533269231187e-07, "loss": 0.2547, "step": 1174 }, { "epoch": 4.830421377183967, "grad_norm": 0.17040562779973573, "learning_rate": 1.3203861524617278e-07, "loss": 0.2519, "step": 1175 }, { "epoch": 4.83453237410072, "grad_norm": 0.1677164586091278, "learning_rate": 1.2552603704438115e-07, "loss": 0.2334, "step": 1176 }, { "epoch": 4.838643371017472, "grad_norm": 0.1695089968957749, "learning_rate": 1.1917765189063402e-07, "loss": 0.243, "step": 1177 }, { "epoch": 4.842754367934224, "grad_norm": 0.17083241390302353, "learning_rate": 1.1299351223214017e-07, "loss": 0.2349, "step": 1178 }, { "epoch": 4.846865364850976, "grad_norm": 0.17067355512311883, "learning_rate": 1.069736691591916e-07, "loss": 0.2392, "step": 1179 }, { "epoch": 4.850976361767729, "grad_norm": 0.16913426842634588, "learning_rate": 1.0111817240475052e-07, "loss": 0.23, "step": 1180 }, { "epoch": 4.8550873586844805, "grad_norm": 0.17062118220717767, "learning_rate": 9.542707034402299e-08, "loss": 0.2358, "step": 1181 }, { "epoch": 4.859198355601233, "grad_norm": 0.17029660865240914, "learning_rate": 8.990040999407701e-08, "loss": 0.2302, "step": 1182 }, { "epoch": 4.863309352517986, "grad_norm": 0.16773211882510938, "learning_rate": 8.453823701343622e-08, "loss": 0.245, "step": 1183 }, { "epoch": 4.867420349434738, "grad_norm": 0.16985639544523204, "learning_rate": 7.93405957017157e-08, "loss": 0.2345, "step": 1184 }, { "epoch": 4.871531346351491, "grad_norm": 0.16908464736483494, "learning_rate": 7.430752899924898e-08, "loss": 0.2413, "step": 1185 }, { "epoch": 4.875642343268242, "grad_norm": 0.1759346258041749, "learning_rate": 6.943907848673937e-08, "loss": 0.2427, "step": 1186 }, { "epoch": 4.879753340184995, "grad_norm": 0.17174700886638744, "learning_rate": 6.473528438490916e-08, "loss": 0.2439, "step": 1187 }, { "epoch": 4.883864337101747, "grad_norm": 0.19693454366502605, "learning_rate": 6.019618555417328e-08, "loss": 0.2377, "step": 1188 }, { "epoch": 4.887975334018499, "grad_norm": 0.17754029524221127, "learning_rate": 5.58218194943172e-08, "loss": 0.2293, "step": 1189 }, { "epoch": 4.892086330935252, "grad_norm": 0.1700686268287155, "learning_rate": 5.161222234418173e-08, "loss": 0.2416, "step": 1190 }, { "epoch": 4.896197327852004, "grad_norm": 0.1778073111241822, "learning_rate": 4.756742888136989e-08, "loss": 0.245, "step": 1191 }, { "epoch": 4.900308324768757, "grad_norm": 0.17085644813691567, "learning_rate": 4.3687472521962704e-08, "loss": 0.2386, "step": 1192 }, { "epoch": 4.904419321685509, "grad_norm": 0.17614275321803505, "learning_rate": 3.997238532023273e-08, "loss": 0.2378, "step": 1193 }, { "epoch": 4.908530318602261, "grad_norm": 0.16573489640277006, "learning_rate": 3.642219796839097e-08, "loss": 0.2386, "step": 1194 }, { "epoch": 4.912641315519013, "grad_norm": 0.16678550464295586, "learning_rate": 3.303693979632039e-08, "loss": 0.2411, "step": 1195 }, { "epoch": 4.916752312435766, "grad_norm": 0.1683860743917499, "learning_rate": 2.981663877134944e-08, "loss": 0.2443, "step": 1196 }, { "epoch": 4.920863309352518, "grad_norm": 0.16597729886542864, "learning_rate": 2.6761321498005587e-08, "loss": 0.2408, "step": 1197 }, { "epoch": 4.92497430626927, "grad_norm": 0.1718127781713061, "learning_rate": 2.3871013217806605e-08, "loss": 0.2412, "step": 1198 }, { "epoch": 4.929085303186023, "grad_norm": 0.17097349823279034, "learning_rate": 2.1145737809045162e-08, "loss": 0.2421, "step": 1199 }, { "epoch": 4.933196300102775, "grad_norm": 0.17427216115981084, "learning_rate": 1.8585517786597894e-08, "loss": 0.2381, "step": 1200 }, { "epoch": 4.937307297019527, "grad_norm": 0.1744216439196038, "learning_rate": 1.6190374301727762e-08, "loss": 0.2282, "step": 1201 }, { "epoch": 4.941418293936279, "grad_norm": 0.17445833543773084, "learning_rate": 1.3960327141926411e-08, "loss": 0.2299, "step": 1202 }, { "epoch": 4.945529290853032, "grad_norm": 0.17503271001824838, "learning_rate": 1.1895394730738751e-08, "loss": 0.2333, "step": 1203 }, { "epoch": 4.9496402877697845, "grad_norm": 0.1689882913201602, "learning_rate": 9.995594127607534e-09, "loss": 0.2426, "step": 1204 }, { "epoch": 4.953751284686536, "grad_norm": 0.17067106138398636, "learning_rate": 8.260941027746772e-09, "loss": 0.2477, "step": 1205 }, { "epoch": 4.957862281603289, "grad_norm": 0.17523534599452864, "learning_rate": 6.6914497619996465e-09, "loss": 0.2362, "step": 1206 }, { "epoch": 4.961973278520041, "grad_norm": 0.16958624262500685, "learning_rate": 5.287133296723035e-09, "loss": 0.2416, "step": 1207 }, { "epoch": 4.966084275436794, "grad_norm": 0.17141643376988966, "learning_rate": 4.048003233687592e-09, "loss": 0.2319, "step": 1208 }, { "epoch": 4.970195272353545, "grad_norm": 0.17444671360259928, "learning_rate": 2.974069809964508e-09, "loss": 0.2442, "step": 1209 }, { "epoch": 4.974306269270298, "grad_norm": 0.16938004185341077, "learning_rate": 2.065341897865558e-09, "loss": 0.2482, "step": 1210 }, { "epoch": 4.9784172661870505, "grad_norm": 0.16777811457050917, "learning_rate": 1.32182700484762e-09, "loss": 0.2465, "step": 1211 }, { "epoch": 4.982528263103803, "grad_norm": 0.17657705108779342, "learning_rate": 7.435312734593858e-10, "loss": 0.2395, "step": 1212 }, { "epoch": 4.986639260020555, "grad_norm": 0.16972832488616352, "learning_rate": 3.304594812991724e-10, "loss": 0.2413, "step": 1213 }, { "epoch": 4.990750256937307, "grad_norm": 0.16507566272358404, "learning_rate": 8.261504095496976e-11, "loss": 0.2336, "step": 1214 }, { "epoch": 4.99486125385406, "grad_norm": 0.17100744078272323, "learning_rate": 0.0, "loss": 0.2357, "step": 1215 }, { "epoch": 4.99486125385406, "step": 1215, "total_flos": 4.757804886857613e+18, "train_loss": 0.34405366748939326, "train_runtime": 28473.715, "train_samples_per_second": 5.463, "train_steps_per_second": 0.043 } ], "logging_steps": 1, "max_steps": 1215, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 4.757804886857613e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }