diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,8547 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 4.99486125385406, + "eval_steps": 500, + "global_step": 1215, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0041109969167523125, + "grad_norm": 6.356808310456779, + "learning_rate": 3.278688524590164e-07, + "loss": 0.82, + "step": 1 + }, + { + "epoch": 0.008221993833504625, + "grad_norm": 6.417518537271285, + "learning_rate": 6.557377049180328e-07, + "loss": 0.8218, + "step": 2 + }, + { + "epoch": 0.012332990750256937, + "grad_norm": 6.266225501769135, + "learning_rate": 9.836065573770493e-07, + "loss": 0.7983, + "step": 3 + }, + { + "epoch": 0.01644398766700925, + "grad_norm": 6.059283773388311, + "learning_rate": 1.3114754098360657e-06, + "loss": 0.7971, + "step": 4 + }, + { + "epoch": 0.020554984583761562, + "grad_norm": 5.8622860351950585, + "learning_rate": 1.6393442622950819e-06, + "loss": 0.811, + "step": 5 + }, + { + "epoch": 0.024665981500513873, + "grad_norm": 5.599946959024431, + "learning_rate": 1.9672131147540985e-06, + "loss": 0.7955, + "step": 6 + }, + { + "epoch": 0.02877697841726619, + "grad_norm": 4.404619948223283, + "learning_rate": 2.295081967213115e-06, + "loss": 0.7786, + "step": 7 + }, + { + "epoch": 0.0328879753340185, + "grad_norm": 3.922409083964933, + "learning_rate": 2.6229508196721314e-06, + "loss": 0.7401, + "step": 8 + }, + { + "epoch": 0.03699897225077081, + "grad_norm": 2.2688697860924765, + "learning_rate": 2.9508196721311478e-06, + "loss": 0.7309, + "step": 9 + }, + { + "epoch": 0.041109969167523124, + "grad_norm": 2.066008891071192, + "learning_rate": 3.2786885245901638e-06, + "loss": 0.719, + "step": 10 + }, + { + "epoch": 0.045220966084275435, + "grad_norm": 1.9201877271634715, + "learning_rate": 3.6065573770491806e-06, + "loss": 0.7258, + "step": 11 + }, + { + "epoch": 0.04933196300102775, + "grad_norm": 3.898309985774545, + "learning_rate": 3.934426229508197e-06, + "loss": 0.7205, + "step": 12 + }, + { + "epoch": 0.05344295991778006, + "grad_norm": 4.0353502189938855, + "learning_rate": 4.2622950819672135e-06, + "loss": 0.7192, + "step": 13 + }, + { + "epoch": 0.05755395683453238, + "grad_norm": 4.121703890013303, + "learning_rate": 4.59016393442623e-06, + "loss": 0.7244, + "step": 14 + }, + { + "epoch": 0.06166495375128469, + "grad_norm": 3.928913042484364, + "learning_rate": 4.918032786885246e-06, + "loss": 0.6995, + "step": 15 + }, + { + "epoch": 0.065775950668037, + "grad_norm": 3.142521725483005, + "learning_rate": 5.245901639344263e-06, + "loss": 0.6643, + "step": 16 + }, + { + "epoch": 0.0698869475847893, + "grad_norm": 2.847209107159321, + "learning_rate": 5.573770491803278e-06, + "loss": 0.6531, + "step": 17 + }, + { + "epoch": 0.07399794450154162, + "grad_norm": 2.2003209693474126, + "learning_rate": 5.9016393442622956e-06, + "loss": 0.66, + "step": 18 + }, + { + "epoch": 0.07810894141829394, + "grad_norm": 1.336519700469157, + "learning_rate": 6.229508196721312e-06, + "loss": 0.6164, + "step": 19 + }, + { + "epoch": 0.08221993833504625, + "grad_norm": 1.2400729726657767, + "learning_rate": 6.5573770491803276e-06, + "loss": 0.6071, + "step": 20 + }, + { + "epoch": 0.08633093525179857, + "grad_norm": 1.4350908797178215, + "learning_rate": 6.885245901639345e-06, + "loss": 0.6066, + "step": 21 + }, + { + "epoch": 0.09044193216855087, + "grad_norm": 1.4014758782073495, + "learning_rate": 7.213114754098361e-06, + "loss": 0.5995, + "step": 22 + }, + { + "epoch": 0.09455292908530319, + "grad_norm": 1.1237552991193, + "learning_rate": 7.540983606557377e-06, + "loss": 0.5895, + "step": 23 + }, + { + "epoch": 0.0986639260020555, + "grad_norm": 0.8497080531691873, + "learning_rate": 7.868852459016394e-06, + "loss": 0.588, + "step": 24 + }, + { + "epoch": 0.10277492291880781, + "grad_norm": 0.9384127005244138, + "learning_rate": 8.19672131147541e-06, + "loss": 0.5767, + "step": 25 + }, + { + "epoch": 0.10688591983556012, + "grad_norm": 0.8190287043666049, + "learning_rate": 8.524590163934427e-06, + "loss": 0.5698, + "step": 26 + }, + { + "epoch": 0.11099691675231244, + "grad_norm": 0.6808839486547595, + "learning_rate": 8.852459016393443e-06, + "loss": 0.5795, + "step": 27 + }, + { + "epoch": 0.11510791366906475, + "grad_norm": 0.7939592915072008, + "learning_rate": 9.18032786885246e-06, + "loss": 0.5642, + "step": 28 + }, + { + "epoch": 0.11921891058581706, + "grad_norm": 0.7673957275771759, + "learning_rate": 9.508196721311476e-06, + "loss": 0.5505, + "step": 29 + }, + { + "epoch": 0.12332990750256938, + "grad_norm": 0.542256505411903, + "learning_rate": 9.836065573770493e-06, + "loss": 0.5525, + "step": 30 + }, + { + "epoch": 0.12744090441932168, + "grad_norm": 0.5835090756188929, + "learning_rate": 1.0163934426229509e-05, + "loss": 0.5394, + "step": 31 + }, + { + "epoch": 0.131551901336074, + "grad_norm": 2.7254227477748034, + "learning_rate": 1.0491803278688525e-05, + "loss": 0.587, + "step": 32 + }, + { + "epoch": 0.13566289825282632, + "grad_norm": 0.869479260973271, + "learning_rate": 1.0819672131147544e-05, + "loss": 0.5492, + "step": 33 + }, + { + "epoch": 0.1397738951695786, + "grad_norm": 0.5177682668145469, + "learning_rate": 1.1147540983606557e-05, + "loss": 0.5325, + "step": 34 + }, + { + "epoch": 0.14388489208633093, + "grad_norm": 0.511981609333191, + "learning_rate": 1.1475409836065575e-05, + "loss": 0.5486, + "step": 35 + }, + { + "epoch": 0.14799588900308325, + "grad_norm": 0.5836498681132752, + "learning_rate": 1.1803278688524591e-05, + "loss": 0.5391, + "step": 36 + }, + { + "epoch": 0.15210688591983557, + "grad_norm": 0.6259670640713604, + "learning_rate": 1.2131147540983608e-05, + "loss": 0.5342, + "step": 37 + }, + { + "epoch": 0.15621788283658788, + "grad_norm": 0.4863673175391185, + "learning_rate": 1.2459016393442624e-05, + "loss": 0.5202, + "step": 38 + }, + { + "epoch": 0.16032887975334018, + "grad_norm": 0.46984524173771686, + "learning_rate": 1.2786885245901642e-05, + "loss": 0.5275, + "step": 39 + }, + { + "epoch": 0.1644398766700925, + "grad_norm": 0.4373961614640998, + "learning_rate": 1.3114754098360655e-05, + "loss": 0.5259, + "step": 40 + }, + { + "epoch": 0.1685508735868448, + "grad_norm": 0.5382826043776142, + "learning_rate": 1.3442622950819673e-05, + "loss": 0.5289, + "step": 41 + }, + { + "epoch": 0.17266187050359713, + "grad_norm": 0.49858273312303336, + "learning_rate": 1.377049180327869e-05, + "loss": 0.5034, + "step": 42 + }, + { + "epoch": 0.17677286742034942, + "grad_norm": 0.4565898955129958, + "learning_rate": 1.4098360655737706e-05, + "loss": 0.4992, + "step": 43 + }, + { + "epoch": 0.18088386433710174, + "grad_norm": 0.4250478862858475, + "learning_rate": 1.4426229508196722e-05, + "loss": 0.5099, + "step": 44 + }, + { + "epoch": 0.18499486125385406, + "grad_norm": 0.5450828551582885, + "learning_rate": 1.4754098360655739e-05, + "loss": 0.5133, + "step": 45 + }, + { + "epoch": 0.18910585817060638, + "grad_norm": 0.5082095741416903, + "learning_rate": 1.5081967213114754e-05, + "loss": 0.507, + "step": 46 + }, + { + "epoch": 0.1932168550873587, + "grad_norm": 0.399668921358076, + "learning_rate": 1.5409836065573772e-05, + "loss": 0.499, + "step": 47 + }, + { + "epoch": 0.197327852004111, + "grad_norm": 0.5857374299092792, + "learning_rate": 1.5737704918032788e-05, + "loss": 0.5182, + "step": 48 + }, + { + "epoch": 0.2014388489208633, + "grad_norm": 0.4337541604951673, + "learning_rate": 1.6065573770491805e-05, + "loss": 0.5048, + "step": 49 + }, + { + "epoch": 0.20554984583761562, + "grad_norm": 0.675490041268254, + "learning_rate": 1.639344262295082e-05, + "loss": 0.5091, + "step": 50 + }, + { + "epoch": 0.20966084275436794, + "grad_norm": 0.44682409800475936, + "learning_rate": 1.6721311475409837e-05, + "loss": 0.4948, + "step": 51 + }, + { + "epoch": 0.21377183967112023, + "grad_norm": 0.5243379991172152, + "learning_rate": 1.7049180327868854e-05, + "loss": 0.4959, + "step": 52 + }, + { + "epoch": 0.21788283658787255, + "grad_norm": 0.48058870125487607, + "learning_rate": 1.737704918032787e-05, + "loss": 0.493, + "step": 53 + }, + { + "epoch": 0.22199383350462487, + "grad_norm": 0.4009755414381969, + "learning_rate": 1.7704918032786887e-05, + "loss": 0.5005, + "step": 54 + }, + { + "epoch": 0.2261048304213772, + "grad_norm": 0.4487072583979547, + "learning_rate": 1.8032786885245903e-05, + "loss": 0.5, + "step": 55 + }, + { + "epoch": 0.2302158273381295, + "grad_norm": 0.43431903464010596, + "learning_rate": 1.836065573770492e-05, + "loss": 0.4822, + "step": 56 + }, + { + "epoch": 0.2343268242548818, + "grad_norm": 0.4223425144399419, + "learning_rate": 1.8688524590163936e-05, + "loss": 0.4884, + "step": 57 + }, + { + "epoch": 0.23843782117163412, + "grad_norm": 0.40422238831771906, + "learning_rate": 1.9016393442622952e-05, + "loss": 0.5064, + "step": 58 + }, + { + "epoch": 0.24254881808838644, + "grad_norm": 0.4353031683109967, + "learning_rate": 1.934426229508197e-05, + "loss": 0.4844, + "step": 59 + }, + { + "epoch": 0.24665981500513876, + "grad_norm": 0.5063299442881862, + "learning_rate": 1.9672131147540985e-05, + "loss": 0.4871, + "step": 60 + }, + { + "epoch": 0.25077081192189105, + "grad_norm": 0.6405429501414496, + "learning_rate": 2e-05, + "loss": 0.4916, + "step": 61 + }, + { + "epoch": 0.25488180883864336, + "grad_norm": 0.7398107392403913, + "learning_rate": 2.0327868852459018e-05, + "loss": 0.4967, + "step": 62 + }, + { + "epoch": 0.2589928057553957, + "grad_norm": 0.6066259496387154, + "learning_rate": 2.0655737704918034e-05, + "loss": 0.4933, + "step": 63 + }, + { + "epoch": 0.263103802672148, + "grad_norm": 0.6888615660905145, + "learning_rate": 2.098360655737705e-05, + "loss": 0.4849, + "step": 64 + }, + { + "epoch": 0.2672147995889003, + "grad_norm": 0.6046305786161926, + "learning_rate": 2.1311475409836067e-05, + "loss": 0.4997, + "step": 65 + }, + { + "epoch": 0.27132579650565264, + "grad_norm": 0.4755750596713722, + "learning_rate": 2.1639344262295087e-05, + "loss": 0.484, + "step": 66 + }, + { + "epoch": 0.27543679342240496, + "grad_norm": 0.4901884477105443, + "learning_rate": 2.1967213114754104e-05, + "loss": 0.4714, + "step": 67 + }, + { + "epoch": 0.2795477903391572, + "grad_norm": 0.5180862601664822, + "learning_rate": 2.2295081967213113e-05, + "loss": 0.4743, + "step": 68 + }, + { + "epoch": 0.28365878725590954, + "grad_norm": 0.6341799796360953, + "learning_rate": 2.2622950819672133e-05, + "loss": 0.4837, + "step": 69 + }, + { + "epoch": 0.28776978417266186, + "grad_norm": 0.7050713511862262, + "learning_rate": 2.295081967213115e-05, + "loss": 0.4732, + "step": 70 + }, + { + "epoch": 0.2918807810894142, + "grad_norm": 0.46520327730925665, + "learning_rate": 2.3278688524590166e-05, + "loss": 0.4763, + "step": 71 + }, + { + "epoch": 0.2959917780061665, + "grad_norm": 0.46570649065351716, + "learning_rate": 2.3606557377049182e-05, + "loss": 0.4729, + "step": 72 + }, + { + "epoch": 0.3001027749229188, + "grad_norm": 0.5435122355995184, + "learning_rate": 2.39344262295082e-05, + "loss": 0.4673, + "step": 73 + }, + { + "epoch": 0.30421377183967113, + "grad_norm": 0.7024832057525984, + "learning_rate": 2.4262295081967215e-05, + "loss": 0.4685, + "step": 74 + }, + { + "epoch": 0.30832476875642345, + "grad_norm": 0.5982496336902186, + "learning_rate": 2.459016393442623e-05, + "loss": 0.4683, + "step": 75 + }, + { + "epoch": 0.31243576567317577, + "grad_norm": 0.5579092038957036, + "learning_rate": 2.4918032786885248e-05, + "loss": 0.4818, + "step": 76 + }, + { + "epoch": 0.31654676258992803, + "grad_norm": 0.75454502368708, + "learning_rate": 2.5245901639344264e-05, + "loss": 0.4745, + "step": 77 + }, + { + "epoch": 0.32065775950668035, + "grad_norm": 0.9103711158770255, + "learning_rate": 2.5573770491803284e-05, + "loss": 0.4732, + "step": 78 + }, + { + "epoch": 0.32476875642343267, + "grad_norm": 0.7230510862281725, + "learning_rate": 2.59016393442623e-05, + "loss": 0.4726, + "step": 79 + }, + { + "epoch": 0.328879753340185, + "grad_norm": 0.661725190625586, + "learning_rate": 2.622950819672131e-05, + "loss": 0.4691, + "step": 80 + }, + { + "epoch": 0.3329907502569373, + "grad_norm": 1.3040040424420736, + "learning_rate": 2.655737704918033e-05, + "loss": 0.4596, + "step": 81 + }, + { + "epoch": 0.3371017471736896, + "grad_norm": 0.9247546415389841, + "learning_rate": 2.6885245901639346e-05, + "loss": 0.4687, + "step": 82 + }, + { + "epoch": 0.34121274409044194, + "grad_norm": 0.6690753031478268, + "learning_rate": 2.7213114754098363e-05, + "loss": 0.475, + "step": 83 + }, + { + "epoch": 0.34532374100719426, + "grad_norm": 0.8875577066120585, + "learning_rate": 2.754098360655738e-05, + "loss": 0.4779, + "step": 84 + }, + { + "epoch": 0.3494347379239466, + "grad_norm": 0.9913820671901682, + "learning_rate": 2.7868852459016396e-05, + "loss": 0.4585, + "step": 85 + }, + { + "epoch": 0.35354573484069884, + "grad_norm": 1.0406026178086218, + "learning_rate": 2.8196721311475412e-05, + "loss": 0.4777, + "step": 86 + }, + { + "epoch": 0.35765673175745116, + "grad_norm": 0.8568856680996076, + "learning_rate": 2.852459016393443e-05, + "loss": 0.476, + "step": 87 + }, + { + "epoch": 0.3617677286742035, + "grad_norm": 0.8578450824032388, + "learning_rate": 2.8852459016393445e-05, + "loss": 0.477, + "step": 88 + }, + { + "epoch": 0.3658787255909558, + "grad_norm": 1.0997371612060205, + "learning_rate": 2.918032786885246e-05, + "loss": 0.4694, + "step": 89 + }, + { + "epoch": 0.3699897225077081, + "grad_norm": 0.6710185323141514, + "learning_rate": 2.9508196721311478e-05, + "loss": 0.4664, + "step": 90 + }, + { + "epoch": 0.37410071942446044, + "grad_norm": 0.8753359919001613, + "learning_rate": 2.9836065573770498e-05, + "loss": 0.4675, + "step": 91 + }, + { + "epoch": 0.37821171634121276, + "grad_norm": 0.9060750756011728, + "learning_rate": 3.0163934426229507e-05, + "loss": 0.4577, + "step": 92 + }, + { + "epoch": 0.3823227132579651, + "grad_norm": 0.8327145117934229, + "learning_rate": 3.0491803278688527e-05, + "loss": 0.4798, + "step": 93 + }, + { + "epoch": 0.3864337101747174, + "grad_norm": 0.9953249905867948, + "learning_rate": 3.0819672131147544e-05, + "loss": 0.4588, + "step": 94 + }, + { + "epoch": 0.39054470709146966, + "grad_norm": 0.790890207442512, + "learning_rate": 3.1147540983606557e-05, + "loss": 0.4813, + "step": 95 + }, + { + "epoch": 0.394655704008222, + "grad_norm": 0.7071469844879325, + "learning_rate": 3.1475409836065576e-05, + "loss": 0.4715, + "step": 96 + }, + { + "epoch": 0.3987667009249743, + "grad_norm": 0.683447139315226, + "learning_rate": 3.180327868852459e-05, + "loss": 0.4568, + "step": 97 + }, + { + "epoch": 0.4028776978417266, + "grad_norm": 0.6863994738211686, + "learning_rate": 3.213114754098361e-05, + "loss": 0.4516, + "step": 98 + }, + { + "epoch": 0.40698869475847893, + "grad_norm": 0.6443321732944037, + "learning_rate": 3.245901639344263e-05, + "loss": 0.4467, + "step": 99 + }, + { + "epoch": 0.41109969167523125, + "grad_norm": 0.6015090752114448, + "learning_rate": 3.278688524590164e-05, + "loss": 0.4566, + "step": 100 + }, + { + "epoch": 0.41521068859198357, + "grad_norm": 0.648925234921687, + "learning_rate": 3.311475409836066e-05, + "loss": 0.4598, + "step": 101 + }, + { + "epoch": 0.4193216855087359, + "grad_norm": 0.5740497039935356, + "learning_rate": 3.3442622950819675e-05, + "loss": 0.4514, + "step": 102 + }, + { + "epoch": 0.4234326824254882, + "grad_norm": 0.7433508320080534, + "learning_rate": 3.3770491803278695e-05, + "loss": 0.4555, + "step": 103 + }, + { + "epoch": 0.42754367934224047, + "grad_norm": 0.9786371138605869, + "learning_rate": 3.409836065573771e-05, + "loss": 0.4724, + "step": 104 + }, + { + "epoch": 0.4316546762589928, + "grad_norm": 1.16381322551552, + "learning_rate": 3.442622950819672e-05, + "loss": 0.4665, + "step": 105 + }, + { + "epoch": 0.4357656731757451, + "grad_norm": 0.7033574666436274, + "learning_rate": 3.475409836065574e-05, + "loss": 0.4741, + "step": 106 + }, + { + "epoch": 0.4398766700924974, + "grad_norm": 1.256476593209221, + "learning_rate": 3.5081967213114754e-05, + "loss": 0.476, + "step": 107 + }, + { + "epoch": 0.44398766700924974, + "grad_norm": 0.5933957475473355, + "learning_rate": 3.5409836065573773e-05, + "loss": 0.4653, + "step": 108 + }, + { + "epoch": 0.44809866392600206, + "grad_norm": 1.025564753787377, + "learning_rate": 3.5737704918032786e-05, + "loss": 0.47, + "step": 109 + }, + { + "epoch": 0.4522096608427544, + "grad_norm": 1.0088674998209484, + "learning_rate": 3.6065573770491806e-05, + "loss": 0.4681, + "step": 110 + }, + { + "epoch": 0.4563206577595067, + "grad_norm": 0.9216004942062503, + "learning_rate": 3.6393442622950826e-05, + "loss": 0.4546, + "step": 111 + }, + { + "epoch": 0.460431654676259, + "grad_norm": 1.1054709646558805, + "learning_rate": 3.672131147540984e-05, + "loss": 0.4669, + "step": 112 + }, + { + "epoch": 0.4645426515930113, + "grad_norm": 0.6642218594282759, + "learning_rate": 3.704918032786886e-05, + "loss": 0.4533, + "step": 113 + }, + { + "epoch": 0.4686536485097636, + "grad_norm": 0.8356269646157981, + "learning_rate": 3.737704918032787e-05, + "loss": 0.4599, + "step": 114 + }, + { + "epoch": 0.4727646454265159, + "grad_norm": 1.1650429141300205, + "learning_rate": 3.770491803278689e-05, + "loss": 0.448, + "step": 115 + }, + { + "epoch": 0.47687564234326824, + "grad_norm": 0.6212175962293394, + "learning_rate": 3.8032786885245905e-05, + "loss": 0.4638, + "step": 116 + }, + { + "epoch": 0.48098663926002055, + "grad_norm": 1.1965895951813037, + "learning_rate": 3.836065573770492e-05, + "loss": 0.4619, + "step": 117 + }, + { + "epoch": 0.4850976361767729, + "grad_norm": 0.8457976781943612, + "learning_rate": 3.868852459016394e-05, + "loss": 0.4555, + "step": 118 + }, + { + "epoch": 0.4892086330935252, + "grad_norm": 0.8463559301031214, + "learning_rate": 3.901639344262295e-05, + "loss": 0.4713, + "step": 119 + }, + { + "epoch": 0.4933196300102775, + "grad_norm": 0.6654609587793014, + "learning_rate": 3.934426229508197e-05, + "loss": 0.4461, + "step": 120 + }, + { + "epoch": 0.49743062692702983, + "grad_norm": 0.7698090467763701, + "learning_rate": 3.9672131147540983e-05, + "loss": 0.4627, + "step": 121 + }, + { + "epoch": 0.5015416238437821, + "grad_norm": 0.5716155461137187, + "learning_rate": 4e-05, + "loss": 0.4576, + "step": 122 + }, + { + "epoch": 0.5056526207605344, + "grad_norm": 0.5151376433722467, + "learning_rate": 3.999991738495905e-05, + "loss": 0.4485, + "step": 123 + }, + { + "epoch": 0.5097636176772867, + "grad_norm": 0.561427237450996, + "learning_rate": 3.9999669540518704e-05, + "loss": 0.454, + "step": 124 + }, + { + "epoch": 0.513874614594039, + "grad_norm": 0.6553799163893537, + "learning_rate": 3.999925646872655e-05, + "loss": 0.4523, + "step": 125 + }, + { + "epoch": 0.5179856115107914, + "grad_norm": 0.7909652053854684, + "learning_rate": 3.9998678172995157e-05, + "loss": 0.4544, + "step": 126 + }, + { + "epoch": 0.5220966084275437, + "grad_norm": 0.8388721187199466, + "learning_rate": 3.999793465810214e-05, + "loss": 0.4408, + "step": 127 + }, + { + "epoch": 0.526207605344296, + "grad_norm": 0.7373151231076792, + "learning_rate": 3.999702593019004e-05, + "loss": 0.4596, + "step": 128 + }, + { + "epoch": 0.5303186022610483, + "grad_norm": 0.9546826007376602, + "learning_rate": 3.9995951996766316e-05, + "loss": 0.459, + "step": 129 + }, + { + "epoch": 0.5344295991778006, + "grad_norm": 0.9027549638128062, + "learning_rate": 3.999471286670328e-05, + "loss": 0.4537, + "step": 130 + }, + { + "epoch": 0.538540596094553, + "grad_norm": 1.0477189023005884, + "learning_rate": 3.9993308550238e-05, + "loss": 0.4455, + "step": 131 + }, + { + "epoch": 0.5426515930113053, + "grad_norm": 1.1744648733550076, + "learning_rate": 3.999173905897226e-05, + "loss": 0.4579, + "step": 132 + }, + { + "epoch": 0.5467625899280576, + "grad_norm": 0.7693181453420259, + "learning_rate": 3.99900044058724e-05, + "loss": 0.4381, + "step": 133 + }, + { + "epoch": 0.5508735868448099, + "grad_norm": 0.8932998819929917, + "learning_rate": 3.998810460526927e-05, + "loss": 0.4663, + "step": 134 + }, + { + "epoch": 0.5549845837615622, + "grad_norm": 1.1927529473551686, + "learning_rate": 3.998603967285808e-05, + "loss": 0.456, + "step": 135 + }, + { + "epoch": 0.5590955806783144, + "grad_norm": 0.7303627266940724, + "learning_rate": 3.998380962569828e-05, + "loss": 0.463, + "step": 136 + }, + { + "epoch": 0.5632065775950668, + "grad_norm": 1.211411577139644, + "learning_rate": 3.9981414482213405e-05, + "loss": 0.4649, + "step": 137 + }, + { + "epoch": 0.5673175745118191, + "grad_norm": 0.8811081756810052, + "learning_rate": 3.997885426219096e-05, + "loss": 0.4637, + "step": 138 + }, + { + "epoch": 0.5714285714285714, + "grad_norm": 0.8713984959209414, + "learning_rate": 3.99761289867822e-05, + "loss": 0.4546, + "step": 139 + }, + { + "epoch": 0.5755395683453237, + "grad_norm": 0.9788586712835476, + "learning_rate": 3.9973238678501996e-05, + "loss": 0.4475, + "step": 140 + }, + { + "epoch": 0.579650565262076, + "grad_norm": 0.7533682906447463, + "learning_rate": 3.997018336122866e-05, + "loss": 0.4428, + "step": 141 + }, + { + "epoch": 0.5837615621788284, + "grad_norm": 0.7461312001689725, + "learning_rate": 3.9966963060203684e-05, + "loss": 0.4494, + "step": 142 + }, + { + "epoch": 0.5878725590955807, + "grad_norm": 0.6174187621354268, + "learning_rate": 3.996357780203161e-05, + "loss": 0.4504, + "step": 143 + }, + { + "epoch": 0.591983556012333, + "grad_norm": 0.49003513931944637, + "learning_rate": 3.9960027614679766e-05, + "loss": 0.4427, + "step": 144 + }, + { + "epoch": 0.5960945529290853, + "grad_norm": 0.5605653135696967, + "learning_rate": 3.995631252747804e-05, + "loss": 0.448, + "step": 145 + }, + { + "epoch": 0.6002055498458376, + "grad_norm": 0.44488928569515496, + "learning_rate": 3.9952432571118634e-05, + "loss": 0.4467, + "step": 146 + }, + { + "epoch": 0.60431654676259, + "grad_norm": 0.5026250568866101, + "learning_rate": 3.994838777765582e-05, + "loss": 0.4477, + "step": 147 + }, + { + "epoch": 0.6084275436793423, + "grad_norm": 0.5172580934087974, + "learning_rate": 3.9944178180505685e-05, + "loss": 0.4523, + "step": 148 + }, + { + "epoch": 0.6125385405960946, + "grad_norm": 0.42653534399915305, + "learning_rate": 3.993980381444583e-05, + "loss": 0.4461, + "step": 149 + }, + { + "epoch": 0.6166495375128469, + "grad_norm": 0.5471101554510113, + "learning_rate": 3.993526471561509e-05, + "loss": 0.4434, + "step": 150 + }, + { + "epoch": 0.6207605344295992, + "grad_norm": 0.44006741961102114, + "learning_rate": 3.993056092151326e-05, + "loss": 0.4433, + "step": 151 + }, + { + "epoch": 0.6248715313463515, + "grad_norm": 0.54547408589461, + "learning_rate": 3.9925692471000755e-05, + "loss": 0.4512, + "step": 152 + }, + { + "epoch": 0.6289825282631039, + "grad_norm": 0.5069055596939723, + "learning_rate": 3.9920659404298285e-05, + "loss": 0.4407, + "step": 153 + }, + { + "epoch": 0.6330935251798561, + "grad_norm": 0.4361985691247893, + "learning_rate": 3.991546176298657e-05, + "loss": 0.443, + "step": 154 + }, + { + "epoch": 0.6372045220966084, + "grad_norm": 0.6492040966314581, + "learning_rate": 3.991009959000593e-05, + "loss": 0.4643, + "step": 155 + }, + { + "epoch": 0.6413155190133607, + "grad_norm": 0.662599709889411, + "learning_rate": 3.990457292965598e-05, + "loss": 0.4421, + "step": 156 + }, + { + "epoch": 0.645426515930113, + "grad_norm": 0.7496440633893157, + "learning_rate": 3.9898881827595255e-05, + "loss": 0.4475, + "step": 157 + }, + { + "epoch": 0.6495375128468653, + "grad_norm": 0.6565888966700488, + "learning_rate": 3.989302633084081e-05, + "loss": 0.4478, + "step": 158 + }, + { + "epoch": 0.6536485097636177, + "grad_norm": 0.4914282437943478, + "learning_rate": 3.988700648776786e-05, + "loss": 0.4485, + "step": 159 + }, + { + "epoch": 0.65775950668037, + "grad_norm": 0.5765645279846272, + "learning_rate": 3.9880822348109365e-05, + "loss": 0.4406, + "step": 160 + }, + { + "epoch": 0.6618705035971223, + "grad_norm": 0.54508458239329, + "learning_rate": 3.9874473962955625e-05, + "loss": 0.4377, + "step": 161 + }, + { + "epoch": 0.6659815005138746, + "grad_norm": 0.5467537029004771, + "learning_rate": 3.986796138475383e-05, + "loss": 0.4404, + "step": 162 + }, + { + "epoch": 0.6700924974306269, + "grad_norm": 0.5927993483965615, + "learning_rate": 3.986128466730769e-05, + "loss": 0.4397, + "step": 163 + }, + { + "epoch": 0.6742034943473793, + "grad_norm": 0.549384325928676, + "learning_rate": 3.985444386577693e-05, + "loss": 0.4447, + "step": 164 + }, + { + "epoch": 0.6783144912641316, + "grad_norm": 0.7853091838958088, + "learning_rate": 3.984743903667685e-05, + "loss": 0.4323, + "step": 165 + }, + { + "epoch": 0.6824254881808839, + "grad_norm": 0.7452346665717267, + "learning_rate": 3.984027023787789e-05, + "loss": 0.4418, + "step": 166 + }, + { + "epoch": 0.6865364850976362, + "grad_norm": 0.6354950658254404, + "learning_rate": 3.98329375286051e-05, + "loss": 0.4462, + "step": 167 + }, + { + "epoch": 0.6906474820143885, + "grad_norm": 0.6039082662120046, + "learning_rate": 3.982544096943769e-05, + "loss": 0.4387, + "step": 168 + }, + { + "epoch": 0.6947584789311408, + "grad_norm": 0.658037323729766, + "learning_rate": 3.9817780622308515e-05, + "loss": 0.4442, + "step": 169 + }, + { + "epoch": 0.6988694758478932, + "grad_norm": 0.4757882993447632, + "learning_rate": 3.980995655050356e-05, + "loss": 0.4432, + "step": 170 + }, + { + "epoch": 0.7029804727646455, + "grad_norm": 0.44532932025468364, + "learning_rate": 3.980196881866143e-05, + "loss": 0.4414, + "step": 171 + }, + { + "epoch": 0.7070914696813977, + "grad_norm": 0.45179948654666446, + "learning_rate": 3.9793817492772806e-05, + "loss": 0.4509, + "step": 172 + }, + { + "epoch": 0.71120246659815, + "grad_norm": 0.4699683428704349, + "learning_rate": 3.9785502640179905e-05, + "loss": 0.4278, + "step": 173 + }, + { + "epoch": 0.7153134635149023, + "grad_norm": 0.5201025709025265, + "learning_rate": 3.97770243295759e-05, + "loss": 0.4335, + "step": 174 + }, + { + "epoch": 0.7194244604316546, + "grad_norm": 0.47266696898911464, + "learning_rate": 3.9768382631004405e-05, + "loss": 0.4501, + "step": 175 + }, + { + "epoch": 0.723535457348407, + "grad_norm": 0.4422465802779614, + "learning_rate": 3.975957761585883e-05, + "loss": 0.446, + "step": 176 + }, + { + "epoch": 0.7276464542651593, + "grad_norm": 0.5391358485913682, + "learning_rate": 3.9750609356881865e-05, + "loss": 0.4512, + "step": 177 + }, + { + "epoch": 0.7317574511819116, + "grad_norm": 0.46555411315299916, + "learning_rate": 3.974147792816481e-05, + "loss": 0.4374, + "step": 178 + }, + { + "epoch": 0.7358684480986639, + "grad_norm": 0.5347959929577083, + "learning_rate": 3.9732183405146984e-05, + "loss": 0.4368, + "step": 179 + }, + { + "epoch": 0.7399794450154162, + "grad_norm": 0.5543063250157177, + "learning_rate": 3.9722725864615156e-05, + "loss": 0.4468, + "step": 180 + }, + { + "epoch": 0.7440904419321686, + "grad_norm": 0.39521704775907723, + "learning_rate": 3.971310538470282e-05, + "loss": 0.4338, + "step": 181 + }, + { + "epoch": 0.7482014388489209, + "grad_norm": 0.47237629534672426, + "learning_rate": 3.9703322044889605e-05, + "loss": 0.4369, + "step": 182 + }, + { + "epoch": 0.7523124357656732, + "grad_norm": 0.434146415819749, + "learning_rate": 3.969337592600062e-05, + "loss": 0.4458, + "step": 183 + }, + { + "epoch": 0.7564234326824255, + "grad_norm": 0.38836391572812273, + "learning_rate": 3.968326711020578e-05, + "loss": 0.4546, + "step": 184 + }, + { + "epoch": 0.7605344295991778, + "grad_norm": 0.34969919974995534, + "learning_rate": 3.967299568101908e-05, + "loss": 0.4459, + "step": 185 + }, + { + "epoch": 0.7646454265159301, + "grad_norm": 0.41064464728289324, + "learning_rate": 3.9662561723298e-05, + "loss": 0.4326, + "step": 186 + }, + { + "epoch": 0.7687564234326825, + "grad_norm": 0.4616126051202659, + "learning_rate": 3.9651965323242704e-05, + "loss": 0.4492, + "step": 187 + }, + { + "epoch": 0.7728674203494348, + "grad_norm": 0.49195669527847435, + "learning_rate": 3.964120656839541e-05, + "loss": 0.4276, + "step": 188 + }, + { + "epoch": 0.7769784172661871, + "grad_norm": 0.3807633073682157, + "learning_rate": 3.963028554763961e-05, + "loss": 0.4428, + "step": 189 + }, + { + "epoch": 0.7810894141829393, + "grad_norm": 0.3811255626131261, + "learning_rate": 3.9619202351199356e-05, + "loss": 0.4337, + "step": 190 + }, + { + "epoch": 0.7852004110996916, + "grad_norm": 0.3612639948436137, + "learning_rate": 3.960795707063852e-05, + "loss": 0.4363, + "step": 191 + }, + { + "epoch": 0.789311408016444, + "grad_norm": 0.4353027404982674, + "learning_rate": 3.959654979886005e-05, + "loss": 0.4365, + "step": 192 + }, + { + "epoch": 0.7934224049331963, + "grad_norm": 0.37923924344854587, + "learning_rate": 3.958498063010516e-05, + "loss": 0.4277, + "step": 193 + }, + { + "epoch": 0.7975334018499486, + "grad_norm": 0.49016416134919827, + "learning_rate": 3.957324965995257e-05, + "loss": 0.4189, + "step": 194 + }, + { + "epoch": 0.8016443987667009, + "grad_norm": 0.3808642318097945, + "learning_rate": 3.956135698531777e-05, + "loss": 0.428, + "step": 195 + }, + { + "epoch": 0.8057553956834532, + "grad_norm": 0.4706420424359872, + "learning_rate": 3.9549302704452104e-05, + "loss": 0.4355, + "step": 196 + }, + { + "epoch": 0.8098663926002055, + "grad_norm": 0.5558683512038307, + "learning_rate": 3.953708691694208e-05, + "loss": 0.4219, + "step": 197 + }, + { + "epoch": 0.8139773895169579, + "grad_norm": 0.5188467581658631, + "learning_rate": 3.952470972370848e-05, + "loss": 0.4369, + "step": 198 + }, + { + "epoch": 0.8180883864337102, + "grad_norm": 0.4485136574531589, + "learning_rate": 3.951217122700554e-05, + "loss": 0.4206, + "step": 199 + }, + { + "epoch": 0.8221993833504625, + "grad_norm": 0.4872982826961068, + "learning_rate": 3.9499471530420086e-05, + "loss": 0.4434, + "step": 200 + }, + { + "epoch": 0.8263103802672148, + "grad_norm": 0.5704413227159343, + "learning_rate": 3.9486610738870726e-05, + "loss": 0.4332, + "step": 201 + }, + { + "epoch": 0.8304213771839671, + "grad_norm": 0.6576571504037381, + "learning_rate": 3.947358895860693e-05, + "loss": 0.4282, + "step": 202 + }, + { + "epoch": 0.8345323741007195, + "grad_norm": 0.5236083635603117, + "learning_rate": 3.9460406297208204e-05, + "loss": 0.4418, + "step": 203 + }, + { + "epoch": 0.8386433710174718, + "grad_norm": 0.4856398721711883, + "learning_rate": 3.944706286358315e-05, + "loss": 0.4446, + "step": 204 + }, + { + "epoch": 0.8427543679342241, + "grad_norm": 0.553946219409764, + "learning_rate": 3.94335587679686e-05, + "loss": 0.4421, + "step": 205 + }, + { + "epoch": 0.8468653648509764, + "grad_norm": 0.559411380541318, + "learning_rate": 3.94198941219287e-05, + "loss": 0.4628, + "step": 206 + }, + { + "epoch": 0.8509763617677287, + "grad_norm": 0.4879763317857753, + "learning_rate": 3.940606903835398e-05, + "loss": 0.442, + "step": 207 + }, + { + "epoch": 0.8550873586844809, + "grad_norm": 0.5054384831570833, + "learning_rate": 3.939208363146041e-05, + "loss": 0.4262, + "step": 208 + }, + { + "epoch": 0.8591983556012333, + "grad_norm": 0.5553954849786898, + "learning_rate": 3.937793801678851e-05, + "loss": 0.427, + "step": 209 + }, + { + "epoch": 0.8633093525179856, + "grad_norm": 0.5872415310529557, + "learning_rate": 3.936363231120231e-05, + "loss": 0.4413, + "step": 210 + }, + { + "epoch": 0.8674203494347379, + "grad_norm": 0.5889656491169154, + "learning_rate": 3.934916663288847e-05, + "loss": 0.4374, + "step": 211 + }, + { + "epoch": 0.8715313463514902, + "grad_norm": 0.5289928076892064, + "learning_rate": 3.9334541101355244e-05, + "loss": 0.4393, + "step": 212 + }, + { + "epoch": 0.8756423432682425, + "grad_norm": 0.5133117516646354, + "learning_rate": 3.931975583743152e-05, + "loss": 0.4207, + "step": 213 + }, + { + "epoch": 0.8797533401849948, + "grad_norm": 0.48037331045870174, + "learning_rate": 3.930481096326583e-05, + "loss": 0.4175, + "step": 214 + }, + { + "epoch": 0.8838643371017472, + "grad_norm": 0.5410217736097758, + "learning_rate": 3.92897066023253e-05, + "loss": 0.431, + "step": 215 + }, + { + "epoch": 0.8879753340184995, + "grad_norm": 0.41649001377169803, + "learning_rate": 3.927444287939467e-05, + "loss": 0.4484, + "step": 216 + }, + { + "epoch": 0.8920863309352518, + "grad_norm": 0.45628332224884727, + "learning_rate": 3.925901992057525e-05, + "loss": 0.4305, + "step": 217 + }, + { + "epoch": 0.8961973278520041, + "grad_norm": 0.5227314578776049, + "learning_rate": 3.924343785328388e-05, + "loss": 0.4393, + "step": 218 + }, + { + "epoch": 0.9003083247687564, + "grad_norm": 0.4530459458277021, + "learning_rate": 3.9227696806251875e-05, + "loss": 0.4382, + "step": 219 + }, + { + "epoch": 0.9044193216855088, + "grad_norm": 0.4488315318208515, + "learning_rate": 3.9211796909523953e-05, + "loss": 0.4209, + "step": 220 + }, + { + "epoch": 0.9085303186022611, + "grad_norm": 0.4369045769060924, + "learning_rate": 3.9195738294457186e-05, + "loss": 0.4357, + "step": 221 + }, + { + "epoch": 0.9126413155190134, + "grad_norm": 0.3980678441937295, + "learning_rate": 3.9179521093719876e-05, + "loss": 0.4142, + "step": 222 + }, + { + "epoch": 0.9167523124357657, + "grad_norm": 0.5003747978502763, + "learning_rate": 3.91631454412905e-05, + "loss": 0.4484, + "step": 223 + }, + { + "epoch": 0.920863309352518, + "grad_norm": 0.43942976248272747, + "learning_rate": 3.914661147245657e-05, + "loss": 0.434, + "step": 224 + }, + { + "epoch": 0.9249743062692704, + "grad_norm": 0.4174753367400882, + "learning_rate": 3.912991932381355e-05, + "loss": 0.4282, + "step": 225 + }, + { + "epoch": 0.9290853031860226, + "grad_norm": 0.36920457252907907, + "learning_rate": 3.91130691332637e-05, + "loss": 0.4347, + "step": 226 + }, + { + "epoch": 0.9331963001027749, + "grad_norm": 0.45392411540078437, + "learning_rate": 3.9096061040014914e-05, + "loss": 0.4135, + "step": 227 + }, + { + "epoch": 0.9373072970195272, + "grad_norm": 0.4203872157822759, + "learning_rate": 3.907889518457964e-05, + "loss": 0.4422, + "step": 228 + }, + { + "epoch": 0.9414182939362795, + "grad_norm": 0.391547280290097, + "learning_rate": 3.9061571708773656e-05, + "loss": 0.428, + "step": 229 + }, + { + "epoch": 0.9455292908530318, + "grad_norm": 0.5746907556468481, + "learning_rate": 3.9044090755714935e-05, + "loss": 0.4273, + "step": 230 + }, + { + "epoch": 0.9496402877697842, + "grad_norm": 0.5021218433821051, + "learning_rate": 3.9026452469822435e-05, + "loss": 0.4318, + "step": 231 + }, + { + "epoch": 0.9537512846865365, + "grad_norm": 0.5118619524543895, + "learning_rate": 3.900865699681494e-05, + "loss": 0.4565, + "step": 232 + }, + { + "epoch": 0.9578622816032888, + "grad_norm": 0.4269764449835691, + "learning_rate": 3.899070448370981e-05, + "loss": 0.4242, + "step": 233 + }, + { + "epoch": 0.9619732785200411, + "grad_norm": 0.544830400097823, + "learning_rate": 3.897259507882181e-05, + "loss": 0.4308, + "step": 234 + }, + { + "epoch": 0.9660842754367934, + "grad_norm": 0.5029148596149111, + "learning_rate": 3.895432893176186e-05, + "loss": 0.4283, + "step": 235 + }, + { + "epoch": 0.9701952723535457, + "grad_norm": 0.6013585416586662, + "learning_rate": 3.8935906193435814e-05, + "loss": 0.4231, + "step": 236 + }, + { + "epoch": 0.9743062692702981, + "grad_norm": 0.501625593569375, + "learning_rate": 3.89173270160432e-05, + "loss": 0.4335, + "step": 237 + }, + { + "epoch": 0.9784172661870504, + "grad_norm": 0.6586654174152249, + "learning_rate": 3.889859155307596e-05, + "loss": 0.4365, + "step": 238 + }, + { + "epoch": 0.9825282631038027, + "grad_norm": 0.6491985191825143, + "learning_rate": 3.8879699959317204e-05, + "loss": 0.428, + "step": 239 + }, + { + "epoch": 0.986639260020555, + "grad_norm": 0.36412537479982626, + "learning_rate": 3.8860652390839915e-05, + "loss": 0.4258, + "step": 240 + }, + { + "epoch": 0.9907502569373073, + "grad_norm": 0.562496507066076, + "learning_rate": 3.884144900500565e-05, + "loss": 0.4352, + "step": 241 + }, + { + "epoch": 0.9948612538540597, + "grad_norm": 0.4838097185277804, + "learning_rate": 3.882208996046327e-05, + "loss": 0.4422, + "step": 242 + }, + { + "epoch": 0.998972250770812, + "grad_norm": 0.4422917967441169, + "learning_rate": 3.880257541714759e-05, + "loss": 0.4273, + "step": 243 + }, + { + "epoch": 1.0030832476875642, + "grad_norm": 0.5334045773924255, + "learning_rate": 3.878290553627809e-05, + "loss": 0.3969, + "step": 244 + }, + { + "epoch": 1.0071942446043165, + "grad_norm": 0.5378794632121926, + "learning_rate": 3.876308048035758e-05, + "loss": 0.3903, + "step": 245 + }, + { + "epoch": 1.0113052415210688, + "grad_norm": 0.5944247982125659, + "learning_rate": 3.874310041317084e-05, + "loss": 0.3866, + "step": 246 + }, + { + "epoch": 1.0154162384378211, + "grad_norm": 0.5659631885785738, + "learning_rate": 3.8722965499783265e-05, + "loss": 0.3859, + "step": 247 + }, + { + "epoch": 1.0195272353545735, + "grad_norm": 0.6678922530928978, + "learning_rate": 3.8702675906539536e-05, + "loss": 0.3975, + "step": 248 + }, + { + "epoch": 1.0236382322713258, + "grad_norm": 0.6092071387321932, + "learning_rate": 3.868223180106221e-05, + "loss": 0.3805, + "step": 249 + }, + { + "epoch": 1.027749229188078, + "grad_norm": 0.48801873476109786, + "learning_rate": 3.866163335225034e-05, + "loss": 0.3924, + "step": 250 + }, + { + "epoch": 1.0318602261048304, + "grad_norm": 0.5338205820825612, + "learning_rate": 3.8640880730278105e-05, + "loss": 0.4015, + "step": 251 + }, + { + "epoch": 1.0359712230215827, + "grad_norm": 0.47770709705325853, + "learning_rate": 3.8619974106593365e-05, + "loss": 0.3979, + "step": 252 + }, + { + "epoch": 1.040082219938335, + "grad_norm": 0.6103179105115757, + "learning_rate": 3.859891365391628e-05, + "loss": 0.388, + "step": 253 + }, + { + "epoch": 1.0441932168550874, + "grad_norm": 0.5427245439232725, + "learning_rate": 3.8577699546237886e-05, + "loss": 0.3811, + "step": 254 + }, + { + "epoch": 1.0483042137718397, + "grad_norm": 0.532814479999278, + "learning_rate": 3.8556331958818596e-05, + "loss": 0.3872, + "step": 255 + }, + { + "epoch": 1.052415210688592, + "grad_norm": 0.5339131788688589, + "learning_rate": 3.853481106818683e-05, + "loss": 0.3914, + "step": 256 + }, + { + "epoch": 1.0565262076053443, + "grad_norm": 0.5060398381577083, + "learning_rate": 3.851313705213751e-05, + "loss": 0.3876, + "step": 257 + }, + { + "epoch": 1.0606372045220966, + "grad_norm": 0.562896010283109, + "learning_rate": 3.8491310089730614e-05, + "loss": 0.3946, + "step": 258 + }, + { + "epoch": 1.064748201438849, + "grad_norm": 0.40638744368816154, + "learning_rate": 3.846933036128968e-05, + "loss": 0.3809, + "step": 259 + }, + { + "epoch": 1.0688591983556013, + "grad_norm": 0.5821981657729004, + "learning_rate": 3.8447198048400325e-05, + "loss": 0.4041, + "step": 260 + }, + { + "epoch": 1.0729701952723536, + "grad_norm": 0.5613586250111681, + "learning_rate": 3.8424913333908744e-05, + "loss": 0.3834, + "step": 261 + }, + { + "epoch": 1.077081192189106, + "grad_norm": 0.5210705488884988, + "learning_rate": 3.840247640192019e-05, + "loss": 0.4053, + "step": 262 + }, + { + "epoch": 1.0811921891058582, + "grad_norm": 0.5223026076271566, + "learning_rate": 3.837988743779747e-05, + "loss": 0.4057, + "step": 263 + }, + { + "epoch": 1.0853031860226106, + "grad_norm": 0.40771864088972815, + "learning_rate": 3.8357146628159415e-05, + "loss": 0.3759, + "step": 264 + }, + { + "epoch": 1.0894141829393629, + "grad_norm": 0.5106166010616134, + "learning_rate": 3.8334254160879296e-05, + "loss": 0.3927, + "step": 265 + }, + { + "epoch": 1.0935251798561152, + "grad_norm": 0.39265167645057447, + "learning_rate": 3.8311210225083347e-05, + "loss": 0.3772, + "step": 266 + }, + { + "epoch": 1.0976361767728675, + "grad_norm": 0.5406659317819649, + "learning_rate": 3.8288015011149126e-05, + "loss": 0.3877, + "step": 267 + }, + { + "epoch": 1.1017471736896198, + "grad_norm": 0.4396566495996877, + "learning_rate": 3.826466871070399e-05, + "loss": 0.3919, + "step": 268 + }, + { + "epoch": 1.1058581706063721, + "grad_norm": 0.4071828889239751, + "learning_rate": 3.82411715166235e-05, + "loss": 0.3929, + "step": 269 + }, + { + "epoch": 1.1099691675231242, + "grad_norm": 0.4408302571199858, + "learning_rate": 3.821752362302982e-05, + "loss": 0.3984, + "step": 270 + }, + { + "epoch": 1.1140801644398768, + "grad_norm": 0.3944864874139757, + "learning_rate": 3.8193725225290105e-05, + "loss": 0.3791, + "step": 271 + }, + { + "epoch": 1.1181911613566289, + "grad_norm": 0.5086637552588018, + "learning_rate": 3.8169776520014935e-05, + "loss": 0.3981, + "step": 272 + }, + { + "epoch": 1.1223021582733812, + "grad_norm": 0.37495702811326503, + "learning_rate": 3.814567770505663e-05, + "loss": 0.399, + "step": 273 + }, + { + "epoch": 1.1264131551901335, + "grad_norm": 0.5606532197558952, + "learning_rate": 3.812142897950765e-05, + "loss": 0.3919, + "step": 274 + }, + { + "epoch": 1.1305241521068858, + "grad_norm": 0.5021460420776965, + "learning_rate": 3.809703054369893e-05, + "loss": 0.3884, + "step": 275 + }, + { + "epoch": 1.1346351490236382, + "grad_norm": 0.42349030253760284, + "learning_rate": 3.807248259919826e-05, + "loss": 0.3834, + "step": 276 + }, + { + "epoch": 1.1387461459403905, + "grad_norm": 0.4379650832741319, + "learning_rate": 3.804778534880858e-05, + "loss": 0.3907, + "step": 277 + }, + { + "epoch": 1.1428571428571428, + "grad_norm": 0.49289600771412606, + "learning_rate": 3.802293899656632e-05, + "loss": 0.3813, + "step": 278 + }, + { + "epoch": 1.1469681397738951, + "grad_norm": 0.42490353928620167, + "learning_rate": 3.7997943747739735e-05, + "loss": 0.3853, + "step": 279 + }, + { + "epoch": 1.1510791366906474, + "grad_norm": 0.518693269655709, + "learning_rate": 3.797279980882716e-05, + "loss": 0.3982, + "step": 280 + }, + { + "epoch": 1.1551901336073997, + "grad_norm": 0.43881434866880253, + "learning_rate": 3.794750738755536e-05, + "loss": 0.3926, + "step": 281 + }, + { + "epoch": 1.159301130524152, + "grad_norm": 0.43485781066948115, + "learning_rate": 3.792206669287776e-05, + "loss": 0.3922, + "step": 282 + }, + { + "epoch": 1.1634121274409044, + "grad_norm": 0.45793373109478, + "learning_rate": 3.789647793497279e-05, + "loss": 0.3949, + "step": 283 + }, + { + "epoch": 1.1675231243576567, + "grad_norm": 0.4344299254818994, + "learning_rate": 3.787074132524206e-05, + "loss": 0.3895, + "step": 284 + }, + { + "epoch": 1.171634121274409, + "grad_norm": 0.44080712302633035, + "learning_rate": 3.784485707630868e-05, + "loss": 0.3914, + "step": 285 + }, + { + "epoch": 1.1757451181911613, + "grad_norm": 0.37695184690701744, + "learning_rate": 3.781882540201547e-05, + "loss": 0.3875, + "step": 286 + }, + { + "epoch": 1.1798561151079137, + "grad_norm": 0.45884161347743313, + "learning_rate": 3.7792646517423236e-05, + "loss": 0.3744, + "step": 287 + }, + { + "epoch": 1.183967112024666, + "grad_norm": 0.4017320092584037, + "learning_rate": 3.7766320638808924e-05, + "loss": 0.3922, + "step": 288 + }, + { + "epoch": 1.1880781089414183, + "grad_norm": 0.4280615463958759, + "learning_rate": 3.773984798366389e-05, + "loss": 0.4006, + "step": 289 + }, + { + "epoch": 1.1921891058581706, + "grad_norm": 0.3513007234774324, + "learning_rate": 3.7713228770692084e-05, + "loss": 0.3819, + "step": 290 + }, + { + "epoch": 1.196300102774923, + "grad_norm": 0.4230975793009198, + "learning_rate": 3.768646321980824e-05, + "loss": 0.3819, + "step": 291 + }, + { + "epoch": 1.2004110996916753, + "grad_norm": 0.38816726480644864, + "learning_rate": 3.765955155213607e-05, + "loss": 0.391, + "step": 292 + }, + { + "epoch": 1.2045220966084276, + "grad_norm": 0.41570862488108373, + "learning_rate": 3.763249399000643e-05, + "loss": 0.3879, + "step": 293 + }, + { + "epoch": 1.20863309352518, + "grad_norm": 0.5386712701377521, + "learning_rate": 3.7605290756955476e-05, + "loss": 0.404, + "step": 294 + }, + { + "epoch": 1.2127440904419322, + "grad_norm": 0.4142838391786987, + "learning_rate": 3.757794207772283e-05, + "loss": 0.394, + "step": 295 + }, + { + "epoch": 1.2168550873586845, + "grad_norm": 0.3918702689073396, + "learning_rate": 3.755044817824971e-05, + "loss": 0.3833, + "step": 296 + }, + { + "epoch": 1.2209660842754368, + "grad_norm": 0.4790045020269064, + "learning_rate": 3.752280928567709e-05, + "loss": 0.3827, + "step": 297 + }, + { + "epoch": 1.2250770811921892, + "grad_norm": 0.4071852105252518, + "learning_rate": 3.749502562834379e-05, + "loss": 0.3972, + "step": 298 + }, + { + "epoch": 1.2291880781089415, + "grad_norm": 0.433522303940447, + "learning_rate": 3.746709743578462e-05, + "loss": 0.3985, + "step": 299 + }, + { + "epoch": 1.2332990750256938, + "grad_norm": 0.4222484903892338, + "learning_rate": 3.7439024938728435e-05, + "loss": 0.384, + "step": 300 + }, + { + "epoch": 1.2374100719424461, + "grad_norm": 0.42821966368019687, + "learning_rate": 3.74108083690963e-05, + "loss": 0.3908, + "step": 301 + }, + { + "epoch": 1.2415210688591984, + "grad_norm": 0.5269787553817297, + "learning_rate": 3.7382447959999514e-05, + "loss": 0.3869, + "step": 302 + }, + { + "epoch": 1.2456320657759508, + "grad_norm": 0.4206960432187445, + "learning_rate": 3.7353943945737716e-05, + "loss": 0.3984, + "step": 303 + }, + { + "epoch": 1.249743062692703, + "grad_norm": 0.3963715348953228, + "learning_rate": 3.7325296561796936e-05, + "loss": 0.3908, + "step": 304 + }, + { + "epoch": 1.2538540596094552, + "grad_norm": 0.5197873707406762, + "learning_rate": 3.729650604484766e-05, + "loss": 0.3789, + "step": 305 + }, + { + "epoch": 1.2579650565262077, + "grad_norm": 0.391975059464178, + "learning_rate": 3.7267572632742846e-05, + "loss": 0.39, + "step": 306 + }, + { + "epoch": 1.2620760534429598, + "grad_norm": 0.4297553917483092, + "learning_rate": 3.7238496564516006e-05, + "loss": 0.398, + "step": 307 + }, + { + "epoch": 1.2661870503597124, + "grad_norm": 0.32415884735671224, + "learning_rate": 3.720927808037921e-05, + "loss": 0.385, + "step": 308 + }, + { + "epoch": 1.2702980472764644, + "grad_norm": 0.4050462187721075, + "learning_rate": 3.717991742172106e-05, + "loss": 0.3801, + "step": 309 + }, + { + "epoch": 1.274409044193217, + "grad_norm": 0.44040991415716113, + "learning_rate": 3.7150414831104765e-05, + "loss": 0.3936, + "step": 310 + }, + { + "epoch": 1.278520041109969, + "grad_norm": 0.4117947843277416, + "learning_rate": 3.712077055226611e-05, + "loss": 0.3966, + "step": 311 + }, + { + "epoch": 1.2826310380267214, + "grad_norm": 0.4039039643321521, + "learning_rate": 3.7090984830111415e-05, + "loss": 0.3863, + "step": 312 + }, + { + "epoch": 1.2867420349434737, + "grad_norm": 0.39088426091872597, + "learning_rate": 3.7061057910715546e-05, + "loss": 0.4019, + "step": 313 + }, + { + "epoch": 1.290853031860226, + "grad_norm": 0.3364663722128402, + "learning_rate": 3.703099004131988e-05, + "loss": 0.389, + "step": 314 + }, + { + "epoch": 1.2949640287769784, + "grad_norm": 0.396387529395801, + "learning_rate": 3.700078147033023e-05, + "loss": 0.3826, + "step": 315 + }, + { + "epoch": 1.2990750256937307, + "grad_norm": 0.4034497950317108, + "learning_rate": 3.697043244731484e-05, + "loss": 0.387, + "step": 316 + }, + { + "epoch": 1.303186022610483, + "grad_norm": 0.45567545271356036, + "learning_rate": 3.693994322300228e-05, + "loss": 0.3903, + "step": 317 + }, + { + "epoch": 1.3072970195272353, + "grad_norm": 0.36949826512347733, + "learning_rate": 3.69093140492794e-05, + "loss": 0.3907, + "step": 318 + }, + { + "epoch": 1.3114080164439876, + "grad_norm": 0.3907383409192243, + "learning_rate": 3.687854517918926e-05, + "loss": 0.3884, + "step": 319 + }, + { + "epoch": 1.31551901336074, + "grad_norm": 0.400771927655429, + "learning_rate": 3.684763686692898e-05, + "loss": 0.3897, + "step": 320 + }, + { + "epoch": 1.3196300102774923, + "grad_norm": 0.28349821982969425, + "learning_rate": 3.681658936784773e-05, + "loss": 0.3819, + "step": 321 + }, + { + "epoch": 1.3237410071942446, + "grad_norm": 0.3707057575475429, + "learning_rate": 3.678540293844455e-05, + "loss": 0.4029, + "step": 322 + }, + { + "epoch": 1.327852004110997, + "grad_norm": 0.3571877683162145, + "learning_rate": 3.675407783636624e-05, + "loss": 0.3888, + "step": 323 + }, + { + "epoch": 1.3319630010277492, + "grad_norm": 0.35550987334717343, + "learning_rate": 3.672261432040527e-05, + "loss": 0.388, + "step": 324 + }, + { + "epoch": 1.3360739979445015, + "grad_norm": 0.33342159219384704, + "learning_rate": 3.6691012650497605e-05, + "loss": 0.3949, + "step": 325 + }, + { + "epoch": 1.3401849948612539, + "grad_norm": 0.33685885775370095, + "learning_rate": 3.665927308772057e-05, + "loss": 0.3801, + "step": 326 + }, + { + "epoch": 1.3442959917780062, + "grad_norm": 0.3507256755448898, + "learning_rate": 3.6627395894290685e-05, + "loss": 0.4011, + "step": 327 + }, + { + "epoch": 1.3484069886947585, + "grad_norm": 0.3729058607264493, + "learning_rate": 3.659538133356153e-05, + "loss": 0.3841, + "step": 328 + }, + { + "epoch": 1.3525179856115108, + "grad_norm": 0.35654106083716275, + "learning_rate": 3.656322967002151e-05, + "loss": 0.3798, + "step": 329 + }, + { + "epoch": 1.3566289825282631, + "grad_norm": 0.34899761497768883, + "learning_rate": 3.6530941169291744e-05, + "loss": 0.3769, + "step": 330 + }, + { + "epoch": 1.3607399794450155, + "grad_norm": 0.3500913238620904, + "learning_rate": 3.649851609812379e-05, + "loss": 0.4005, + "step": 331 + }, + { + "epoch": 1.3648509763617678, + "grad_norm": 0.4370742910901644, + "learning_rate": 3.646595472439753e-05, + "loss": 0.3812, + "step": 332 + }, + { + "epoch": 1.36896197327852, + "grad_norm": 0.4531455394409143, + "learning_rate": 3.643325731711888e-05, + "loss": 0.3949, + "step": 333 + }, + { + "epoch": 1.3730729701952724, + "grad_norm": 0.3382908051688983, + "learning_rate": 3.6400424146417604e-05, + "loss": 0.3951, + "step": 334 + }, + { + "epoch": 1.3771839671120247, + "grad_norm": 0.48124284386355537, + "learning_rate": 3.6367455483545066e-05, + "loss": 0.3886, + "step": 335 + }, + { + "epoch": 1.381294964028777, + "grad_norm": 0.4073484304811201, + "learning_rate": 3.633435160087202e-05, + "loss": 0.3833, + "step": 336 + }, + { + "epoch": 1.3854059609455294, + "grad_norm": 0.37602122616857575, + "learning_rate": 3.6301112771886315e-05, + "loss": 0.3947, + "step": 337 + }, + { + "epoch": 1.3895169578622815, + "grad_norm": 0.35827821253734476, + "learning_rate": 3.62677392711907e-05, + "loss": 0.39, + "step": 338 + }, + { + "epoch": 1.393627954779034, + "grad_norm": 0.40220244427058716, + "learning_rate": 3.623423137450046e-05, + "loss": 0.3912, + "step": 339 + }, + { + "epoch": 1.397738951695786, + "grad_norm": 0.3520064901329717, + "learning_rate": 3.620058935864123e-05, + "loss": 0.3902, + "step": 340 + }, + { + "epoch": 1.4018499486125386, + "grad_norm": 0.3470206706484027, + "learning_rate": 3.616681350154666e-05, + "loss": 0.3817, + "step": 341 + }, + { + "epoch": 1.4059609455292907, + "grad_norm": 0.3684394787845421, + "learning_rate": 3.613290408225615e-05, + "loss": 0.3827, + "step": 342 + }, + { + "epoch": 1.4100719424460433, + "grad_norm": 0.34749273169540446, + "learning_rate": 3.609886138091247e-05, + "loss": 0.3874, + "step": 343 + }, + { + "epoch": 1.4141829393627954, + "grad_norm": 0.36675012503855753, + "learning_rate": 3.606468567875957e-05, + "loss": 0.3863, + "step": 344 + }, + { + "epoch": 1.418293936279548, + "grad_norm": 0.3087501681001265, + "learning_rate": 3.603037725814014e-05, + "loss": 0.3878, + "step": 345 + }, + { + "epoch": 1.4224049331963, + "grad_norm": 0.44389782737034467, + "learning_rate": 3.599593640249334e-05, + "loss": 0.3775, + "step": 346 + }, + { + "epoch": 1.4265159301130523, + "grad_norm": 0.40184525645257135, + "learning_rate": 3.5961363396352435e-05, + "loss": 0.3878, + "step": 347 + }, + { + "epoch": 1.4306269270298047, + "grad_norm": 0.3739195862038827, + "learning_rate": 3.592665852534246e-05, + "loss": 0.3882, + "step": 348 + }, + { + "epoch": 1.434737923946557, + "grad_norm": 0.4667442608373335, + "learning_rate": 3.589182207617785e-05, + "loss": 0.3848, + "step": 349 + }, + { + "epoch": 1.4388489208633093, + "grad_norm": 0.2954516082957151, + "learning_rate": 3.5856854336660075e-05, + "loss": 0.3737, + "step": 350 + }, + { + "epoch": 1.4429599177800616, + "grad_norm": 0.3807234664590424, + "learning_rate": 3.582175559567524e-05, + "loss": 0.394, + "step": 351 + }, + { + "epoch": 1.447070914696814, + "grad_norm": 0.37991638726226773, + "learning_rate": 3.578652614319177e-05, + "loss": 0.3924, + "step": 352 + }, + { + "epoch": 1.4511819116135662, + "grad_norm": 0.4605647523732803, + "learning_rate": 3.575116627025791e-05, + "loss": 0.3895, + "step": 353 + }, + { + "epoch": 1.4552929085303186, + "grad_norm": 0.34460237531655397, + "learning_rate": 3.571567626899939e-05, + "loss": 0.3979, + "step": 354 + }, + { + "epoch": 1.4594039054470709, + "grad_norm": 0.37502366115502783, + "learning_rate": 3.568005643261701e-05, + "loss": 0.3865, + "step": 355 + }, + { + "epoch": 1.4635149023638232, + "grad_norm": 0.361240868061172, + "learning_rate": 3.5644307055384204e-05, + "loss": 0.3927, + "step": 356 + }, + { + "epoch": 1.4676258992805755, + "grad_norm": 0.36549527451613106, + "learning_rate": 3.5608428432644574e-05, + "loss": 0.3906, + "step": 357 + }, + { + "epoch": 1.4717368961973278, + "grad_norm": 0.3841131971215696, + "learning_rate": 3.557242086080953e-05, + "loss": 0.3882, + "step": 358 + }, + { + "epoch": 1.4758478931140802, + "grad_norm": 0.3053174499069298, + "learning_rate": 3.5536284637355766e-05, + "loss": 0.3882, + "step": 359 + }, + { + "epoch": 1.4799588900308325, + "grad_norm": 0.42930198135043723, + "learning_rate": 3.5500020060822844e-05, + "loss": 0.39, + "step": 360 + }, + { + "epoch": 1.4840698869475848, + "grad_norm": 0.3646026910744666, + "learning_rate": 3.54636274308107e-05, + "loss": 0.3919, + "step": 361 + }, + { + "epoch": 1.4881808838643371, + "grad_norm": 0.4584181730800767, + "learning_rate": 3.542710704797721e-05, + "loss": 0.3841, + "step": 362 + }, + { + "epoch": 1.4922918807810894, + "grad_norm": 0.3912766247821292, + "learning_rate": 3.539045921403566e-05, + "loss": 0.375, + "step": 363 + }, + { + "epoch": 1.4964028776978417, + "grad_norm": 0.39561662555483357, + "learning_rate": 3.5353684231752276e-05, + "loss": 0.3884, + "step": 364 + }, + { + "epoch": 1.500513874614594, + "grad_norm": 0.33669597693884484, + "learning_rate": 3.531678240494373e-05, + "loss": 0.3953, + "step": 365 + }, + { + "epoch": 1.5046248715313464, + "grad_norm": 0.4156836645972758, + "learning_rate": 3.5279754038474616e-05, + "loss": 0.3864, + "step": 366 + }, + { + "epoch": 1.5087358684480987, + "grad_norm": 0.3888603103920021, + "learning_rate": 3.524259943825493e-05, + "loss": 0.3864, + "step": 367 + }, + { + "epoch": 1.512846865364851, + "grad_norm": 0.34153109888601435, + "learning_rate": 3.5205318911237566e-05, + "loss": 0.3829, + "step": 368 + }, + { + "epoch": 1.5169578622816033, + "grad_norm": 0.4203599723923179, + "learning_rate": 3.516791276541574e-05, + "loss": 0.391, + "step": 369 + }, + { + "epoch": 1.5210688591983557, + "grad_norm": 0.39707036421576897, + "learning_rate": 3.5130381309820474e-05, + "loss": 0.3852, + "step": 370 + }, + { + "epoch": 1.5251798561151078, + "grad_norm": 0.35484540902249145, + "learning_rate": 3.509272485451806e-05, + "loss": 0.3813, + "step": 371 + }, + { + "epoch": 1.5292908530318603, + "grad_norm": 0.35726960151965814, + "learning_rate": 3.5054943710607435e-05, + "loss": 0.3943, + "step": 372 + }, + { + "epoch": 1.5334018499486124, + "grad_norm": 0.34918237917940137, + "learning_rate": 3.50170381902177e-05, + "loss": 0.3813, + "step": 373 + }, + { + "epoch": 1.537512846865365, + "grad_norm": 0.3225637816337971, + "learning_rate": 3.497900860650545e-05, + "loss": 0.3818, + "step": 374 + }, + { + "epoch": 1.541623843782117, + "grad_norm": 0.3243987867777615, + "learning_rate": 3.494085527365224e-05, + "loss": 0.3759, + "step": 375 + }, + { + "epoch": 1.5457348406988696, + "grad_norm": 0.3158935559652955, + "learning_rate": 3.4902578506861995e-05, + "loss": 0.3893, + "step": 376 + }, + { + "epoch": 1.5498458376156217, + "grad_norm": 0.3815644429337655, + "learning_rate": 3.486417862235839e-05, + "loss": 0.3905, + "step": 377 + }, + { + "epoch": 1.5539568345323742, + "grad_norm": 0.3118180182058997, + "learning_rate": 3.4825655937382216e-05, + "loss": 0.3865, + "step": 378 + }, + { + "epoch": 1.5580678314491263, + "grad_norm": 0.3841430312682266, + "learning_rate": 3.4787010770188795e-05, + "loss": 0.3932, + "step": 379 + }, + { + "epoch": 1.5621788283658788, + "grad_norm": 0.39242889501386036, + "learning_rate": 3.474824344004534e-05, + "loss": 0.3906, + "step": 380 + }, + { + "epoch": 1.566289825282631, + "grad_norm": 0.3632635332732287, + "learning_rate": 3.4709354267228294e-05, + "loss": 0.3783, + "step": 381 + }, + { + "epoch": 1.5704008221993835, + "grad_norm": 0.4216314417617418, + "learning_rate": 3.467034357302073e-05, + "loss": 0.3816, + "step": 382 + }, + { + "epoch": 1.5745118191161356, + "grad_norm": 0.410057885099804, + "learning_rate": 3.463121167970966e-05, + "loss": 0.3843, + "step": 383 + }, + { + "epoch": 1.5786228160328881, + "grad_norm": 0.30821430114214227, + "learning_rate": 3.4591958910583365e-05, + "loss": 0.3871, + "step": 384 + }, + { + "epoch": 1.5827338129496402, + "grad_norm": 0.3935547600639123, + "learning_rate": 3.455258558992877e-05, + "loss": 0.379, + "step": 385 + }, + { + "epoch": 1.5868448098663928, + "grad_norm": 0.3250409563547552, + "learning_rate": 3.451309204302873e-05, + "loss": 0.3801, + "step": 386 + }, + { + "epoch": 1.5909558067831449, + "grad_norm": 0.356548846946166, + "learning_rate": 3.447347859615933e-05, + "loss": 0.379, + "step": 387 + }, + { + "epoch": 1.5950668036998972, + "grad_norm": 0.361163416612143, + "learning_rate": 3.443374557658723e-05, + "loss": 0.3745, + "step": 388 + }, + { + "epoch": 1.5991778006166495, + "grad_norm": 0.3446242948127641, + "learning_rate": 3.439389331256694e-05, + "loss": 0.3807, + "step": 389 + }, + { + "epoch": 1.6032887975334018, + "grad_norm": 0.33333480141647187, + "learning_rate": 3.435392213333809e-05, + "loss": 0.3832, + "step": 390 + }, + { + "epoch": 1.6073997944501541, + "grad_norm": 0.38066181381339836, + "learning_rate": 3.431383236912275e-05, + "loss": 0.3692, + "step": 391 + }, + { + "epoch": 1.6115107913669064, + "grad_norm": 0.3255044440323713, + "learning_rate": 3.427362435112268e-05, + "loss": 0.3728, + "step": 392 + }, + { + "epoch": 1.6156217882836588, + "grad_norm": 0.41417790723734144, + "learning_rate": 3.423329841151656e-05, + "loss": 0.3868, + "step": 393 + }, + { + "epoch": 1.619732785200411, + "grad_norm": 0.32170041659499554, + "learning_rate": 3.4192854883457326e-05, + "loss": 0.3724, + "step": 394 + }, + { + "epoch": 1.6238437821171634, + "grad_norm": 0.3522389078445349, + "learning_rate": 3.4152294101069345e-05, + "loss": 0.3755, + "step": 395 + }, + { + "epoch": 1.6279547790339157, + "grad_norm": 0.3154196575435205, + "learning_rate": 3.411161639944568e-05, + "loss": 0.3866, + "step": 396 + }, + { + "epoch": 1.632065775950668, + "grad_norm": 0.3883625817054837, + "learning_rate": 3.407082211464534e-05, + "loss": 0.3842, + "step": 397 + }, + { + "epoch": 1.6361767728674204, + "grad_norm": 0.32478029230772587, + "learning_rate": 3.402991158369047e-05, + "loss": 0.3856, + "step": 398 + }, + { + "epoch": 1.6402877697841727, + "grad_norm": 0.33777536538509645, + "learning_rate": 3.39888851445636e-05, + "loss": 0.3738, + "step": 399 + }, + { + "epoch": 1.644398766700925, + "grad_norm": 0.3645535574440166, + "learning_rate": 3.394774313620481e-05, + "loss": 0.3768, + "step": 400 + }, + { + "epoch": 1.6485097636176773, + "grad_norm": 0.33553965225554366, + "learning_rate": 3.390648589850897e-05, + "loss": 0.3854, + "step": 401 + }, + { + "epoch": 1.6526207605344296, + "grad_norm": 0.35131544263569836, + "learning_rate": 3.386511377232293e-05, + "loss": 0.383, + "step": 402 + }, + { + "epoch": 1.656731757451182, + "grad_norm": 0.3083698384899604, + "learning_rate": 3.382362709944268e-05, + "loss": 0.3913, + "step": 403 + }, + { + "epoch": 1.6608427543679343, + "grad_norm": 0.3444920510980315, + "learning_rate": 3.3782026222610525e-05, + "loss": 0.3912, + "step": 404 + }, + { + "epoch": 1.6649537512846866, + "grad_norm": 0.3109066824781155, + "learning_rate": 3.374031148551229e-05, + "loss": 0.3785, + "step": 405 + }, + { + "epoch": 1.6690647482014387, + "grad_norm": 0.393332877111885, + "learning_rate": 3.3698483232774435e-05, + "loss": 0.3811, + "step": 406 + }, + { + "epoch": 1.6731757451181912, + "grad_norm": 0.35010985881480106, + "learning_rate": 3.365654180996126e-05, + "loss": 0.3765, + "step": 407 + }, + { + "epoch": 1.6772867420349433, + "grad_norm": 0.3994860261819717, + "learning_rate": 3.361448756357199e-05, + "loss": 0.3855, + "step": 408 + }, + { + "epoch": 1.6813977389516959, + "grad_norm": 0.4026873313554007, + "learning_rate": 3.3572320841037945e-05, + "loss": 0.3776, + "step": 409 + }, + { + "epoch": 1.685508735868448, + "grad_norm": 0.3888166731552757, + "learning_rate": 3.353004199071969e-05, + "loss": 0.389, + "step": 410 + }, + { + "epoch": 1.6896197327852005, + "grad_norm": 0.4229642214250034, + "learning_rate": 3.348765136190412e-05, + "loss": 0.3844, + "step": 411 + }, + { + "epoch": 1.6937307297019526, + "grad_norm": 0.3719493753316055, + "learning_rate": 3.344514930480158e-05, + "loss": 0.3718, + "step": 412 + }, + { + "epoch": 1.6978417266187051, + "grad_norm": 0.3750792470447336, + "learning_rate": 3.3402536170542985e-05, + "loss": 0.4017, + "step": 413 + }, + { + "epoch": 1.7019527235354572, + "grad_norm": 0.37953623181883855, + "learning_rate": 3.335981231117694e-05, + "loss": 0.3786, + "step": 414 + }, + { + "epoch": 1.7060637204522098, + "grad_norm": 0.42228613250314784, + "learning_rate": 3.331697807966676e-05, + "loss": 0.3902, + "step": 415 + }, + { + "epoch": 1.7101747173689619, + "grad_norm": 0.33605301616513616, + "learning_rate": 3.327403382988764e-05, + "loss": 0.382, + "step": 416 + }, + { + "epoch": 1.7142857142857144, + "grad_norm": 0.915407319860973, + "learning_rate": 3.3230979916623667e-05, + "loss": 0.3868, + "step": 417 + }, + { + "epoch": 1.7183967112024665, + "grad_norm": 0.4415883382317921, + "learning_rate": 3.318781669556493e-05, + "loss": 0.4025, + "step": 418 + }, + { + "epoch": 1.722507708119219, + "grad_norm": 0.29764556226533273, + "learning_rate": 3.3144544523304545e-05, + "loss": 0.3868, + "step": 419 + }, + { + "epoch": 1.7266187050359711, + "grad_norm": 0.3864981881512229, + "learning_rate": 3.310116375733575e-05, + "loss": 0.3848, + "step": 420 + }, + { + "epoch": 1.7307297019527237, + "grad_norm": 0.4237534589835872, + "learning_rate": 3.3057674756048906e-05, + "loss": 0.3884, + "step": 421 + }, + { + "epoch": 1.7348406988694758, + "grad_norm": 0.30622506767945284, + "learning_rate": 3.30140778787286e-05, + "loss": 0.3962, + "step": 422 + }, + { + "epoch": 1.7389516957862283, + "grad_norm": 0.3715870543554042, + "learning_rate": 3.297037348555059e-05, + "loss": 0.3804, + "step": 423 + }, + { + "epoch": 1.7430626927029804, + "grad_norm": 0.3158873451974222, + "learning_rate": 3.292656193757891e-05, + "loss": 0.3808, + "step": 424 + }, + { + "epoch": 1.7471736896197327, + "grad_norm": 3.5993500626700534, + "learning_rate": 3.2882643596762847e-05, + "loss": 0.3766, + "step": 425 + }, + { + "epoch": 1.751284686536485, + "grad_norm": 0.41799220024756045, + "learning_rate": 3.283861882593394e-05, + "loss": 0.3629, + "step": 426 + }, + { + "epoch": 1.7553956834532374, + "grad_norm": 0.4028165918419239, + "learning_rate": 3.2794487988803024e-05, + "loss": 0.3946, + "step": 427 + }, + { + "epoch": 1.7595066803699897, + "grad_norm": 0.45312099756724705, + "learning_rate": 3.275025144995719e-05, + "loss": 0.3826, + "step": 428 + }, + { + "epoch": 1.763617677286742, + "grad_norm": 0.3682320829470106, + "learning_rate": 3.270590957485678e-05, + "loss": 0.3822, + "step": 429 + }, + { + "epoch": 1.7677286742034943, + "grad_norm": 0.432471521500914, + "learning_rate": 3.266146272983238e-05, + "loss": 0.379, + "step": 430 + }, + { + "epoch": 1.7718396711202467, + "grad_norm": 0.49258814180632715, + "learning_rate": 3.261691128208178e-05, + "loss": 0.3781, + "step": 431 + }, + { + "epoch": 1.775950668036999, + "grad_norm": 0.42039354140050533, + "learning_rate": 3.2572255599666946e-05, + "loss": 0.3858, + "step": 432 + }, + { + "epoch": 1.7800616649537513, + "grad_norm": 0.4266657139962505, + "learning_rate": 3.252749605151099e-05, + "loss": 0.3889, + "step": 433 + }, + { + "epoch": 1.7841726618705036, + "grad_norm": 0.4111540760053901, + "learning_rate": 3.24826330073951e-05, + "loss": 0.3828, + "step": 434 + }, + { + "epoch": 1.788283658787256, + "grad_norm": 0.40054562650751135, + "learning_rate": 3.2437666837955495e-05, + "loss": 0.3821, + "step": 435 + }, + { + "epoch": 1.7923946557040082, + "grad_norm": 0.4049883565747011, + "learning_rate": 3.239259791468037e-05, + "loss": 0.3782, + "step": 436 + }, + { + "epoch": 1.7965056526207606, + "grad_norm": 0.3211989179680821, + "learning_rate": 3.234742660990681e-05, + "loss": 0.3886, + "step": 437 + }, + { + "epoch": 1.8006166495375129, + "grad_norm": 0.3415159428416263, + "learning_rate": 3.230215329681775e-05, + "loss": 0.3865, + "step": 438 + }, + { + "epoch": 1.8047276464542652, + "grad_norm": 0.30795596054473745, + "learning_rate": 3.225677834943884e-05, + "loss": 0.3798, + "step": 439 + }, + { + "epoch": 1.8088386433710175, + "grad_norm": 0.3527630027822489, + "learning_rate": 3.22113021426354e-05, + "loss": 0.371, + "step": 440 + }, + { + "epoch": 1.8129496402877698, + "grad_norm": 0.38597884530280835, + "learning_rate": 3.216572505210929e-05, + "loss": 0.386, + "step": 441 + }, + { + "epoch": 1.8170606372045222, + "grad_norm": 0.35477892953521534, + "learning_rate": 3.2120047454395845e-05, + "loss": 0.3837, + "step": 442 + }, + { + "epoch": 1.8211716341212743, + "grad_norm": 0.34702546052353167, + "learning_rate": 3.207426972686071e-05, + "loss": 0.3892, + "step": 443 + }, + { + "epoch": 1.8252826310380268, + "grad_norm": 0.30619045437996395, + "learning_rate": 3.202839224769678e-05, + "loss": 0.3911, + "step": 444 + }, + { + "epoch": 1.829393627954779, + "grad_norm": 0.308117763052393, + "learning_rate": 3.198241539592103e-05, + "loss": 0.388, + "step": 445 + }, + { + "epoch": 1.8335046248715314, + "grad_norm": 0.3813900684937835, + "learning_rate": 3.1936339551371416e-05, + "loss": 0.3733, + "step": 446 + }, + { + "epoch": 1.8376156217882835, + "grad_norm": 0.37451609765152405, + "learning_rate": 3.1890165094703704e-05, + "loss": 0.382, + "step": 447 + }, + { + "epoch": 1.841726618705036, + "grad_norm": 0.3343641229801653, + "learning_rate": 3.184389240738838e-05, + "loss": 0.3843, + "step": 448 + }, + { + "epoch": 1.8458376156217882, + "grad_norm": 0.3671589935937082, + "learning_rate": 3.179752187170741e-05, + "loss": 0.3914, + "step": 449 + }, + { + "epoch": 1.8499486125385407, + "grad_norm": 0.3997665963907156, + "learning_rate": 3.1751053870751184e-05, + "loss": 0.3843, + "step": 450 + }, + { + "epoch": 1.8540596094552928, + "grad_norm": 0.3253860699538578, + "learning_rate": 3.1704488788415274e-05, + "loss": 0.3855, + "step": 451 + }, + { + "epoch": 1.8581706063720453, + "grad_norm": 0.3876573196918091, + "learning_rate": 3.16578270093973e-05, + "loss": 0.386, + "step": 452 + }, + { + "epoch": 1.8622816032887974, + "grad_norm": 0.38773352168091224, + "learning_rate": 3.1611068919193756e-05, + "loss": 0.3783, + "step": 453 + }, + { + "epoch": 1.86639260020555, + "grad_norm": 0.31178745493034576, + "learning_rate": 3.1564214904096774e-05, + "loss": 0.385, + "step": 454 + }, + { + "epoch": 1.870503597122302, + "grad_norm": 0.5723565190756046, + "learning_rate": 3.1517265351191e-05, + "loss": 0.3841, + "step": 455 + }, + { + "epoch": 1.8746145940390546, + "grad_norm": 0.46777702023370726, + "learning_rate": 3.147022064835036e-05, + "loss": 0.385, + "step": 456 + }, + { + "epoch": 1.8787255909558067, + "grad_norm": 0.48665108942706403, + "learning_rate": 3.142308118423485e-05, + "loss": 0.3808, + "step": 457 + }, + { + "epoch": 1.8828365878725593, + "grad_norm": 0.5712910353884142, + "learning_rate": 3.1375847348287365e-05, + "loss": 0.3898, + "step": 458 + }, + { + "epoch": 1.8869475847893113, + "grad_norm": 0.4140740468743388, + "learning_rate": 3.132851953073041e-05, + "loss": 0.3823, + "step": 459 + }, + { + "epoch": 1.8910585817060637, + "grad_norm": 0.4981734090282241, + "learning_rate": 3.128109812256296e-05, + "loss": 0.379, + "step": 460 + }, + { + "epoch": 1.895169578622816, + "grad_norm": 0.3901725689405749, + "learning_rate": 3.1233583515557166e-05, + "loss": 0.3802, + "step": 461 + }, + { + "epoch": 1.8992805755395683, + "grad_norm": 0.39288710655716796, + "learning_rate": 3.118597610225514e-05, + "loss": 0.3648, + "step": 462 + }, + { + "epoch": 1.9033915724563206, + "grad_norm": 0.3943553998150945, + "learning_rate": 3.113827627596575e-05, + "loss": 0.3845, + "step": 463 + }, + { + "epoch": 1.907502569373073, + "grad_norm": 0.3815649604071033, + "learning_rate": 3.1090484430761275e-05, + "loss": 0.3968, + "step": 464 + }, + { + "epoch": 1.9116135662898253, + "grad_norm": 0.4341716993023021, + "learning_rate": 3.104260096147426e-05, + "loss": 0.3825, + "step": 465 + }, + { + "epoch": 1.9157245632065776, + "grad_norm": 0.2885815389134767, + "learning_rate": 3.099462626369418e-05, + "loss": 0.379, + "step": 466 + }, + { + "epoch": 1.91983556012333, + "grad_norm": 0.3706179172517124, + "learning_rate": 3.094656073376419e-05, + "loss": 0.3882, + "step": 467 + }, + { + "epoch": 1.9239465570400822, + "grad_norm": 0.3443004696246589, + "learning_rate": 3.0898404768777863e-05, + "loss": 0.3855, + "step": 468 + }, + { + "epoch": 1.9280575539568345, + "grad_norm": 0.29420490623628953, + "learning_rate": 3.0850158766575907e-05, + "loss": 0.3843, + "step": 469 + }, + { + "epoch": 1.9321685508735869, + "grad_norm": 0.3876924984247156, + "learning_rate": 3.080182312574286e-05, + "loss": 0.3746, + "step": 470 + }, + { + "epoch": 1.9362795477903392, + "grad_norm": 0.28747642038559285, + "learning_rate": 3.075339824560382e-05, + "loss": 0.3718, + "step": 471 + }, + { + "epoch": 1.9403905447070915, + "grad_norm": 0.32380146376848085, + "learning_rate": 3.070488452622113e-05, + "loss": 0.3934, + "step": 472 + }, + { + "epoch": 1.9445015416238438, + "grad_norm": 0.32465438985148803, + "learning_rate": 3.0656282368391086e-05, + "loss": 0.3729, + "step": 473 + }, + { + "epoch": 1.9486125385405961, + "grad_norm": 0.35563518327266175, + "learning_rate": 3.0607592173640615e-05, + "loss": 0.3795, + "step": 474 + }, + { + "epoch": 1.9527235354573484, + "grad_norm": 0.331866902145928, + "learning_rate": 3.055881434422395e-05, + "loss": 0.3981, + "step": 475 + }, + { + "epoch": 1.9568345323741008, + "grad_norm": 0.3342210067868538, + "learning_rate": 3.0509949283119348e-05, + "loss": 0.3717, + "step": 476 + }, + { + "epoch": 1.960945529290853, + "grad_norm": 0.33585492528175326, + "learning_rate": 3.0460997394025694e-05, + "loss": 0.3993, + "step": 477 + }, + { + "epoch": 1.9650565262076052, + "grad_norm": 0.31011270781830746, + "learning_rate": 3.0411959081359223e-05, + "loss": 0.3865, + "step": 478 + }, + { + "epoch": 1.9691675231243577, + "grad_norm": 0.35530598698818877, + "learning_rate": 3.036283475025016e-05, + "loss": 0.3784, + "step": 479 + }, + { + "epoch": 1.9732785200411098, + "grad_norm": 0.3734052740131826, + "learning_rate": 3.031362480653937e-05, + "loss": 0.3762, + "step": 480 + }, + { + "epoch": 1.9773895169578624, + "grad_norm": 0.3014940676108034, + "learning_rate": 3.0264329656775e-05, + "loss": 0.3757, + "step": 481 + }, + { + "epoch": 1.9815005138746145, + "grad_norm": 0.3512117145148321, + "learning_rate": 3.021494970820912e-05, + "loss": 0.3827, + "step": 482 + }, + { + "epoch": 1.985611510791367, + "grad_norm": 0.37355802443996994, + "learning_rate": 3.01654853687944e-05, + "loss": 0.3642, + "step": 483 + }, + { + "epoch": 1.989722507708119, + "grad_norm": 0.2861615252457176, + "learning_rate": 3.011593704718067e-05, + "loss": 0.3963, + "step": 484 + }, + { + "epoch": 1.9938335046248716, + "grad_norm": 0.3745753953644458, + "learning_rate": 3.0066305152711598e-05, + "loss": 0.3878, + "step": 485 + }, + { + "epoch": 1.9979445015416237, + "grad_norm": 0.26150625589651816, + "learning_rate": 3.0016590095421273e-05, + "loss": 0.3721, + "step": 486 + }, + { + "epoch": 2.0020554984583763, + "grad_norm": 0.3918210766291012, + "learning_rate": 2.9966792286030853e-05, + "loss": 0.3396, + "step": 487 + }, + { + "epoch": 2.0061664953751284, + "grad_norm": 0.315886174265335, + "learning_rate": 2.9916912135945147e-05, + "loss": 0.3326, + "step": 488 + }, + { + "epoch": 2.010277492291881, + "grad_norm": 0.5139005301093035, + "learning_rate": 2.986695005724921e-05, + "loss": 0.3331, + "step": 489 + }, + { + "epoch": 2.014388489208633, + "grad_norm": 0.4039956111942429, + "learning_rate": 2.9816906462704963e-05, + "loss": 0.3318, + "step": 490 + }, + { + "epoch": 2.0184994861253855, + "grad_norm": 0.3643964107370674, + "learning_rate": 2.9766781765747775e-05, + "loss": 0.331, + "step": 491 + }, + { + "epoch": 2.0226104830421376, + "grad_norm": 0.36816703584916016, + "learning_rate": 2.971657638048302e-05, + "loss": 0.3318, + "step": 492 + }, + { + "epoch": 2.02672147995889, + "grad_norm": 0.42271803167213406, + "learning_rate": 2.966629072168271e-05, + "loss": 0.3344, + "step": 493 + }, + { + "epoch": 2.0308324768756423, + "grad_norm": 0.3930653905455099, + "learning_rate": 2.9615925204782006e-05, + "loss": 0.3177, + "step": 494 + }, + { + "epoch": 2.034943473792395, + "grad_norm": 0.40048080993718765, + "learning_rate": 2.9565480245875845e-05, + "loss": 0.3358, + "step": 495 + }, + { + "epoch": 2.039054470709147, + "grad_norm": 0.3484759664627585, + "learning_rate": 2.9514956261715458e-05, + "loss": 0.3303, + "step": 496 + }, + { + "epoch": 2.0431654676258995, + "grad_norm": 1.2952724176128951, + "learning_rate": 2.9464353669704943e-05, + "loss": 0.3683, + "step": 497 + }, + { + "epoch": 2.0472764645426516, + "grad_norm": 0.3554075988337682, + "learning_rate": 2.9413672887897828e-05, + "loss": 0.34, + "step": 498 + }, + { + "epoch": 2.051387461459404, + "grad_norm": 0.5002880458311052, + "learning_rate": 2.936291433499359e-05, + "loss": 0.3304, + "step": 499 + }, + { + "epoch": 2.055498458376156, + "grad_norm": 0.37407564917246083, + "learning_rate": 2.9312078430334228e-05, + "loss": 0.3358, + "step": 500 + }, + { + "epoch": 2.0596094552929087, + "grad_norm": 0.4077464439245042, + "learning_rate": 2.926116559390078e-05, + "loss": 0.3232, + "step": 501 + }, + { + "epoch": 2.063720452209661, + "grad_norm": 0.3809714773736691, + "learning_rate": 2.921017624630984e-05, + "loss": 0.323, + "step": 502 + }, + { + "epoch": 2.0678314491264134, + "grad_norm": 0.311417329278231, + "learning_rate": 2.9159110808810125e-05, + "loss": 0.3192, + "step": 503 + }, + { + "epoch": 2.0719424460431655, + "grad_norm": 0.44030503896413653, + "learning_rate": 2.9107969703278952e-05, + "loss": 0.3354, + "step": 504 + }, + { + "epoch": 2.0760534429599176, + "grad_norm": 0.3036316440870094, + "learning_rate": 2.905675335221877e-05, + "loss": 0.3299, + "step": 505 + }, + { + "epoch": 2.08016443987667, + "grad_norm": 0.4247647928977746, + "learning_rate": 2.900546217875368e-05, + "loss": 0.3288, + "step": 506 + }, + { + "epoch": 2.084275436793422, + "grad_norm": 0.3390034407042314, + "learning_rate": 2.895409660662592e-05, + "loss": 0.3328, + "step": 507 + }, + { + "epoch": 2.0883864337101747, + "grad_norm": 0.4990365827984489, + "learning_rate": 2.8902657060192366e-05, + "loss": 0.3376, + "step": 508 + }, + { + "epoch": 2.092497430626927, + "grad_norm": 0.5173705351976455, + "learning_rate": 2.8851143964421048e-05, + "loss": 0.3356, + "step": 509 + }, + { + "epoch": 2.0966084275436794, + "grad_norm": 0.5151102205707064, + "learning_rate": 2.879955774488762e-05, + "loss": 0.332, + "step": 510 + }, + { + "epoch": 2.1007194244604315, + "grad_norm": 0.43417520836094964, + "learning_rate": 2.8747898827771846e-05, + "loss": 0.3389, + "step": 511 + }, + { + "epoch": 2.104830421377184, + "grad_norm": 0.5355654606933186, + "learning_rate": 2.8696167639854073e-05, + "loss": 0.341, + "step": 512 + }, + { + "epoch": 2.108941418293936, + "grad_norm": 0.4367393823993611, + "learning_rate": 2.864436460851173e-05, + "loss": 0.3299, + "step": 513 + }, + { + "epoch": 2.1130524152106887, + "grad_norm": 0.45783141095235763, + "learning_rate": 2.8592490161715768e-05, + "loss": 0.3191, + "step": 514 + }, + { + "epoch": 2.1171634121274407, + "grad_norm": 0.42545114058633565, + "learning_rate": 2.8540544728027145e-05, + "loss": 0.3145, + "step": 515 + }, + { + "epoch": 2.1212744090441933, + "grad_norm": 0.3661488589187853, + "learning_rate": 2.8488528736593278e-05, + "loss": 0.3275, + "step": 516 + }, + { + "epoch": 2.1253854059609454, + "grad_norm": 0.5060616601470208, + "learning_rate": 2.843644261714448e-05, + "loss": 0.3384, + "step": 517 + }, + { + "epoch": 2.129496402877698, + "grad_norm": 0.3580510131662911, + "learning_rate": 2.8384286799990452e-05, + "loss": 0.3296, + "step": 518 + }, + { + "epoch": 2.13360739979445, + "grad_norm": 0.45075270681673163, + "learning_rate": 2.8332061716016692e-05, + "loss": 0.32, + "step": 519 + }, + { + "epoch": 2.1377183967112026, + "grad_norm": 0.4708082264494772, + "learning_rate": 2.8279767796680934e-05, + "loss": 0.3332, + "step": 520 + }, + { + "epoch": 2.1418293936279547, + "grad_norm": 0.35417572710043976, + "learning_rate": 2.8227405474009616e-05, + "loss": 0.325, + "step": 521 + }, + { + "epoch": 2.145940390544707, + "grad_norm": 0.5178072041280041, + "learning_rate": 2.817497518059428e-05, + "loss": 0.3286, + "step": 522 + }, + { + "epoch": 2.1500513874614593, + "grad_norm": 0.333153745006992, + "learning_rate": 2.8122477349588005e-05, + "loss": 0.3247, + "step": 523 + }, + { + "epoch": 2.154162384378212, + "grad_norm": 0.5499040672396817, + "learning_rate": 2.8069912414701842e-05, + "loss": 0.3338, + "step": 524 + }, + { + "epoch": 2.158273381294964, + "grad_norm": 0.29956362280088755, + "learning_rate": 2.8017280810201213e-05, + "loss": 0.3307, + "step": 525 + }, + { + "epoch": 2.1623843782117165, + "grad_norm": 0.4276269537060341, + "learning_rate": 2.7964582970902338e-05, + "loss": 0.3263, + "step": 526 + }, + { + "epoch": 2.1664953751284686, + "grad_norm": 0.3311312720633184, + "learning_rate": 2.7911819332168627e-05, + "loss": 0.3302, + "step": 527 + }, + { + "epoch": 2.170606372045221, + "grad_norm": 0.32361165480350135, + "learning_rate": 2.78589903299071e-05, + "loss": 0.3307, + "step": 528 + }, + { + "epoch": 2.174717368961973, + "grad_norm": 0.33128441736832326, + "learning_rate": 2.7806096400564775e-05, + "loss": 0.3234, + "step": 529 + }, + { + "epoch": 2.1788283658787257, + "grad_norm": 0.2945513597575282, + "learning_rate": 2.7753137981125068e-05, + "loss": 0.3354, + "step": 530 + }, + { + "epoch": 2.182939362795478, + "grad_norm": 0.35769877925150756, + "learning_rate": 2.7700115509104176e-05, + "loss": 0.336, + "step": 531 + }, + { + "epoch": 2.1870503597122304, + "grad_norm": 0.3065613778661335, + "learning_rate": 2.7647029422547465e-05, + "loss": 0.3326, + "step": 532 + }, + { + "epoch": 2.1911613566289825, + "grad_norm": 0.31377341167653106, + "learning_rate": 2.7593880160025864e-05, + "loss": 0.3354, + "step": 533 + }, + { + "epoch": 2.195272353545735, + "grad_norm": 0.28252520009349796, + "learning_rate": 2.754066816063222e-05, + "loss": 0.3194, + "step": 534 + }, + { + "epoch": 2.199383350462487, + "grad_norm": 0.2792714767775337, + "learning_rate": 2.7487393863977687e-05, + "loss": 0.3369, + "step": 535 + }, + { + "epoch": 2.2034943473792397, + "grad_norm": 0.28232252447629436, + "learning_rate": 2.7434057710188077e-05, + "loss": 0.3157, + "step": 536 + }, + { + "epoch": 2.2076053442959918, + "grad_norm": 0.25752143372328223, + "learning_rate": 2.738066013990025e-05, + "loss": 0.3153, + "step": 537 + }, + { + "epoch": 2.2117163412127443, + "grad_norm": 0.297593757050134, + "learning_rate": 2.732720159425845e-05, + "loss": 0.3296, + "step": 538 + }, + { + "epoch": 2.2158273381294964, + "grad_norm": 0.2758026361391992, + "learning_rate": 2.7273682514910668e-05, + "loss": 0.3247, + "step": 539 + }, + { + "epoch": 2.2199383350462485, + "grad_norm": 0.3422530970797541, + "learning_rate": 2.7220103344004995e-05, + "loss": 0.3293, + "step": 540 + }, + { + "epoch": 2.224049331963001, + "grad_norm": 0.3668216989996492, + "learning_rate": 2.7166464524185977e-05, + "loss": 0.3419, + "step": 541 + }, + { + "epoch": 2.2281603288797536, + "grad_norm": 0.3353880633821636, + "learning_rate": 2.7112766498590944e-05, + "loss": 0.3277, + "step": 542 + }, + { + "epoch": 2.2322713257965057, + "grad_norm": 0.3990592979092236, + "learning_rate": 2.705900971084635e-05, + "loss": 0.3352, + "step": 543 + }, + { + "epoch": 2.2363823227132578, + "grad_norm": 0.34649020190108354, + "learning_rate": 2.7005194605064122e-05, + "loss": 0.3334, + "step": 544 + }, + { + "epoch": 2.2404933196300103, + "grad_norm": 0.30730771299144677, + "learning_rate": 2.6951321625837975e-05, + "loss": 0.3299, + "step": 545 + }, + { + "epoch": 2.2446043165467624, + "grad_norm": 0.3875753398426506, + "learning_rate": 2.6897391218239746e-05, + "loss": 0.3338, + "step": 546 + }, + { + "epoch": 2.248715313463515, + "grad_norm": 0.27365792996452604, + "learning_rate": 2.6843403827815714e-05, + "loss": 0.3353, + "step": 547 + }, + { + "epoch": 2.252826310380267, + "grad_norm": 0.3913041787492654, + "learning_rate": 2.6789359900582935e-05, + "loss": 0.3274, + "step": 548 + }, + { + "epoch": 2.2569373072970196, + "grad_norm": 0.25296864218831433, + "learning_rate": 2.673525988302553e-05, + "loss": 0.344, + "step": 549 + }, + { + "epoch": 2.2610483042137717, + "grad_norm": 0.3931054005221806, + "learning_rate": 2.6681104222091018e-05, + "loss": 0.3387, + "step": 550 + }, + { + "epoch": 2.265159301130524, + "grad_norm": 0.25058187158942646, + "learning_rate": 2.662689336518661e-05, + "loss": 0.3306, + "step": 551 + }, + { + "epoch": 2.2692702980472763, + "grad_norm": 0.34466530037047466, + "learning_rate": 2.6572627760175523e-05, + "loss": 0.334, + "step": 552 + }, + { + "epoch": 2.273381294964029, + "grad_norm": 0.27034275974079125, + "learning_rate": 2.6518307855373276e-05, + "loss": 0.3245, + "step": 553 + }, + { + "epoch": 2.277492291880781, + "grad_norm": 0.32305382508070213, + "learning_rate": 2.6463934099543992e-05, + "loss": 0.3337, + "step": 554 + }, + { + "epoch": 2.2816032887975335, + "grad_norm": 0.2943172520547782, + "learning_rate": 2.6409506941896665e-05, + "loss": 0.336, + "step": 555 + }, + { + "epoch": 2.2857142857142856, + "grad_norm": 0.3589575171976915, + "learning_rate": 2.6355026832081493e-05, + "loss": 0.331, + "step": 556 + }, + { + "epoch": 2.289825282631038, + "grad_norm": 0.32434981120796447, + "learning_rate": 2.6300494220186113e-05, + "loss": 0.3318, + "step": 557 + }, + { + "epoch": 2.2939362795477902, + "grad_norm": 0.3207602935494296, + "learning_rate": 2.6245909556731937e-05, + "loss": 0.3244, + "step": 558 + }, + { + "epoch": 2.2980472764645428, + "grad_norm": 0.2994113594865251, + "learning_rate": 2.6191273292670372e-05, + "loss": 0.3342, + "step": 559 + }, + { + "epoch": 2.302158273381295, + "grad_norm": 0.3188506407975691, + "learning_rate": 2.6136585879379145e-05, + "loss": 0.3394, + "step": 560 + }, + { + "epoch": 2.3062692702980474, + "grad_norm": 0.3098768532791999, + "learning_rate": 2.608184776865854e-05, + "loss": 0.3289, + "step": 561 + }, + { + "epoch": 2.3103802672147995, + "grad_norm": 0.30655390743691074, + "learning_rate": 2.602705941272769e-05, + "loss": 0.322, + "step": 562 + }, + { + "epoch": 2.314491264131552, + "grad_norm": 0.3063020702615511, + "learning_rate": 2.597222126422081e-05, + "loss": 0.3332, + "step": 563 + }, + { + "epoch": 2.318602261048304, + "grad_norm": 0.3156706802866886, + "learning_rate": 2.5917333776183503e-05, + "loss": 0.3228, + "step": 564 + }, + { + "epoch": 2.3227132579650567, + "grad_norm": 0.28723121088598647, + "learning_rate": 2.586239740206897e-05, + "loss": 0.3197, + "step": 565 + }, + { + "epoch": 2.3268242548818088, + "grad_norm": 0.30433476866497944, + "learning_rate": 2.5807412595734283e-05, + "loss": 0.3279, + "step": 566 + }, + { + "epoch": 2.3309352517985613, + "grad_norm": 0.25756394575456126, + "learning_rate": 2.5752379811436655e-05, + "loss": 0.324, + "step": 567 + }, + { + "epoch": 2.3350462487153134, + "grad_norm": 0.3098697810639567, + "learning_rate": 2.5697299503829657e-05, + "loss": 0.3347, + "step": 568 + }, + { + "epoch": 2.339157245632066, + "grad_norm": 0.30837147852538255, + "learning_rate": 2.5642172127959475e-05, + "loss": 0.3292, + "step": 569 + }, + { + "epoch": 2.343268242548818, + "grad_norm": 0.3069265524451522, + "learning_rate": 2.558699813926115e-05, + "loss": 0.3323, + "step": 570 + }, + { + "epoch": 2.3473792394655706, + "grad_norm": 0.32407359000068336, + "learning_rate": 2.5531777993554813e-05, + "loss": 0.3317, + "step": 571 + }, + { + "epoch": 2.3514902363823227, + "grad_norm": 0.3118278827236543, + "learning_rate": 2.5476512147041926e-05, + "loss": 0.3428, + "step": 572 + }, + { + "epoch": 2.3556012332990752, + "grad_norm": 0.3342461379651357, + "learning_rate": 2.5421201056301507e-05, + "loss": 0.3284, + "step": 573 + }, + { + "epoch": 2.3597122302158273, + "grad_norm": 0.29958642203118996, + "learning_rate": 2.5365845178286358e-05, + "loss": 0.3275, + "step": 574 + }, + { + "epoch": 2.3638232271325794, + "grad_norm": 0.3328808710382115, + "learning_rate": 2.5310444970319292e-05, + "loss": 0.3301, + "step": 575 + }, + { + "epoch": 2.367934224049332, + "grad_norm": 0.3782109254880134, + "learning_rate": 2.525500089008936e-05, + "loss": 0.3375, + "step": 576 + }, + { + "epoch": 2.3720452209660845, + "grad_norm": 0.3451832289715049, + "learning_rate": 2.5199513395648047e-05, + "loss": 0.3207, + "step": 577 + }, + { + "epoch": 2.3761562178828366, + "grad_norm": 0.34430102536898843, + "learning_rate": 2.5143982945405527e-05, + "loss": 0.3335, + "step": 578 + }, + { + "epoch": 2.3802672147995887, + "grad_norm": 0.3480033297872511, + "learning_rate": 2.5088409998126827e-05, + "loss": 0.3364, + "step": 579 + }, + { + "epoch": 2.3843782117163412, + "grad_norm": 0.2864218297613634, + "learning_rate": 2.5032795012928093e-05, + "loss": 0.3296, + "step": 580 + }, + { + "epoch": 2.3884892086330938, + "grad_norm": 0.27656367755763744, + "learning_rate": 2.4977138449272746e-05, + "loss": 0.3252, + "step": 581 + }, + { + "epoch": 2.392600205549846, + "grad_norm": 0.3210933436925842, + "learning_rate": 2.4921440766967718e-05, + "loss": 0.3292, + "step": 582 + }, + { + "epoch": 2.396711202466598, + "grad_norm": 0.2695136624951651, + "learning_rate": 2.4865702426159633e-05, + "loss": 0.3345, + "step": 583 + }, + { + "epoch": 2.4008221993833505, + "grad_norm": 0.2853367762196653, + "learning_rate": 2.4809923887331028e-05, + "loss": 0.3272, + "step": 584 + }, + { + "epoch": 2.4049331963001026, + "grad_norm": 0.26413255067697416, + "learning_rate": 2.4754105611296534e-05, + "loss": 0.3244, + "step": 585 + }, + { + "epoch": 2.409044193216855, + "grad_norm": 0.2788852049644498, + "learning_rate": 2.4698248059199056e-05, + "loss": 0.3211, + "step": 586 + }, + { + "epoch": 2.4131551901336072, + "grad_norm": 0.2839043038975584, + "learning_rate": 2.4642351692505998e-05, + "loss": 0.3227, + "step": 587 + }, + { + "epoch": 2.41726618705036, + "grad_norm": 0.26795507769344473, + "learning_rate": 2.4586416973005414e-05, + "loss": 0.3255, + "step": 588 + }, + { + "epoch": 2.421377183967112, + "grad_norm": 0.28061689093119546, + "learning_rate": 2.453044436280223e-05, + "loss": 0.3297, + "step": 589 + }, + { + "epoch": 2.4254881808838644, + "grad_norm": 0.2768666834694627, + "learning_rate": 2.4474434324314388e-05, + "loss": 0.3351, + "step": 590 + }, + { + "epoch": 2.4295991778006165, + "grad_norm": 0.2992581902997908, + "learning_rate": 2.4418387320269047e-05, + "loss": 0.3185, + "step": 591 + }, + { + "epoch": 2.433710174717369, + "grad_norm": 0.2582002464494716, + "learning_rate": 2.4362303813698766e-05, + "loss": 0.3262, + "step": 592 + }, + { + "epoch": 2.437821171634121, + "grad_norm": 0.288633230170238, + "learning_rate": 2.4306184267937654e-05, + "loss": 0.3317, + "step": 593 + }, + { + "epoch": 2.4419321685508737, + "grad_norm": 0.2642364711177551, + "learning_rate": 2.425002914661758e-05, + "loss": 0.3325, + "step": 594 + }, + { + "epoch": 2.446043165467626, + "grad_norm": 0.28079190224987655, + "learning_rate": 2.419383891366431e-05, + "loss": 0.3186, + "step": 595 + }, + { + "epoch": 2.4501541623843783, + "grad_norm": 0.2517139036005103, + "learning_rate": 2.4137614033293676e-05, + "loss": 0.3325, + "step": 596 + }, + { + "epoch": 2.4542651593011304, + "grad_norm": 0.3048176393508488, + "learning_rate": 2.408135497000776e-05, + "loss": 0.3258, + "step": 597 + }, + { + "epoch": 2.458376156217883, + "grad_norm": 0.2553179117187841, + "learning_rate": 2.4025062188591046e-05, + "loss": 0.3286, + "step": 598 + }, + { + "epoch": 2.462487153134635, + "grad_norm": 0.297522330860201, + "learning_rate": 2.3968736154106574e-05, + "loss": 0.3257, + "step": 599 + }, + { + "epoch": 2.4665981500513876, + "grad_norm": 0.26227969960383657, + "learning_rate": 2.3912377331892112e-05, + "loss": 0.3348, + "step": 600 + }, + { + "epoch": 2.4707091469681397, + "grad_norm": 0.2678339048494993, + "learning_rate": 2.3855986187556295e-05, + "loss": 0.3247, + "step": 601 + }, + { + "epoch": 2.4748201438848922, + "grad_norm": 0.2782462750099432, + "learning_rate": 2.3799563186974802e-05, + "loss": 0.3288, + "step": 602 + }, + { + "epoch": 2.4789311408016443, + "grad_norm": 0.26942196233507953, + "learning_rate": 2.374310879628647e-05, + "loss": 0.3343, + "step": 603 + }, + { + "epoch": 2.483042137718397, + "grad_norm": 0.25192857481609987, + "learning_rate": 2.3686623481889496e-05, + "loss": 0.3355, + "step": 604 + }, + { + "epoch": 2.487153134635149, + "grad_norm": 0.27024432726841424, + "learning_rate": 2.3630107710437526e-05, + "loss": 0.3296, + "step": 605 + }, + { + "epoch": 2.4912641315519015, + "grad_norm": 0.2491507090752715, + "learning_rate": 2.3573561948835836e-05, + "loss": 0.3421, + "step": 606 + }, + { + "epoch": 2.4953751284686536, + "grad_norm": 0.3126612318343971, + "learning_rate": 2.3516986664237474e-05, + "loss": 0.3254, + "step": 607 + }, + { + "epoch": 2.499486125385406, + "grad_norm": 0.2643267374371664, + "learning_rate": 2.3460382324039377e-05, + "loss": 0.3272, + "step": 608 + }, + { + "epoch": 2.5035971223021583, + "grad_norm": 0.2786020179741824, + "learning_rate": 2.3403749395878542e-05, + "loss": 0.3292, + "step": 609 + }, + { + "epoch": 2.5077081192189103, + "grad_norm": 0.2873861028514028, + "learning_rate": 2.3347088347628128e-05, + "loss": 0.3307, + "step": 610 + }, + { + "epoch": 2.511819116135663, + "grad_norm": 0.2772071301023664, + "learning_rate": 2.3290399647393628e-05, + "loss": 0.324, + "step": 611 + }, + { + "epoch": 2.5159301130524154, + "grad_norm": 0.27537902676408144, + "learning_rate": 2.3233683763508957e-05, + "loss": 0.3343, + "step": 612 + }, + { + "epoch": 2.5200411099691675, + "grad_norm": 0.2859207355422494, + "learning_rate": 2.317694116453263e-05, + "loss": 0.34, + "step": 613 + }, + { + "epoch": 2.5241521068859196, + "grad_norm": 0.2743835098944321, + "learning_rate": 2.3120172319243864e-05, + "loss": 0.3338, + "step": 614 + }, + { + "epoch": 2.528263103802672, + "grad_norm": 0.27595779721707764, + "learning_rate": 2.3063377696638707e-05, + "loss": 0.3311, + "step": 615 + }, + { + "epoch": 2.5323741007194247, + "grad_norm": 0.26843945933414415, + "learning_rate": 2.300655776592616e-05, + "loss": 0.3335, + "step": 616 + }, + { + "epoch": 2.536485097636177, + "grad_norm": 0.25648610540979605, + "learning_rate": 2.294971299652432e-05, + "loss": 0.3235, + "step": 617 + }, + { + "epoch": 2.540596094552929, + "grad_norm": 0.3013681074148862, + "learning_rate": 2.2892843858056474e-05, + "loss": 0.3321, + "step": 618 + }, + { + "epoch": 2.5447070914696814, + "grad_norm": 0.24919313896376655, + "learning_rate": 2.283595082034725e-05, + "loss": 0.3167, + "step": 619 + }, + { + "epoch": 2.548818088386434, + "grad_norm": 0.2688625414735968, + "learning_rate": 2.2779034353418707e-05, + "loss": 0.3324, + "step": 620 + }, + { + "epoch": 2.552929085303186, + "grad_norm": 0.26263149016674175, + "learning_rate": 2.2722094927486472e-05, + "loss": 0.3286, + "step": 621 + }, + { + "epoch": 2.557040082219938, + "grad_norm": 0.2823135658125824, + "learning_rate": 2.2665133012955844e-05, + "loss": 0.3383, + "step": 622 + }, + { + "epoch": 2.5611510791366907, + "grad_norm": 0.276217133090313, + "learning_rate": 2.2608149080417913e-05, + "loss": 0.3289, + "step": 623 + }, + { + "epoch": 2.565262076053443, + "grad_norm": 0.2850565964189695, + "learning_rate": 2.2551143600645672e-05, + "loss": 0.3244, + "step": 624 + }, + { + "epoch": 2.5693730729701953, + "grad_norm": 0.26362750208519725, + "learning_rate": 2.249411704459013e-05, + "loss": 0.3361, + "step": 625 + }, + { + "epoch": 2.5734840698869474, + "grad_norm": 0.2798643173147311, + "learning_rate": 2.2437069883376404e-05, + "loss": 0.3142, + "step": 626 + }, + { + "epoch": 2.5775950668037, + "grad_norm": 0.2563288966482464, + "learning_rate": 2.238000258829986e-05, + "loss": 0.324, + "step": 627 + }, + { + "epoch": 2.581706063720452, + "grad_norm": 0.30508227611107025, + "learning_rate": 2.2322915630822184e-05, + "loss": 0.3226, + "step": 628 + }, + { + "epoch": 2.5858170606372046, + "grad_norm": 0.25091522041815256, + "learning_rate": 2.226580948256751e-05, + "loss": 0.3315, + "step": 629 + }, + { + "epoch": 2.5899280575539567, + "grad_norm": 0.32219820646356984, + "learning_rate": 2.2208684615318515e-05, + "loss": 0.3291, + "step": 630 + }, + { + "epoch": 2.5940390544707093, + "grad_norm": 0.2642592817496665, + "learning_rate": 2.2151541501012526e-05, + "loss": 0.3348, + "step": 631 + }, + { + "epoch": 2.5981500513874614, + "grad_norm": 0.2696044140880529, + "learning_rate": 2.2094380611737615e-05, + "loss": 0.336, + "step": 632 + }, + { + "epoch": 2.602261048304214, + "grad_norm": 0.2778886911936094, + "learning_rate": 2.20372024197287e-05, + "loss": 0.3221, + "step": 633 + }, + { + "epoch": 2.606372045220966, + "grad_norm": 0.26840999460258913, + "learning_rate": 2.1980007397363653e-05, + "loss": 0.3283, + "step": 634 + }, + { + "epoch": 2.6104830421377185, + "grad_norm": 0.30677147336816346, + "learning_rate": 2.1922796017159382e-05, + "loss": 0.3391, + "step": 635 + }, + { + "epoch": 2.6145940390544706, + "grad_norm": 0.26204192651719005, + "learning_rate": 2.186556875176794e-05, + "loss": 0.3181, + "step": 636 + }, + { + "epoch": 2.618705035971223, + "grad_norm": 0.29330464889106106, + "learning_rate": 2.1808326073972618e-05, + "loss": 0.3334, + "step": 637 + }, + { + "epoch": 2.6228160328879753, + "grad_norm": 0.3611585390826276, + "learning_rate": 2.1751068456684026e-05, + "loss": 0.3328, + "step": 638 + }, + { + "epoch": 2.626927029804728, + "grad_norm": 0.32081188768018193, + "learning_rate": 2.1693796372936207e-05, + "loss": 0.3348, + "step": 639 + }, + { + "epoch": 2.63103802672148, + "grad_norm": 0.3466957904122417, + "learning_rate": 2.1636510295882723e-05, + "loss": 0.3287, + "step": 640 + }, + { + "epoch": 2.635149023638232, + "grad_norm": 0.3325190809270464, + "learning_rate": 2.1579210698792724e-05, + "loss": 0.3357, + "step": 641 + }, + { + "epoch": 2.6392600205549845, + "grad_norm": 0.323613824705376, + "learning_rate": 2.1521898055047065e-05, + "loss": 0.3254, + "step": 642 + }, + { + "epoch": 2.643371017471737, + "grad_norm": 0.31700835111056935, + "learning_rate": 2.1464572838134393e-05, + "loss": 0.3405, + "step": 643 + }, + { + "epoch": 2.647482014388489, + "grad_norm": 0.31194064100135144, + "learning_rate": 2.1407235521647216e-05, + "loss": 0.3337, + "step": 644 + }, + { + "epoch": 2.6515930113052413, + "grad_norm": 0.291054868309333, + "learning_rate": 2.134988657927802e-05, + "loss": 0.3223, + "step": 645 + }, + { + "epoch": 2.655704008221994, + "grad_norm": 0.28960930247219024, + "learning_rate": 2.129252648481532e-05, + "loss": 0.3399, + "step": 646 + }, + { + "epoch": 2.6598150051387464, + "grad_norm": 0.262272292175284, + "learning_rate": 2.123515571213977e-05, + "loss": 0.3199, + "step": 647 + }, + { + "epoch": 2.6639260020554985, + "grad_norm": 0.3430422990168527, + "learning_rate": 2.1177774735220246e-05, + "loss": 0.3211, + "step": 648 + }, + { + "epoch": 2.6680369989722506, + "grad_norm": 0.24490577578554293, + "learning_rate": 2.1120384028109928e-05, + "loss": 0.3347, + "step": 649 + }, + { + "epoch": 2.672147995889003, + "grad_norm": 0.3135561697948168, + "learning_rate": 2.106298406494237e-05, + "loss": 0.337, + "step": 650 + }, + { + "epoch": 2.6762589928057556, + "grad_norm": 0.2536708220913538, + "learning_rate": 2.1005575319927606e-05, + "loss": 0.3286, + "step": 651 + }, + { + "epoch": 2.6803699897225077, + "grad_norm": 0.2905534330712754, + "learning_rate": 2.094815826734822e-05, + "loss": 0.3344, + "step": 652 + }, + { + "epoch": 2.68448098663926, + "grad_norm": 0.255577529722107, + "learning_rate": 2.089073338155542e-05, + "loss": 0.3347, + "step": 653 + }, + { + "epoch": 2.6885919835560124, + "grad_norm": 0.3169225043435795, + "learning_rate": 2.0833301136965138e-05, + "loss": 0.3368, + "step": 654 + }, + { + "epoch": 2.692702980472765, + "grad_norm": 0.24523301662966585, + "learning_rate": 2.0775862008054102e-05, + "loss": 0.3317, + "step": 655 + }, + { + "epoch": 2.696813977389517, + "grad_norm": 0.3377639592657221, + "learning_rate": 2.0718416469355917e-05, + "loss": 0.3327, + "step": 656 + }, + { + "epoch": 2.700924974306269, + "grad_norm": 0.2760670088077706, + "learning_rate": 2.066096499545712e-05, + "loss": 0.3254, + "step": 657 + }, + { + "epoch": 2.7050359712230216, + "grad_norm": 0.33508069878850794, + "learning_rate": 2.0603508060993306e-05, + "loss": 0.3324, + "step": 658 + }, + { + "epoch": 2.7091469681397737, + "grad_norm": 0.2888367467526053, + "learning_rate": 2.0546046140645178e-05, + "loss": 0.33, + "step": 659 + }, + { + "epoch": 2.7132579650565263, + "grad_norm": 0.2851449912230599, + "learning_rate": 2.0488579709134623e-05, + "loss": 0.3375, + "step": 660 + }, + { + "epoch": 2.7173689619732784, + "grad_norm": 0.28857625298935113, + "learning_rate": 2.04311092412208e-05, + "loss": 0.3324, + "step": 661 + }, + { + "epoch": 2.721479958890031, + "grad_norm": 0.3099583754195003, + "learning_rate": 2.0373635211696214e-05, + "loss": 0.331, + "step": 662 + }, + { + "epoch": 2.725590955806783, + "grad_norm": 0.3033491487781276, + "learning_rate": 2.0316158095382797e-05, + "loss": 0.3354, + "step": 663 + }, + { + "epoch": 2.7297019527235356, + "grad_norm": 0.2619645136131476, + "learning_rate": 2.0258678367127972e-05, + "loss": 0.3236, + "step": 664 + }, + { + "epoch": 2.7338129496402876, + "grad_norm": 0.30087165164939694, + "learning_rate": 2.0201196501800768e-05, + "loss": 0.3232, + "step": 665 + }, + { + "epoch": 2.73792394655704, + "grad_norm": 0.2738871408955049, + "learning_rate": 2.0143712974287838e-05, + "loss": 0.3277, + "step": 666 + }, + { + "epoch": 2.7420349434737923, + "grad_norm": 0.29720770878473823, + "learning_rate": 2.0086228259489578e-05, + "loss": 0.3419, + "step": 667 + }, + { + "epoch": 2.746145940390545, + "grad_norm": 0.24745189549975016, + "learning_rate": 2.0028742832316202e-05, + "loss": 0.3241, + "step": 668 + }, + { + "epoch": 2.750256937307297, + "grad_norm": 0.26535109416407787, + "learning_rate": 1.99712571676838e-05, + "loss": 0.3206, + "step": 669 + }, + { + "epoch": 2.7543679342240495, + "grad_norm": 0.28264530686991374, + "learning_rate": 1.9913771740510426e-05, + "loss": 0.3441, + "step": 670 + }, + { + "epoch": 2.7584789311408016, + "grad_norm": 0.2439511505193991, + "learning_rate": 1.9856287025712172e-05, + "loss": 0.3327, + "step": 671 + }, + { + "epoch": 2.762589928057554, + "grad_norm": 0.25559437564690174, + "learning_rate": 1.979880349819924e-05, + "loss": 0.3325, + "step": 672 + }, + { + "epoch": 2.766700924974306, + "grad_norm": 0.22699680175413017, + "learning_rate": 1.974132163287203e-05, + "loss": 0.329, + "step": 673 + }, + { + "epoch": 2.7708119218910587, + "grad_norm": 0.2582116792070818, + "learning_rate": 1.9683841904617217e-05, + "loss": 0.3319, + "step": 674 + }, + { + "epoch": 2.774922918807811, + "grad_norm": 0.24489677360477968, + "learning_rate": 1.9626364788303796e-05, + "loss": 0.3313, + "step": 675 + }, + { + "epoch": 2.779033915724563, + "grad_norm": 0.29973564906267575, + "learning_rate": 1.956889075877921e-05, + "loss": 0.3359, + "step": 676 + }, + { + "epoch": 2.7831449126413155, + "grad_norm": 0.24725710379682117, + "learning_rate": 1.9511420290865387e-05, + "loss": 0.3269, + "step": 677 + }, + { + "epoch": 2.787255909558068, + "grad_norm": 0.254114192213977, + "learning_rate": 1.945395385935483e-05, + "loss": 0.3315, + "step": 678 + }, + { + "epoch": 2.79136690647482, + "grad_norm": 0.2500901168277256, + "learning_rate": 1.9396491939006693e-05, + "loss": 0.317, + "step": 679 + }, + { + "epoch": 2.795477903391572, + "grad_norm": 0.2506907960447071, + "learning_rate": 1.9339035004542883e-05, + "loss": 0.3355, + "step": 680 + }, + { + "epoch": 2.7995889003083247, + "grad_norm": 0.254060820338398, + "learning_rate": 1.9281583530644087e-05, + "loss": 0.3274, + "step": 681 + }, + { + "epoch": 2.8036998972250773, + "grad_norm": 0.22470917674479732, + "learning_rate": 1.9224137991945898e-05, + "loss": 0.3161, + "step": 682 + }, + { + "epoch": 2.8078108941418294, + "grad_norm": 0.25617294354628883, + "learning_rate": 1.9166698863034865e-05, + "loss": 0.3326, + "step": 683 + }, + { + "epoch": 2.8119218910585815, + "grad_norm": 0.24637096854415516, + "learning_rate": 1.910926661844459e-05, + "loss": 0.3306, + "step": 684 + }, + { + "epoch": 2.816032887975334, + "grad_norm": 0.23065590461427085, + "learning_rate": 1.905184173265179e-05, + "loss": 0.3285, + "step": 685 + }, + { + "epoch": 2.8201438848920866, + "grad_norm": 0.25223738900179504, + "learning_rate": 1.89944246800724e-05, + "loss": 0.3315, + "step": 686 + }, + { + "epoch": 2.8242548818088387, + "grad_norm": 0.2813788401987118, + "learning_rate": 1.8937015935057637e-05, + "loss": 0.343, + "step": 687 + }, + { + "epoch": 2.8283658787255908, + "grad_norm": 0.23658155464390826, + "learning_rate": 1.887961597189008e-05, + "loss": 0.3361, + "step": 688 + }, + { + "epoch": 2.8324768756423433, + "grad_norm": 0.2560263043866784, + "learning_rate": 1.8822225264779757e-05, + "loss": 0.336, + "step": 689 + }, + { + "epoch": 2.836587872559096, + "grad_norm": 0.24171456841261904, + "learning_rate": 1.8764844287860235e-05, + "loss": 0.3155, + "step": 690 + }, + { + "epoch": 2.840698869475848, + "grad_norm": 0.2709130278349106, + "learning_rate": 1.8707473515184686e-05, + "loss": 0.3347, + "step": 691 + }, + { + "epoch": 2.8448098663926, + "grad_norm": 0.2389464904458257, + "learning_rate": 1.8650113420721985e-05, + "loss": 0.3261, + "step": 692 + }, + { + "epoch": 2.8489208633093526, + "grad_norm": 0.23853438478287736, + "learning_rate": 1.8592764478352788e-05, + "loss": 0.3269, + "step": 693 + }, + { + "epoch": 2.8530318602261047, + "grad_norm": 0.24002347978417551, + "learning_rate": 1.8535427161865617e-05, + "loss": 0.3273, + "step": 694 + }, + { + "epoch": 2.857142857142857, + "grad_norm": 0.24648736679259559, + "learning_rate": 1.8478101944952946e-05, + "loss": 0.3336, + "step": 695 + }, + { + "epoch": 2.8612538540596093, + "grad_norm": 0.2539305109029615, + "learning_rate": 1.842078930120729e-05, + "loss": 0.3285, + "step": 696 + }, + { + "epoch": 2.865364850976362, + "grad_norm": 0.25402455868598073, + "learning_rate": 1.836348970411729e-05, + "loss": 0.3415, + "step": 697 + }, + { + "epoch": 2.869475847893114, + "grad_norm": 0.2557721072489133, + "learning_rate": 1.8306203627063803e-05, + "loss": 0.3324, + "step": 698 + }, + { + "epoch": 2.8735868448098665, + "grad_norm": 0.24149579536024937, + "learning_rate": 1.8248931543315974e-05, + "loss": 0.3327, + "step": 699 + }, + { + "epoch": 2.8776978417266186, + "grad_norm": 0.2618118707433648, + "learning_rate": 1.8191673926027386e-05, + "loss": 0.336, + "step": 700 + }, + { + "epoch": 2.881808838643371, + "grad_norm": 0.22607821223026145, + "learning_rate": 1.813443124823206e-05, + "loss": 0.3366, + "step": 701 + }, + { + "epoch": 2.885919835560123, + "grad_norm": 0.2939881318908373, + "learning_rate": 1.807720398284062e-05, + "loss": 0.3279, + "step": 702 + }, + { + "epoch": 2.8900308324768758, + "grad_norm": 0.2260123918569214, + "learning_rate": 1.801999260263635e-05, + "loss": 0.3337, + "step": 703 + }, + { + "epoch": 2.894141829393628, + "grad_norm": 0.24275524474075025, + "learning_rate": 1.7962797580271303e-05, + "loss": 0.3244, + "step": 704 + }, + { + "epoch": 2.8982528263103804, + "grad_norm": 0.2665250980386111, + "learning_rate": 1.790561938826239e-05, + "loss": 0.3341, + "step": 705 + }, + { + "epoch": 2.9023638232271325, + "grad_norm": 0.2439607446787407, + "learning_rate": 1.784845849898748e-05, + "loss": 0.3177, + "step": 706 + }, + { + "epoch": 2.906474820143885, + "grad_norm": 0.22414006570173825, + "learning_rate": 1.7791315384681488e-05, + "loss": 0.3199, + "step": 707 + }, + { + "epoch": 2.910585817060637, + "grad_norm": 0.29428160100150474, + "learning_rate": 1.7734190517432498e-05, + "loss": 0.3276, + "step": 708 + }, + { + "epoch": 2.9146968139773897, + "grad_norm": 0.23652581393109087, + "learning_rate": 1.7677084369177823e-05, + "loss": 0.3252, + "step": 709 + }, + { + "epoch": 2.9188078108941418, + "grad_norm": 0.2638103273242896, + "learning_rate": 1.7619997411700146e-05, + "loss": 0.3269, + "step": 710 + }, + { + "epoch": 2.9229188078108943, + "grad_norm": 0.23905327112200858, + "learning_rate": 1.7562930116623602e-05, + "loss": 0.3221, + "step": 711 + }, + { + "epoch": 2.9270298047276464, + "grad_norm": 0.24681045434883284, + "learning_rate": 1.750588295540988e-05, + "loss": 0.3265, + "step": 712 + }, + { + "epoch": 2.931140801644399, + "grad_norm": 0.22345235272171315, + "learning_rate": 1.7448856399354335e-05, + "loss": 0.331, + "step": 713 + }, + { + "epoch": 2.935251798561151, + "grad_norm": 0.2620096260034816, + "learning_rate": 1.7391850919582097e-05, + "loss": 0.3133, + "step": 714 + }, + { + "epoch": 2.939362795477903, + "grad_norm": 0.24255411379013975, + "learning_rate": 1.733486698704417e-05, + "loss": 0.3345, + "step": 715 + }, + { + "epoch": 2.9434737923946557, + "grad_norm": 0.2677764953102307, + "learning_rate": 1.7277905072513538e-05, + "loss": 0.3125, + "step": 716 + }, + { + "epoch": 2.947584789311408, + "grad_norm": 0.23338472896647094, + "learning_rate": 1.7220965646581304e-05, + "loss": 0.3329, + "step": 717 + }, + { + "epoch": 2.9516957862281603, + "grad_norm": 0.256626369930556, + "learning_rate": 1.7164049179652762e-05, + "loss": 0.342, + "step": 718 + }, + { + "epoch": 2.9558067831449124, + "grad_norm": 0.24204486389961907, + "learning_rate": 1.7107156141943536e-05, + "loss": 0.3317, + "step": 719 + }, + { + "epoch": 2.959917780061665, + "grad_norm": 0.23154113899503126, + "learning_rate": 1.7050287003475684e-05, + "loss": 0.338, + "step": 720 + }, + { + "epoch": 2.9640287769784175, + "grad_norm": 0.2515108652826035, + "learning_rate": 1.699344223407384e-05, + "loss": 0.3221, + "step": 721 + }, + { + "epoch": 2.9681397738951696, + "grad_norm": 0.2508115805000718, + "learning_rate": 1.6936622303361292e-05, + "loss": 0.3272, + "step": 722 + }, + { + "epoch": 2.9722507708119217, + "grad_norm": 0.24166143586067093, + "learning_rate": 1.6879827680756132e-05, + "loss": 0.3283, + "step": 723 + }, + { + "epoch": 2.9763617677286742, + "grad_norm": 0.25924430328001846, + "learning_rate": 1.682305883546737e-05, + "loss": 0.3297, + "step": 724 + }, + { + "epoch": 2.9804727646454268, + "grad_norm": 0.26555305739163787, + "learning_rate": 1.6766316236491046e-05, + "loss": 0.3314, + "step": 725 + }, + { + "epoch": 2.984583761562179, + "grad_norm": 0.24257798147799245, + "learning_rate": 1.6709600352606382e-05, + "loss": 0.3238, + "step": 726 + }, + { + "epoch": 2.988694758478931, + "grad_norm": 0.2772710049751061, + "learning_rate": 1.665291165237188e-05, + "loss": 0.3274, + "step": 727 + }, + { + "epoch": 2.9928057553956835, + "grad_norm": 0.22542030051008188, + "learning_rate": 1.6596250604121468e-05, + "loss": 0.3328, + "step": 728 + }, + { + "epoch": 2.996916752312436, + "grad_norm": 0.27500500626350044, + "learning_rate": 1.653961767596063e-05, + "loss": 0.3335, + "step": 729 + }, + { + "epoch": 3.001027749229188, + "grad_norm": 0.270309023452525, + "learning_rate": 1.6483013335762536e-05, + "loss": 0.2946, + "step": 730 + }, + { + "epoch": 3.0051387461459402, + "grad_norm": 0.31524306413172215, + "learning_rate": 1.6426438051164168e-05, + "loss": 0.2781, + "step": 731 + }, + { + "epoch": 3.0092497430626928, + "grad_norm": 0.5781264673297727, + "learning_rate": 1.636989228956248e-05, + "loss": 0.2843, + "step": 732 + }, + { + "epoch": 3.013360739979445, + "grad_norm": 0.3024860387001426, + "learning_rate": 1.631337651811051e-05, + "loss": 0.2747, + "step": 733 + }, + { + "epoch": 3.0174717368961974, + "grad_norm": 0.3934556205957313, + "learning_rate": 1.6256891203713533e-05, + "loss": 0.2728, + "step": 734 + }, + { + "epoch": 3.0215827338129495, + "grad_norm": 0.3129040186390879, + "learning_rate": 1.6200436813025208e-05, + "loss": 0.2736, + "step": 735 + }, + { + "epoch": 3.025693730729702, + "grad_norm": 0.35042448311035457, + "learning_rate": 1.6144013812443712e-05, + "loss": 0.288, + "step": 736 + }, + { + "epoch": 3.029804727646454, + "grad_norm": 0.32431756468327383, + "learning_rate": 1.60876226681079e-05, + "loss": 0.2675, + "step": 737 + }, + { + "epoch": 3.0339157245632067, + "grad_norm": 0.3106864020345642, + "learning_rate": 1.6031263845893436e-05, + "loss": 0.2696, + "step": 738 + }, + { + "epoch": 3.038026721479959, + "grad_norm": 0.2918561704720298, + "learning_rate": 1.5974937811408964e-05, + "loss": 0.2806, + "step": 739 + }, + { + "epoch": 3.0421377183967113, + "grad_norm": 0.2824200303583143, + "learning_rate": 1.5918645029992237e-05, + "loss": 0.2669, + "step": 740 + }, + { + "epoch": 3.0462487153134634, + "grad_norm": 0.2751071529215866, + "learning_rate": 1.5862385966706324e-05, + "loss": 0.2827, + "step": 741 + }, + { + "epoch": 3.050359712230216, + "grad_norm": 0.2659741638733503, + "learning_rate": 1.580616108633569e-05, + "loss": 0.2772, + "step": 742 + }, + { + "epoch": 3.054470709146968, + "grad_norm": 0.2705999112071291, + "learning_rate": 1.5749970853382416e-05, + "loss": 0.2813, + "step": 743 + }, + { + "epoch": 3.0585817060637206, + "grad_norm": 0.2678804686149958, + "learning_rate": 1.5693815732062346e-05, + "loss": 0.2786, + "step": 744 + }, + { + "epoch": 3.0626927029804727, + "grad_norm": 0.28891827034365974, + "learning_rate": 1.563769618630124e-05, + "loss": 0.2781, + "step": 745 + }, + { + "epoch": 3.0668036998972252, + "grad_norm": 0.2608278243848426, + "learning_rate": 1.558161267973096e-05, + "loss": 0.2811, + "step": 746 + }, + { + "epoch": 3.0709146968139773, + "grad_norm": 0.28779231459872495, + "learning_rate": 1.552556567568562e-05, + "loss": 0.2731, + "step": 747 + }, + { + "epoch": 3.07502569373073, + "grad_norm": 0.27172400888062603, + "learning_rate": 1.5469555637197775e-05, + "loss": 0.273, + "step": 748 + }, + { + "epoch": 3.079136690647482, + "grad_norm": 0.284714544394066, + "learning_rate": 1.541358302699459e-05, + "loss": 0.2737, + "step": 749 + }, + { + "epoch": 3.0832476875642345, + "grad_norm": 0.27108204101689876, + "learning_rate": 1.535764830749401e-05, + "loss": 0.2719, + "step": 750 + }, + { + "epoch": 3.0873586844809866, + "grad_norm": 0.26218248343837663, + "learning_rate": 1.5301751940800947e-05, + "loss": 0.2702, + "step": 751 + }, + { + "epoch": 3.091469681397739, + "grad_norm": 0.27396785993274086, + "learning_rate": 1.5245894388703473e-05, + "loss": 0.2746, + "step": 752 + }, + { + "epoch": 3.0955806783144912, + "grad_norm": 0.26457645017367387, + "learning_rate": 1.5190076112668975e-05, + "loss": 0.2741, + "step": 753 + }, + { + "epoch": 3.099691675231244, + "grad_norm": 0.2669417946440861, + "learning_rate": 1.5134297573840373e-05, + "loss": 0.2609, + "step": 754 + }, + { + "epoch": 3.103802672147996, + "grad_norm": 0.24350309961263825, + "learning_rate": 1.507855923303229e-05, + "loss": 0.2683, + "step": 755 + }, + { + "epoch": 3.1079136690647484, + "grad_norm": 0.24218499055629, + "learning_rate": 1.5022861550727261e-05, + "loss": 0.2753, + "step": 756 + }, + { + "epoch": 3.1120246659815005, + "grad_norm": 0.23884760385788692, + "learning_rate": 1.4967204987071916e-05, + "loss": 0.2674, + "step": 757 + }, + { + "epoch": 3.1161356628982526, + "grad_norm": 0.24427494625864407, + "learning_rate": 1.491159000187318e-05, + "loss": 0.2766, + "step": 758 + }, + { + "epoch": 3.120246659815005, + "grad_norm": 0.22462887698775066, + "learning_rate": 1.4856017054594487e-05, + "loss": 0.2817, + "step": 759 + }, + { + "epoch": 3.1243576567317572, + "grad_norm": 0.22935229598945833, + "learning_rate": 1.4800486604351953e-05, + "loss": 0.2692, + "step": 760 + }, + { + "epoch": 3.12846865364851, + "grad_norm": 0.2283641464528615, + "learning_rate": 1.4744999109910642e-05, + "loss": 0.2881, + "step": 761 + }, + { + "epoch": 3.132579650565262, + "grad_norm": 0.2402319884401938, + "learning_rate": 1.4689555029680706e-05, + "loss": 0.2811, + "step": 762 + }, + { + "epoch": 3.1366906474820144, + "grad_norm": 0.24689907618158027, + "learning_rate": 1.4634154821713642e-05, + "loss": 0.2748, + "step": 763 + }, + { + "epoch": 3.1408016443987665, + "grad_norm": 0.21908840749036268, + "learning_rate": 1.4578798943698495e-05, + "loss": 0.2775, + "step": 764 + }, + { + "epoch": 3.144912641315519, + "grad_norm": 0.2706839692520901, + "learning_rate": 1.4523487852958078e-05, + "loss": 0.274, + "step": 765 + }, + { + "epoch": 3.149023638232271, + "grad_norm": 0.21829989516446477, + "learning_rate": 1.4468222006445194e-05, + "loss": 0.2846, + "step": 766 + }, + { + "epoch": 3.1531346351490237, + "grad_norm": 0.254462615428386, + "learning_rate": 1.4413001860738857e-05, + "loss": 0.2751, + "step": 767 + }, + { + "epoch": 3.157245632065776, + "grad_norm": 0.22996784925457855, + "learning_rate": 1.4357827872040533e-05, + "loss": 0.2763, + "step": 768 + }, + { + "epoch": 3.1613566289825283, + "grad_norm": 0.24916315195392996, + "learning_rate": 1.4302700496170348e-05, + "loss": 0.273, + "step": 769 + }, + { + "epoch": 3.1654676258992804, + "grad_norm": 0.2394151755505642, + "learning_rate": 1.424762018856335e-05, + "loss": 0.2733, + "step": 770 + }, + { + "epoch": 3.169578622816033, + "grad_norm": 0.24554198081740938, + "learning_rate": 1.4192587404265723e-05, + "loss": 0.2739, + "step": 771 + }, + { + "epoch": 3.173689619732785, + "grad_norm": 0.23659011712793626, + "learning_rate": 1.4137602597931039e-05, + "loss": 0.2819, + "step": 772 + }, + { + "epoch": 3.1778006166495376, + "grad_norm": 0.22564337444058383, + "learning_rate": 1.4082666223816503e-05, + "loss": 0.2792, + "step": 773 + }, + { + "epoch": 3.1819116135662897, + "grad_norm": 0.2317146590014487, + "learning_rate": 1.4027778735779194e-05, + "loss": 0.2777, + "step": 774 + }, + { + "epoch": 3.1860226104830422, + "grad_norm": 0.2150695765539657, + "learning_rate": 1.397294058727232e-05, + "loss": 0.2765, + "step": 775 + }, + { + "epoch": 3.1901336073997943, + "grad_norm": 0.23401579346325868, + "learning_rate": 1.3918152231341466e-05, + "loss": 0.2859, + "step": 776 + }, + { + "epoch": 3.194244604316547, + "grad_norm": 0.22190869981693315, + "learning_rate": 1.3863414120620866e-05, + "loss": 0.2739, + "step": 777 + }, + { + "epoch": 3.198355601233299, + "grad_norm": 0.24505629782483931, + "learning_rate": 1.3808726707329636e-05, + "loss": 0.2854, + "step": 778 + }, + { + "epoch": 3.2024665981500515, + "grad_norm": 0.23527894624102066, + "learning_rate": 1.3754090443268073e-05, + "loss": 0.2739, + "step": 779 + }, + { + "epoch": 3.2065775950668036, + "grad_norm": 0.250451185169838, + "learning_rate": 1.3699505779813885e-05, + "loss": 0.2779, + "step": 780 + }, + { + "epoch": 3.210688591983556, + "grad_norm": 0.24199976474098944, + "learning_rate": 1.3644973167918509e-05, + "loss": 0.2819, + "step": 781 + }, + { + "epoch": 3.2147995889003083, + "grad_norm": 0.2295279753606739, + "learning_rate": 1.3590493058103334e-05, + "loss": 0.2912, + "step": 782 + }, + { + "epoch": 3.218910585817061, + "grad_norm": 0.24479637435880175, + "learning_rate": 1.353606590045601e-05, + "loss": 0.2625, + "step": 783 + }, + { + "epoch": 3.223021582733813, + "grad_norm": 0.22366855597040158, + "learning_rate": 1.3481692144626723e-05, + "loss": 0.2716, + "step": 784 + }, + { + "epoch": 3.2271325796505654, + "grad_norm": 0.23386804780243653, + "learning_rate": 1.3427372239824478e-05, + "loss": 0.2833, + "step": 785 + }, + { + "epoch": 3.2312435765673175, + "grad_norm": 0.21138476617701588, + "learning_rate": 1.3373106634813395e-05, + "loss": 0.2815, + "step": 786 + }, + { + "epoch": 3.23535457348407, + "grad_norm": 0.21868921231797736, + "learning_rate": 1.3318895777908989e-05, + "loss": 0.2737, + "step": 787 + }, + { + "epoch": 3.239465570400822, + "grad_norm": 0.22091301457511603, + "learning_rate": 1.3264740116974477e-05, + "loss": 0.2784, + "step": 788 + }, + { + "epoch": 3.2435765673175747, + "grad_norm": 0.21177976498652176, + "learning_rate": 1.3210640099417071e-05, + "loss": 0.2677, + "step": 789 + }, + { + "epoch": 3.247687564234327, + "grad_norm": 0.22075747796505304, + "learning_rate": 1.3156596172184291e-05, + "loss": 0.2843, + "step": 790 + }, + { + "epoch": 3.2517985611510793, + "grad_norm": 0.2093957071643158, + "learning_rate": 1.3102608781760262e-05, + "loss": 0.2783, + "step": 791 + }, + { + "epoch": 3.2559095580678314, + "grad_norm": 0.2502487297475507, + "learning_rate": 1.3048678374162033e-05, + "loss": 0.2764, + "step": 792 + }, + { + "epoch": 3.2600205549845835, + "grad_norm": 0.21828424241121014, + "learning_rate": 1.2994805394935883e-05, + "loss": 0.2783, + "step": 793 + }, + { + "epoch": 3.264131551901336, + "grad_norm": 0.2225059796962467, + "learning_rate": 1.2940990289153654e-05, + "loss": 0.2818, + "step": 794 + }, + { + "epoch": 3.2682425488180886, + "grad_norm": 0.2384933128418085, + "learning_rate": 1.2887233501409062e-05, + "loss": 0.2715, + "step": 795 + }, + { + "epoch": 3.2723535457348407, + "grad_norm": 0.23881527823885554, + "learning_rate": 1.283353547581403e-05, + "loss": 0.2815, + "step": 796 + }, + { + "epoch": 3.276464542651593, + "grad_norm": 0.21990598524463273, + "learning_rate": 1.2779896655995012e-05, + "loss": 0.2649, + "step": 797 + }, + { + "epoch": 3.2805755395683454, + "grad_norm": 0.22014058980246703, + "learning_rate": 1.2726317485089345e-05, + "loss": 0.2857, + "step": 798 + }, + { + "epoch": 3.2846865364850975, + "grad_norm": 0.2128741761301097, + "learning_rate": 1.2672798405741565e-05, + "loss": 0.2744, + "step": 799 + }, + { + "epoch": 3.28879753340185, + "grad_norm": 0.2093902141995586, + "learning_rate": 1.261933986009976e-05, + "loss": 0.2714, + "step": 800 + }, + { + "epoch": 3.292908530318602, + "grad_norm": 0.2317393277270657, + "learning_rate": 1.2565942289811926e-05, + "loss": 0.2821, + "step": 801 + }, + { + "epoch": 3.2970195272353546, + "grad_norm": 0.2124614991624517, + "learning_rate": 1.2512606136022316e-05, + "loss": 0.2684, + "step": 802 + }, + { + "epoch": 3.3011305241521067, + "grad_norm": 0.2237964393839327, + "learning_rate": 1.245933183936778e-05, + "loss": 0.28, + "step": 803 + }, + { + "epoch": 3.3052415210688593, + "grad_norm": 0.20179137845865386, + "learning_rate": 1.2406119839974137e-05, + "loss": 0.2791, + "step": 804 + }, + { + "epoch": 3.3093525179856114, + "grad_norm": 0.21344599872935055, + "learning_rate": 1.2352970577452536e-05, + "loss": 0.282, + "step": 805 + }, + { + "epoch": 3.313463514902364, + "grad_norm": 0.21405309956045562, + "learning_rate": 1.2299884490895829e-05, + "loss": 0.2705, + "step": 806 + }, + { + "epoch": 3.317574511819116, + "grad_norm": 0.20836540998453448, + "learning_rate": 1.2246862018874937e-05, + "loss": 0.2675, + "step": 807 + }, + { + "epoch": 3.3216855087358685, + "grad_norm": 0.21917814502090704, + "learning_rate": 1.2193903599435229e-05, + "loss": 0.2867, + "step": 808 + }, + { + "epoch": 3.3257965056526206, + "grad_norm": 0.21478503443145303, + "learning_rate": 1.2141009670092905e-05, + "loss": 0.263, + "step": 809 + }, + { + "epoch": 3.329907502569373, + "grad_norm": 0.24017325608140172, + "learning_rate": 1.2088180667831378e-05, + "loss": 0.285, + "step": 810 + }, + { + "epoch": 3.3340184994861253, + "grad_norm": 0.21263315635103802, + "learning_rate": 1.2035417029097669e-05, + "loss": 0.2794, + "step": 811 + }, + { + "epoch": 3.338129496402878, + "grad_norm": 0.2208436673519513, + "learning_rate": 1.198271918979879e-05, + "loss": 0.2661, + "step": 812 + }, + { + "epoch": 3.34224049331963, + "grad_norm": 0.21410801362761014, + "learning_rate": 1.1930087585298163e-05, + "loss": 0.2691, + "step": 813 + }, + { + "epoch": 3.3463514902363825, + "grad_norm": 0.2189540505149734, + "learning_rate": 1.1877522650412002e-05, + "loss": 0.2777, + "step": 814 + }, + { + "epoch": 3.3504624871531345, + "grad_norm": 0.2235412920660751, + "learning_rate": 1.1825024819405728e-05, + "loss": 0.2829, + "step": 815 + }, + { + "epoch": 3.354573484069887, + "grad_norm": 0.22891833469755685, + "learning_rate": 1.177259452599039e-05, + "loss": 0.2883, + "step": 816 + }, + { + "epoch": 3.358684480986639, + "grad_norm": 0.20951091444066108, + "learning_rate": 1.1720232203319072e-05, + "loss": 0.2703, + "step": 817 + }, + { + "epoch": 3.3627954779033917, + "grad_norm": 0.2291000642315933, + "learning_rate": 1.1667938283983318e-05, + "loss": 0.2818, + "step": 818 + }, + { + "epoch": 3.366906474820144, + "grad_norm": 0.24820675241373585, + "learning_rate": 1.1615713200009555e-05, + "loss": 0.2894, + "step": 819 + }, + { + "epoch": 3.3710174717368964, + "grad_norm": 0.2112186174561992, + "learning_rate": 1.1563557382855527e-05, + "loss": 0.2765, + "step": 820 + }, + { + "epoch": 3.3751284686536485, + "grad_norm": 0.23516315367694957, + "learning_rate": 1.1511471263406727e-05, + "loss": 0.2783, + "step": 821 + }, + { + "epoch": 3.379239465570401, + "grad_norm": 0.20429288664608256, + "learning_rate": 1.1459455271972855e-05, + "loss": 0.2826, + "step": 822 + }, + { + "epoch": 3.383350462487153, + "grad_norm": 0.22120456160119745, + "learning_rate": 1.1407509838284234e-05, + "loss": 0.2702, + "step": 823 + }, + { + "epoch": 3.3874614594039056, + "grad_norm": 0.22196158784290934, + "learning_rate": 1.1355635391488273e-05, + "loss": 0.2816, + "step": 824 + }, + { + "epoch": 3.3915724563206577, + "grad_norm": 0.23198563181248005, + "learning_rate": 1.130383236014593e-05, + "loss": 0.2807, + "step": 825 + }, + { + "epoch": 3.3956834532374103, + "grad_norm": 0.21328959797566183, + "learning_rate": 1.1252101172228161e-05, + "loss": 0.2812, + "step": 826 + }, + { + "epoch": 3.3997944501541624, + "grad_norm": 0.20829955768200162, + "learning_rate": 1.1200442255112382e-05, + "loss": 0.2781, + "step": 827 + }, + { + "epoch": 3.4039054470709145, + "grad_norm": 0.210021918847506, + "learning_rate": 1.1148856035578954e-05, + "loss": 0.2793, + "step": 828 + }, + { + "epoch": 3.408016443987667, + "grad_norm": 0.21953053255099053, + "learning_rate": 1.1097342939807639e-05, + "loss": 0.2826, + "step": 829 + }, + { + "epoch": 3.4121274409044196, + "grad_norm": 0.21197251533168365, + "learning_rate": 1.1045903393374088e-05, + "loss": 0.2678, + "step": 830 + }, + { + "epoch": 3.4162384378211716, + "grad_norm": 0.22402861818250405, + "learning_rate": 1.0994537821246322e-05, + "loss": 0.2768, + "step": 831 + }, + { + "epoch": 3.4203494347379237, + "grad_norm": 0.20866347607213415, + "learning_rate": 1.0943246647781231e-05, + "loss": 0.2822, + "step": 832 + }, + { + "epoch": 3.4244604316546763, + "grad_norm": 0.20588546575745492, + "learning_rate": 1.0892030296721053e-05, + "loss": 0.274, + "step": 833 + }, + { + "epoch": 3.4285714285714284, + "grad_norm": 0.21933896518445742, + "learning_rate": 1.0840889191189881e-05, + "loss": 0.2815, + "step": 834 + }, + { + "epoch": 3.432682425488181, + "grad_norm": 0.21986260521948456, + "learning_rate": 1.0789823753690165e-05, + "loss": 0.265, + "step": 835 + }, + { + "epoch": 3.436793422404933, + "grad_norm": 0.2472526595417136, + "learning_rate": 1.073883440609923e-05, + "loss": 0.2819, + "step": 836 + }, + { + "epoch": 3.4409044193216856, + "grad_norm": 0.23044008878105163, + "learning_rate": 1.0687921569665778e-05, + "loss": 0.2743, + "step": 837 + }, + { + "epoch": 3.4450154162384377, + "grad_norm": 0.2127401189830073, + "learning_rate": 1.0637085665006416e-05, + "loss": 0.2757, + "step": 838 + }, + { + "epoch": 3.44912641315519, + "grad_norm": 0.23011524871297998, + "learning_rate": 1.058632711210218e-05, + "loss": 0.2867, + "step": 839 + }, + { + "epoch": 3.4532374100719423, + "grad_norm": 0.2143448487687264, + "learning_rate": 1.0535646330295064e-05, + "loss": 0.2775, + "step": 840 + }, + { + "epoch": 3.457348406988695, + "grad_norm": 0.2157327739595805, + "learning_rate": 1.0485043738284543e-05, + "loss": 0.2772, + "step": 841 + }, + { + "epoch": 3.461459403905447, + "grad_norm": 0.21901388123050422, + "learning_rate": 1.0434519754124155e-05, + "loss": 0.2883, + "step": 842 + }, + { + "epoch": 3.4655704008221995, + "grad_norm": 0.20706260357694797, + "learning_rate": 1.0384074795217995e-05, + "loss": 0.2729, + "step": 843 + }, + { + "epoch": 3.4696813977389516, + "grad_norm": 0.20562057118619545, + "learning_rate": 1.0333709278317295e-05, + "loss": 0.2794, + "step": 844 + }, + { + "epoch": 3.473792394655704, + "grad_norm": 0.2053621886084836, + "learning_rate": 1.0283423619516984e-05, + "loss": 0.2831, + "step": 845 + }, + { + "epoch": 3.477903391572456, + "grad_norm": 0.21559967106224392, + "learning_rate": 1.0233218234252233e-05, + "loss": 0.2798, + "step": 846 + }, + { + "epoch": 3.4820143884892087, + "grad_norm": 0.20693716417643127, + "learning_rate": 1.0183093537295038e-05, + "loss": 0.2834, + "step": 847 + }, + { + "epoch": 3.486125385405961, + "grad_norm": 0.196045141198551, + "learning_rate": 1.0133049942750794e-05, + "loss": 0.2815, + "step": 848 + }, + { + "epoch": 3.4902363823227134, + "grad_norm": 0.22184037078133786, + "learning_rate": 1.0083087864054862e-05, + "loss": 0.2782, + "step": 849 + }, + { + "epoch": 3.4943473792394655, + "grad_norm": 0.21680925373572774, + "learning_rate": 1.0033207713969152e-05, + "loss": 0.2668, + "step": 850 + }, + { + "epoch": 3.498458376156218, + "grad_norm": 0.20929159215700033, + "learning_rate": 9.983409904578732e-06, + "loss": 0.2771, + "step": 851 + }, + { + "epoch": 3.50256937307297, + "grad_norm": 0.20085747960075442, + "learning_rate": 9.93369484728841e-06, + "loss": 0.2769, + "step": 852 + }, + { + "epoch": 3.5066803699897227, + "grad_norm": 0.21740453796251422, + "learning_rate": 9.884062952819336e-06, + "loss": 0.2809, + "step": 853 + }, + { + "epoch": 3.5107913669064748, + "grad_norm": 0.19826634602771384, + "learning_rate": 9.834514631205607e-06, + "loss": 0.2826, + "step": 854 + }, + { + "epoch": 3.5149023638232273, + "grad_norm": 0.19929040918628962, + "learning_rate": 9.785050291790886e-06, + "loss": 0.27, + "step": 855 + }, + { + "epoch": 3.5190133607399794, + "grad_norm": 0.20544687803262818, + "learning_rate": 9.735670343225015e-06, + "loss": 0.2759, + "step": 856 + }, + { + "epoch": 3.523124357656732, + "grad_norm": 0.20100075338402584, + "learning_rate": 9.68637519346064e-06, + "loss": 0.2842, + "step": 857 + }, + { + "epoch": 3.527235354573484, + "grad_norm": 0.19998157251828666, + "learning_rate": 9.637165249749847e-06, + "loss": 0.2677, + "step": 858 + }, + { + "epoch": 3.531346351490236, + "grad_norm": 0.20946212814759255, + "learning_rate": 9.588040918640784e-06, + "loss": 0.2819, + "step": 859 + }, + { + "epoch": 3.5354573484069887, + "grad_norm": 0.19305869769870324, + "learning_rate": 9.539002605974315e-06, + "loss": 0.2762, + "step": 860 + }, + { + "epoch": 3.539568345323741, + "grad_norm": 0.22246584009743214, + "learning_rate": 9.490050716880652e-06, + "loss": 0.2761, + "step": 861 + }, + { + "epoch": 3.5436793422404933, + "grad_norm": 0.2106791686837925, + "learning_rate": 9.441185655776044e-06, + "loss": 0.2836, + "step": 862 + }, + { + "epoch": 3.5477903391572454, + "grad_norm": 0.20735417375234855, + "learning_rate": 9.392407826359386e-06, + "loss": 0.2797, + "step": 863 + }, + { + "epoch": 3.551901336073998, + "grad_norm": 0.22319834142117814, + "learning_rate": 9.343717631608913e-06, + "loss": 0.2805, + "step": 864 + }, + { + "epoch": 3.5560123329907505, + "grad_norm": 0.21387661139677305, + "learning_rate": 9.295115473778871e-06, + "loss": 0.2737, + "step": 865 + }, + { + "epoch": 3.5601233299075026, + "grad_norm": 0.19614268534753534, + "learning_rate": 9.246601754396184e-06, + "loss": 0.2775, + "step": 866 + }, + { + "epoch": 3.5642343268242547, + "grad_norm": 0.21426258046660832, + "learning_rate": 9.198176874257147e-06, + "loss": 0.2801, + "step": 867 + }, + { + "epoch": 3.568345323741007, + "grad_norm": 0.20833350511079968, + "learning_rate": 9.149841233424102e-06, + "loss": 0.2903, + "step": 868 + }, + { + "epoch": 3.5724563206577598, + "grad_norm": 0.2054636105867438, + "learning_rate": 9.101595231222142e-06, + "loss": 0.2714, + "step": 869 + }, + { + "epoch": 3.576567317574512, + "grad_norm": 0.21677433378750463, + "learning_rate": 9.053439266235817e-06, + "loss": 0.2747, + "step": 870 + }, + { + "epoch": 3.580678314491264, + "grad_norm": 0.21258620908795176, + "learning_rate": 9.005373736305827e-06, + "loss": 0.2866, + "step": 871 + }, + { + "epoch": 3.5847893114080165, + "grad_norm": 0.21696485235415786, + "learning_rate": 8.957399038525742e-06, + "loss": 0.2768, + "step": 872 + }, + { + "epoch": 3.588900308324769, + "grad_norm": 0.2048645071808934, + "learning_rate": 8.909515569238727e-06, + "loss": 0.2805, + "step": 873 + }, + { + "epoch": 3.593011305241521, + "grad_norm": 0.2041872125610518, + "learning_rate": 8.861723724034256e-06, + "loss": 0.281, + "step": 874 + }, + { + "epoch": 3.597122302158273, + "grad_norm": 0.22105486900940344, + "learning_rate": 8.814023897744861e-06, + "loss": 0.2722, + "step": 875 + }, + { + "epoch": 3.6012332990750258, + "grad_norm": 0.20870597023983126, + "learning_rate": 8.766416484442845e-06, + "loss": 0.288, + "step": 876 + }, + { + "epoch": 3.605344295991778, + "grad_norm": 0.22305369395665908, + "learning_rate": 8.71890187743705e-06, + "loss": 0.2833, + "step": 877 + }, + { + "epoch": 3.6094552929085304, + "grad_norm": 0.20984704638631244, + "learning_rate": 8.6714804692696e-06, + "loss": 0.2815, + "step": 878 + }, + { + "epoch": 3.6135662898252825, + "grad_norm": 0.205661449222605, + "learning_rate": 8.624152651712647e-06, + "loss": 0.2796, + "step": 879 + }, + { + "epoch": 3.617677286742035, + "grad_norm": 0.21670437077691945, + "learning_rate": 8.576918815765155e-06, + "loss": 0.276, + "step": 880 + }, + { + "epoch": 3.621788283658787, + "grad_norm": 0.21657223103082457, + "learning_rate": 8.52977935164965e-06, + "loss": 0.2793, + "step": 881 + }, + { + "epoch": 3.6258992805755397, + "grad_norm": 0.19481374974543536, + "learning_rate": 8.482734648808998e-06, + "loss": 0.2828, + "step": 882 + }, + { + "epoch": 3.6300102774922918, + "grad_norm": 0.22945439732292053, + "learning_rate": 8.435785095903226e-06, + "loss": 0.2767, + "step": 883 + }, + { + "epoch": 3.6341212744090443, + "grad_norm": 0.21026587275904124, + "learning_rate": 8.388931080806244e-06, + "loss": 0.277, + "step": 884 + }, + { + "epoch": 3.6382322713257964, + "grad_norm": 0.1905115883548281, + "learning_rate": 8.342172990602692e-06, + "loss": 0.2743, + "step": 885 + }, + { + "epoch": 3.642343268242549, + "grad_norm": 0.21168502442048126, + "learning_rate": 8.295511211584726e-06, + "loss": 0.2684, + "step": 886 + }, + { + "epoch": 3.646454265159301, + "grad_norm": 0.21498006850055293, + "learning_rate": 8.248946129248821e-06, + "loss": 0.2762, + "step": 887 + }, + { + "epoch": 3.6505652620760536, + "grad_norm": 0.19489762757982362, + "learning_rate": 8.202478128292594e-06, + "loss": 0.279, + "step": 888 + }, + { + "epoch": 3.6546762589928057, + "grad_norm": 0.21734478601458554, + "learning_rate": 8.15610759261163e-06, + "loss": 0.2743, + "step": 889 + }, + { + "epoch": 3.6587872559095582, + "grad_norm": 0.2067502254964237, + "learning_rate": 8.109834905296296e-06, + "loss": 0.2687, + "step": 890 + }, + { + "epoch": 3.6628982528263103, + "grad_norm": 0.20837277018256964, + "learning_rate": 8.06366044862859e-06, + "loss": 0.2776, + "step": 891 + }, + { + "epoch": 3.667009249743063, + "grad_norm": 0.2024301743577271, + "learning_rate": 8.017584604078974e-06, + "loss": 0.2801, + "step": 892 + }, + { + "epoch": 3.671120246659815, + "grad_norm": 0.21530744818182257, + "learning_rate": 7.971607752303226e-06, + "loss": 0.28, + "step": 893 + }, + { + "epoch": 3.675231243576567, + "grad_norm": 0.2139811361890938, + "learning_rate": 7.925730273139294e-06, + "loss": 0.2712, + "step": 894 + }, + { + "epoch": 3.6793422404933196, + "grad_norm": 0.20799988041239068, + "learning_rate": 7.879952545604163e-06, + "loss": 0.2926, + "step": 895 + }, + { + "epoch": 3.683453237410072, + "grad_norm": 0.20418864938595824, + "learning_rate": 7.834274947890715e-06, + "loss": 0.2798, + "step": 896 + }, + { + "epoch": 3.6875642343268242, + "grad_norm": 0.20416263025450562, + "learning_rate": 7.78869785736461e-06, + "loss": 0.2694, + "step": 897 + }, + { + "epoch": 3.6916752312435763, + "grad_norm": 0.19066134679044647, + "learning_rate": 7.74322165056117e-06, + "loss": 0.2667, + "step": 898 + }, + { + "epoch": 3.695786228160329, + "grad_norm": 0.2121667278903765, + "learning_rate": 7.697846703182262e-06, + "loss": 0.2784, + "step": 899 + }, + { + "epoch": 3.6998972250770814, + "grad_norm": 0.2071705071263635, + "learning_rate": 7.652573390093199e-06, + "loss": 0.285, + "step": 900 + }, + { + "epoch": 3.7040082219938335, + "grad_norm": 0.2035973761053005, + "learning_rate": 7.607402085319644e-06, + "loss": 0.2759, + "step": 901 + }, + { + "epoch": 3.7081192189105856, + "grad_norm": 0.2083326930999411, + "learning_rate": 7.562333162044508e-06, + "loss": 0.2775, + "step": 902 + }, + { + "epoch": 3.712230215827338, + "grad_norm": 0.21563075052521988, + "learning_rate": 7.517366992604902e-06, + "loss": 0.2767, + "step": 903 + }, + { + "epoch": 3.7163412127440907, + "grad_norm": 0.20432779262539, + "learning_rate": 7.4725039484890094e-06, + "loss": 0.2874, + "step": 904 + }, + { + "epoch": 3.720452209660843, + "grad_norm": 0.2047844251053815, + "learning_rate": 7.427744400333053e-06, + "loss": 0.2789, + "step": 905 + }, + { + "epoch": 3.724563206577595, + "grad_norm": 0.2055231569256932, + "learning_rate": 7.383088717918223e-06, + "loss": 0.2748, + "step": 906 + }, + { + "epoch": 3.7286742034943474, + "grad_norm": 0.20467879963763858, + "learning_rate": 7.338537270167625e-06, + "loss": 0.277, + "step": 907 + }, + { + "epoch": 3.7327852004111, + "grad_norm": 0.21544746620927177, + "learning_rate": 7.294090425143225e-06, + "loss": 0.273, + "step": 908 + }, + { + "epoch": 3.736896197327852, + "grad_norm": 0.2000666684512926, + "learning_rate": 7.249748550042817e-06, + "loss": 0.2806, + "step": 909 + }, + { + "epoch": 3.741007194244604, + "grad_norm": 0.20770589378766816, + "learning_rate": 7.20551201119698e-06, + "loss": 0.2705, + "step": 910 + }, + { + "epoch": 3.7451181911613567, + "grad_norm": 0.20437780757014407, + "learning_rate": 7.161381174066065e-06, + "loss": 0.2829, + "step": 911 + }, + { + "epoch": 3.749229188078109, + "grad_norm": 0.19567720371080252, + "learning_rate": 7.117356403237161e-06, + "loss": 0.2813, + "step": 912 + }, + { + "epoch": 3.7533401849948613, + "grad_norm": 0.19171574936304334, + "learning_rate": 7.073438062421094e-06, + "loss": 0.2782, + "step": 913 + }, + { + "epoch": 3.7574511819116134, + "grad_norm": 0.20924848866916773, + "learning_rate": 7.029626514449414e-06, + "loss": 0.27, + "step": 914 + }, + { + "epoch": 3.761562178828366, + "grad_norm": 0.20438696705099926, + "learning_rate": 6.985922121271409e-06, + "loss": 0.2728, + "step": 915 + }, + { + "epoch": 3.765673175745118, + "grad_norm": 0.2084495335702813, + "learning_rate": 6.942325243951098e-06, + "loss": 0.2824, + "step": 916 + }, + { + "epoch": 3.7697841726618706, + "grad_norm": 0.1993990523612008, + "learning_rate": 6.898836242664262e-06, + "loss": 0.282, + "step": 917 + }, + { + "epoch": 3.7738951695786227, + "grad_norm": 0.19347775656849484, + "learning_rate": 6.855455476695465e-06, + "loss": 0.2706, + "step": 918 + }, + { + "epoch": 3.7780061664953752, + "grad_norm": 0.20109622486576145, + "learning_rate": 6.812183304435083e-06, + "loss": 0.2801, + "step": 919 + }, + { + "epoch": 3.7821171634121273, + "grad_norm": 0.18886838722656143, + "learning_rate": 6.769020083376341e-06, + "loss": 0.2721, + "step": 920 + }, + { + "epoch": 3.78622816032888, + "grad_norm": 0.208430820513582, + "learning_rate": 6.725966170112368e-06, + "loss": 0.2686, + "step": 921 + }, + { + "epoch": 3.790339157245632, + "grad_norm": 0.1967578418393911, + "learning_rate": 6.6830219203332415e-06, + "loss": 0.2721, + "step": 922 + }, + { + "epoch": 3.7944501541623845, + "grad_norm": 0.2015892872246403, + "learning_rate": 6.640187688823065e-06, + "loss": 0.2792, + "step": 923 + }, + { + "epoch": 3.7985611510791366, + "grad_norm": 0.1938822600108583, + "learning_rate": 6.597463829457014e-06, + "loss": 0.2799, + "step": 924 + }, + { + "epoch": 3.802672147995889, + "grad_norm": 0.2023587740694427, + "learning_rate": 6.554850695198427e-06, + "loss": 0.2695, + "step": 925 + }, + { + "epoch": 3.8067831449126412, + "grad_norm": 0.19570583847216003, + "learning_rate": 6.512348638095887e-06, + "loss": 0.2858, + "step": 926 + }, + { + "epoch": 3.810894141829394, + "grad_norm": 0.19738903231975544, + "learning_rate": 6.469958009280315e-06, + "loss": 0.2681, + "step": 927 + }, + { + "epoch": 3.815005138746146, + "grad_norm": 0.20083483818328293, + "learning_rate": 6.4276791589620595e-06, + "loss": 0.2852, + "step": 928 + }, + { + "epoch": 3.819116135662898, + "grad_norm": 0.19273874331489446, + "learning_rate": 6.385512436428021e-06, + "loss": 0.2864, + "step": 929 + }, + { + "epoch": 3.8232271325796505, + "grad_norm": 0.1869845010972472, + "learning_rate": 6.343458190038747e-06, + "loss": 0.2727, + "step": 930 + }, + { + "epoch": 3.827338129496403, + "grad_norm": 0.19346715289339741, + "learning_rate": 6.301516767225568e-06, + "loss": 0.2739, + "step": 931 + }, + { + "epoch": 3.831449126413155, + "grad_norm": 0.19227603993401987, + "learning_rate": 6.259688514487718e-06, + "loss": 0.2758, + "step": 932 + }, + { + "epoch": 3.8355601233299073, + "grad_norm": 0.20411187735886127, + "learning_rate": 6.217973777389483e-06, + "loss": 0.2761, + "step": 933 + }, + { + "epoch": 3.83967112024666, + "grad_norm": 0.18675711473098772, + "learning_rate": 6.1763729005573284e-06, + "loss": 0.2829, + "step": 934 + }, + { + "epoch": 3.8437821171634123, + "grad_norm": 0.2123802835671684, + "learning_rate": 6.134886227677073e-06, + "loss": 0.2922, + "step": 935 + }, + { + "epoch": 3.8478931140801644, + "grad_norm": 0.18956127541911397, + "learning_rate": 6.093514101491034e-06, + "loss": 0.2763, + "step": 936 + }, + { + "epoch": 3.8520041109969165, + "grad_norm": 0.18788309236848885, + "learning_rate": 6.052256863795198e-06, + "loss": 0.2711, + "step": 937 + }, + { + "epoch": 3.856115107913669, + "grad_norm": 0.19828249178491697, + "learning_rate": 6.0111148554364084e-06, + "loss": 0.2799, + "step": 938 + }, + { + "epoch": 3.8602261048304216, + "grad_norm": 0.18431610567167325, + "learning_rate": 5.970088416309532e-06, + "loss": 0.2689, + "step": 939 + }, + { + "epoch": 3.8643371017471737, + "grad_norm": 0.21004802063561837, + "learning_rate": 5.929177885354665e-06, + "loss": 0.279, + "step": 940 + }, + { + "epoch": 3.868448098663926, + "grad_norm": 0.18145712447424242, + "learning_rate": 5.888383600554326e-06, + "loss": 0.2769, + "step": 941 + }, + { + "epoch": 3.8725590955806783, + "grad_norm": 0.1998489072868665, + "learning_rate": 5.8477058989306605e-06, + "loss": 0.2902, + "step": 942 + }, + { + "epoch": 3.876670092497431, + "grad_norm": 0.19349791063075825, + "learning_rate": 5.807145116542678e-06, + "loss": 0.2772, + "step": 943 + }, + { + "epoch": 3.880781089414183, + "grad_norm": 0.20224120336775228, + "learning_rate": 5.766701588483443e-06, + "loss": 0.2766, + "step": 944 + }, + { + "epoch": 3.884892086330935, + "grad_norm": 0.20201558369754713, + "learning_rate": 5.726375648877329e-06, + "loss": 0.2711, + "step": 945 + }, + { + "epoch": 3.8890030832476876, + "grad_norm": 0.186362787594006, + "learning_rate": 5.68616763087725e-06, + "loss": 0.2637, + "step": 946 + }, + { + "epoch": 3.8931140801644397, + "grad_norm": 0.18827723220330278, + "learning_rate": 5.646077866661912e-06, + "loss": 0.2728, + "step": 947 + }, + { + "epoch": 3.8972250770811923, + "grad_norm": 0.20621122766057245, + "learning_rate": 5.606106687433066e-06, + "loss": 0.277, + "step": 948 + }, + { + "epoch": 3.9013360739979444, + "grad_norm": 0.1997165387359167, + "learning_rate": 5.5662544234127735e-06, + "loss": 0.2852, + "step": 949 + }, + { + "epoch": 3.905447070914697, + "grad_norm": 0.1986176597393475, + "learning_rate": 5.526521403840677e-06, + "loss": 0.2724, + "step": 950 + }, + { + "epoch": 3.909558067831449, + "grad_norm": 0.19315083170854766, + "learning_rate": 5.486907956971277e-06, + "loss": 0.2654, + "step": 951 + }, + { + "epoch": 3.9136690647482015, + "grad_norm": 0.19208269826966257, + "learning_rate": 5.447414410071232e-06, + "loss": 0.28, + "step": 952 + }, + { + "epoch": 3.9177800616649536, + "grad_norm": 0.1986061425594109, + "learning_rate": 5.40804108941664e-06, + "loss": 0.2809, + "step": 953 + }, + { + "epoch": 3.921891058581706, + "grad_norm": 0.18060496237659096, + "learning_rate": 5.36878832029035e-06, + "loss": 0.2753, + "step": 954 + }, + { + "epoch": 3.9260020554984583, + "grad_norm": 0.19007144815119342, + "learning_rate": 5.329656426979275e-06, + "loss": 0.2844, + "step": 955 + }, + { + "epoch": 3.930113052415211, + "grad_norm": 0.18228170358892676, + "learning_rate": 5.290645732771711e-06, + "loss": 0.2776, + "step": 956 + }, + { + "epoch": 3.934224049331963, + "grad_norm": 0.20611317253574513, + "learning_rate": 5.251756559954668e-06, + "loss": 0.2752, + "step": 957 + }, + { + "epoch": 3.9383350462487154, + "grad_norm": 0.19496510102086326, + "learning_rate": 5.212989229811209e-06, + "loss": 0.2703, + "step": 958 + }, + { + "epoch": 3.9424460431654675, + "grad_norm": 0.18813827312165923, + "learning_rate": 5.174344062617789e-06, + "loss": 0.2817, + "step": 959 + }, + { + "epoch": 3.94655704008222, + "grad_norm": 0.19091427031439173, + "learning_rate": 5.135821377641616e-06, + "loss": 0.2787, + "step": 960 + }, + { + "epoch": 3.950668036998972, + "grad_norm": 0.1901592123123516, + "learning_rate": 5.097421493138008e-06, + "loss": 0.2766, + "step": 961 + }, + { + "epoch": 3.9547790339157247, + "grad_norm": 0.1870098631363826, + "learning_rate": 5.059144726347765e-06, + "loss": 0.2728, + "step": 962 + }, + { + "epoch": 3.958890030832477, + "grad_norm": 0.17796954931972553, + "learning_rate": 5.020991393494558e-06, + "loss": 0.2867, + "step": 963 + }, + { + "epoch": 3.963001027749229, + "grad_norm": 0.19046713852280395, + "learning_rate": 4.9829618097823055e-06, + "loss": 0.2675, + "step": 964 + }, + { + "epoch": 3.9671120246659815, + "grad_norm": 0.19367792434634498, + "learning_rate": 4.945056289392565e-06, + "loss": 0.2765, + "step": 965 + }, + { + "epoch": 3.971223021582734, + "grad_norm": 0.18974765427392373, + "learning_rate": 4.907275145481947e-06, + "loss": 0.2731, + "step": 966 + }, + { + "epoch": 3.975334018499486, + "grad_norm": 0.18889755922787974, + "learning_rate": 4.8696186901795275e-06, + "loss": 0.2817, + "step": 967 + }, + { + "epoch": 3.979445015416238, + "grad_norm": 0.19028199023394596, + "learning_rate": 4.832087234584266e-06, + "loss": 0.2783, + "step": 968 + }, + { + "epoch": 3.9835560123329907, + "grad_norm": 0.1964825876876656, + "learning_rate": 4.794681088762438e-06, + "loss": 0.2744, + "step": 969 + }, + { + "epoch": 3.9876670092497433, + "grad_norm": 0.17957398832039587, + "learning_rate": 4.757400561745069e-06, + "loss": 0.2762, + "step": 970 + }, + { + "epoch": 3.9917780061664954, + "grad_norm": 0.20987505932024647, + "learning_rate": 4.720245961525387e-06, + "loss": 0.2949, + "step": 971 + }, + { + "epoch": 3.9958890030832475, + "grad_norm": 0.18879687589648914, + "learning_rate": 4.683217595056275e-06, + "loss": 0.2746, + "step": 972 + }, + { + "epoch": 4.0, + "grad_norm": 1.707156689602904, + "learning_rate": 4.646315768247731e-06, + "loss": 0.2868, + "step": 973 + }, + { + "epoch": 4.0041109969167525, + "grad_norm": 0.3673275720964706, + "learning_rate": 4.609540785964348e-06, + "loss": 0.2379, + "step": 974 + }, + { + "epoch": 4.008221993833504, + "grad_norm": 0.26013071708722996, + "learning_rate": 4.572892952022796e-06, + "loss": 0.2495, + "step": 975 + }, + { + "epoch": 4.012332990750257, + "grad_norm": 0.30039166221512403, + "learning_rate": 4.5363725691893045e-06, + "loss": 0.2434, + "step": 976 + }, + { + "epoch": 4.016443987667009, + "grad_norm": 0.40331206801802966, + "learning_rate": 4.499979939177164e-06, + "loss": 0.2413, + "step": 977 + }, + { + "epoch": 4.020554984583762, + "grad_norm": 0.2653915725640132, + "learning_rate": 4.463715362644239e-06, + "loss": 0.2415, + "step": 978 + }, + { + "epoch": 4.0246659815005135, + "grad_norm": 0.2706794398843468, + "learning_rate": 4.427579139190474e-06, + "loss": 0.2353, + "step": 979 + }, + { + "epoch": 4.028776978417266, + "grad_norm": 0.33800513453404296, + "learning_rate": 4.391571567355428e-06, + "loss": 0.244, + "step": 980 + }, + { + "epoch": 4.0328879753340185, + "grad_norm": 0.2848868937309266, + "learning_rate": 4.355692944615806e-06, + "loss": 0.2446, + "step": 981 + }, + { + "epoch": 4.036998972250771, + "grad_norm": 0.213052312700043, + "learning_rate": 4.319943567382991e-06, + "loss": 0.2446, + "step": 982 + }, + { + "epoch": 4.041109969167523, + "grad_norm": 0.24448300665475436, + "learning_rate": 4.28432373100061e-06, + "loss": 0.2383, + "step": 983 + }, + { + "epoch": 4.045220966084275, + "grad_norm": 0.28289541109409083, + "learning_rate": 4.248833729742095e-06, + "loss": 0.2335, + "step": 984 + }, + { + "epoch": 4.049331963001028, + "grad_norm": 0.27075279957678594, + "learning_rate": 4.2134738568082325e-06, + "loss": 0.2388, + "step": 985 + }, + { + "epoch": 4.05344295991778, + "grad_norm": 0.2271083193598205, + "learning_rate": 4.1782444043247565e-06, + "loss": 0.2386, + "step": 986 + }, + { + "epoch": 4.057553956834532, + "grad_norm": 0.22324730717439883, + "learning_rate": 4.143145663339932e-06, + "loss": 0.2447, + "step": 987 + }, + { + "epoch": 4.061664953751285, + "grad_norm": 0.26100760343340185, + "learning_rate": 4.108177923822154e-06, + "loss": 0.2426, + "step": 988 + }, + { + "epoch": 4.065775950668037, + "grad_norm": 0.23257567018511596, + "learning_rate": 4.073341474657544e-06, + "loss": 0.2482, + "step": 989 + }, + { + "epoch": 4.06988694758479, + "grad_norm": 0.1994071326027501, + "learning_rate": 4.03863660364757e-06, + "loss": 0.2389, + "step": 990 + }, + { + "epoch": 4.073997944501541, + "grad_norm": 0.21371643270197568, + "learning_rate": 4.004063597506664e-06, + "loss": 0.2337, + "step": 991 + }, + { + "epoch": 4.078108941418294, + "grad_norm": 0.24512669596399653, + "learning_rate": 3.969622741859862e-06, + "loss": 0.2477, + "step": 992 + }, + { + "epoch": 4.082219938335046, + "grad_norm": 0.21744045295237915, + "learning_rate": 3.935314321240433e-06, + "loss": 0.2405, + "step": 993 + }, + { + "epoch": 4.086330935251799, + "grad_norm": 0.20192278557379797, + "learning_rate": 3.90113861908753e-06, + "loss": 0.2394, + "step": 994 + }, + { + "epoch": 4.090441932168551, + "grad_norm": 0.2027471703666848, + "learning_rate": 3.867095917743862e-06, + "loss": 0.2326, + "step": 995 + }, + { + "epoch": 4.094552929085303, + "grad_norm": 0.20882580151186148, + "learning_rate": 3.8331864984533404e-06, + "loss": 0.2362, + "step": 996 + }, + { + "epoch": 4.098663926002056, + "grad_norm": 0.1930471416017011, + "learning_rate": 3.799410641358776e-06, + "loss": 0.2462, + "step": 997 + }, + { + "epoch": 4.102774922918808, + "grad_norm": 0.19859635746881463, + "learning_rate": 3.7657686254995483e-06, + "loss": 0.2404, + "step": 998 + }, + { + "epoch": 4.10688591983556, + "grad_norm": 0.1983957254871405, + "learning_rate": 3.7322607288093117e-06, + "loss": 0.2398, + "step": 999 + }, + { + "epoch": 4.110996916752312, + "grad_norm": 0.22293857279886048, + "learning_rate": 3.6988872281136855e-06, + "loss": 0.2363, + "step": 1000 + }, + { + "epoch": 4.115107913669065, + "grad_norm": 0.20443840443106004, + "learning_rate": 3.66564839912799e-06, + "loss": 0.2318, + "step": 1001 + }, + { + "epoch": 4.1192189105858175, + "grad_norm": 0.17966769630726293, + "learning_rate": 3.632544516454941e-06, + "loss": 0.2359, + "step": 1002 + }, + { + "epoch": 4.123329907502569, + "grad_norm": 0.19432549741475053, + "learning_rate": 3.5995758535823997e-06, + "loss": 0.2316, + "step": 1003 + }, + { + "epoch": 4.127440904419322, + "grad_norm": 0.18881014978005276, + "learning_rate": 3.566742682881119e-06, + "loss": 0.2608, + "step": 1004 + }, + { + "epoch": 4.131551901336074, + "grad_norm": 0.19088807670118796, + "learning_rate": 3.534045275602467e-06, + "loss": 0.242, + "step": 1005 + }, + { + "epoch": 4.135662898252827, + "grad_norm": 0.1816637262264018, + "learning_rate": 3.501483901876208e-06, + "loss": 0.244, + "step": 1006 + }, + { + "epoch": 4.139773895169578, + "grad_norm": 0.19010713069523394, + "learning_rate": 3.469058830708263e-06, + "loss": 0.2324, + "step": 1007 + }, + { + "epoch": 4.143884892086331, + "grad_norm": 0.19620537155899534, + "learning_rate": 3.436770329978494e-06, + "loss": 0.2481, + "step": 1008 + }, + { + "epoch": 4.1479958890030835, + "grad_norm": 0.18566900979279455, + "learning_rate": 3.4046186664384795e-06, + "loss": 0.2432, + "step": 1009 + }, + { + "epoch": 4.152106885919835, + "grad_norm": 0.1755700170371331, + "learning_rate": 3.3726041057093186e-06, + "loss": 0.2386, + "step": 1010 + }, + { + "epoch": 4.156217882836588, + "grad_norm": 0.18096902328410783, + "learning_rate": 3.3407269122794373e-06, + "loss": 0.2487, + "step": 1011 + }, + { + "epoch": 4.16032887975334, + "grad_norm": 0.192754387487128, + "learning_rate": 3.3089873495023995e-06, + "loss": 0.234, + "step": 1012 + }, + { + "epoch": 4.164439876670093, + "grad_norm": 0.19892387100550088, + "learning_rate": 3.2773856795947336e-06, + "loss": 0.2339, + "step": 1013 + }, + { + "epoch": 4.168550873586844, + "grad_norm": 0.18465157283491226, + "learning_rate": 3.2459221636337633e-06, + "loss": 0.2379, + "step": 1014 + }, + { + "epoch": 4.172661870503597, + "grad_norm": 0.1899662552430034, + "learning_rate": 3.214597061555458e-06, + "loss": 0.2292, + "step": 1015 + }, + { + "epoch": 4.1767728674203495, + "grad_norm": 0.18665807494909734, + "learning_rate": 3.1834106321522727e-06, + "loss": 0.2371, + "step": 1016 + }, + { + "epoch": 4.180883864337102, + "grad_norm": 0.1854509036542964, + "learning_rate": 3.152363133071024e-06, + "loss": 0.2433, + "step": 1017 + }, + { + "epoch": 4.184994861253854, + "grad_norm": 0.20338354606609246, + "learning_rate": 3.12145482081075e-06, + "loss": 0.2373, + "step": 1018 + }, + { + "epoch": 4.189105858170606, + "grad_norm": 0.1823424640205926, + "learning_rate": 3.0906859507206044e-06, + "loss": 0.2425, + "step": 1019 + }, + { + "epoch": 4.193216855087359, + "grad_norm": 0.18646817047228667, + "learning_rate": 3.0600567769977286e-06, + "loss": 0.2388, + "step": 1020 + }, + { + "epoch": 4.197327852004111, + "grad_norm": 0.19248044840190429, + "learning_rate": 3.0295675526851686e-06, + "loss": 0.2327, + "step": 1021 + }, + { + "epoch": 4.201438848920863, + "grad_norm": 0.1895682953006883, + "learning_rate": 2.9992185296697763e-06, + "loss": 0.2494, + "step": 1022 + }, + { + "epoch": 4.2055498458376155, + "grad_norm": 0.1775774161260345, + "learning_rate": 2.9690099586801223e-06, + "loss": 0.2431, + "step": 1023 + }, + { + "epoch": 4.209660842754368, + "grad_norm": 0.18688744320331976, + "learning_rate": 2.938942089284453e-06, + "loss": 0.2243, + "step": 1024 + }, + { + "epoch": 4.213771839671121, + "grad_norm": 0.18321913204838605, + "learning_rate": 2.909015169888587e-06, + "loss": 0.2361, + "step": 1025 + }, + { + "epoch": 4.217882836587872, + "grad_norm": 0.18558364928419416, + "learning_rate": 2.879229447733893e-06, + "loss": 0.2438, + "step": 1026 + }, + { + "epoch": 4.221993833504625, + "grad_norm": 0.18370056819501662, + "learning_rate": 2.849585168895237e-06, + "loss": 0.2372, + "step": 1027 + }, + { + "epoch": 4.226104830421377, + "grad_norm": 0.17922623411257754, + "learning_rate": 2.8200825782789466e-06, + "loss": 0.2389, + "step": 1028 + }, + { + "epoch": 4.23021582733813, + "grad_norm": 0.1814704060047799, + "learning_rate": 2.790721919620798e-06, + "loss": 0.2299, + "step": 1029 + }, + { + "epoch": 4.2343268242548815, + "grad_norm": 0.18999808738781843, + "learning_rate": 2.7615034354839942e-06, + "loss": 0.2346, + "step": 1030 + }, + { + "epoch": 4.238437821171634, + "grad_norm": 0.18017749013937312, + "learning_rate": 2.7324273672571577e-06, + "loss": 0.2337, + "step": 1031 + }, + { + "epoch": 4.242548818088387, + "grad_norm": 0.1799591693551389, + "learning_rate": 2.7034939551523476e-06, + "loss": 0.2439, + "step": 1032 + }, + { + "epoch": 4.246659815005139, + "grad_norm": 0.18327330613798448, + "learning_rate": 2.6747034382030655e-06, + "loss": 0.2445, + "step": 1033 + }, + { + "epoch": 4.250770811921891, + "grad_norm": 0.17889375387571904, + "learning_rate": 2.646056054262287e-06, + "loss": 0.2467, + "step": 1034 + }, + { + "epoch": 4.254881808838643, + "grad_norm": 0.17679323974968908, + "learning_rate": 2.6175520400004907e-06, + "loss": 0.2405, + "step": 1035 + }, + { + "epoch": 4.258992805755396, + "grad_norm": 0.17336200096095578, + "learning_rate": 2.5891916309037046e-06, + "loss": 0.2367, + "step": 1036 + }, + { + "epoch": 4.263103802672148, + "grad_norm": 0.1862342732350899, + "learning_rate": 2.560975061271569e-06, + "loss": 0.2294, + "step": 1037 + }, + { + "epoch": 4.2672147995889, + "grad_norm": 0.1761467998629582, + "learning_rate": 2.5329025642153873e-06, + "loss": 0.2448, + "step": 1038 + }, + { + "epoch": 4.271325796505653, + "grad_norm": 0.17762679763602063, + "learning_rate": 2.5049743716562104e-06, + "loss": 0.2459, + "step": 1039 + }, + { + "epoch": 4.275436793422405, + "grad_norm": 0.17679704716813474, + "learning_rate": 2.4771907143229124e-06, + "loss": 0.2366, + "step": 1040 + }, + { + "epoch": 4.279547790339157, + "grad_norm": 0.18512587191088023, + "learning_rate": 2.4495518217502936e-06, + "loss": 0.2334, + "step": 1041 + }, + { + "epoch": 4.283658787255909, + "grad_norm": 0.17098883894517244, + "learning_rate": 2.422057922277179e-06, + "loss": 0.2366, + "step": 1042 + }, + { + "epoch": 4.287769784172662, + "grad_norm": 0.19038355517722344, + "learning_rate": 2.3947092430445284e-06, + "loss": 0.2361, + "step": 1043 + }, + { + "epoch": 4.291880781089414, + "grad_norm": 0.1807683215327849, + "learning_rate": 2.367506009993572e-06, + "loss": 0.2314, + "step": 1044 + }, + { + "epoch": 4.295991778006167, + "grad_norm": 0.18180874713294695, + "learning_rate": 2.34044844786393e-06, + "loss": 0.2385, + "step": 1045 + }, + { + "epoch": 4.300102774922919, + "grad_norm": 0.18147750075878016, + "learning_rate": 2.313536780191763e-06, + "loss": 0.2336, + "step": 1046 + }, + { + "epoch": 4.304213771839671, + "grad_norm": 0.1782373533284217, + "learning_rate": 2.2867712293079223e-06, + "loss": 0.2356, + "step": 1047 + }, + { + "epoch": 4.308324768756424, + "grad_norm": 0.17802709783230702, + "learning_rate": 2.2601520163361166e-06, + "loss": 0.2445, + "step": 1048 + }, + { + "epoch": 4.312435765673175, + "grad_norm": 0.17602254086438468, + "learning_rate": 2.233679361191081e-06, + "loss": 0.2296, + "step": 1049 + }, + { + "epoch": 4.316546762589928, + "grad_norm": 0.17604437821882946, + "learning_rate": 2.2073534825767683e-06, + "loss": 0.2493, + "step": 1050 + }, + { + "epoch": 4.32065775950668, + "grad_norm": 0.18364670883928147, + "learning_rate": 2.18117459798453e-06, + "loss": 0.2332, + "step": 1051 + }, + { + "epoch": 4.324768756423433, + "grad_norm": 0.17647874008223446, + "learning_rate": 2.155142923691329e-06, + "loss": 0.2434, + "step": 1052 + }, + { + "epoch": 4.328879753340185, + "grad_norm": 0.1821284298329628, + "learning_rate": 2.129258674757948e-06, + "loss": 0.2405, + "step": 1053 + }, + { + "epoch": 4.332990750256937, + "grad_norm": 0.17780536510155415, + "learning_rate": 2.103522065027217e-06, + "loss": 0.2352, + "step": 1054 + }, + { + "epoch": 4.33710174717369, + "grad_norm": 0.17826171239681762, + "learning_rate": 2.07793330712224e-06, + "loss": 0.2389, + "step": 1055 + }, + { + "epoch": 4.341212744090442, + "grad_norm": 0.17939747251527152, + "learning_rate": 2.0524926124446497e-06, + "loss": 0.2419, + "step": 1056 + }, + { + "epoch": 4.345323741007194, + "grad_norm": 0.18203279406090278, + "learning_rate": 2.0272001911728466e-06, + "loss": 0.237, + "step": 1057 + }, + { + "epoch": 4.349434737923946, + "grad_norm": 0.1797008020492476, + "learning_rate": 2.0020562522602716e-06, + "loss": 0.2341, + "step": 1058 + }, + { + "epoch": 4.353545734840699, + "grad_norm": 0.1739746708895779, + "learning_rate": 1.9770610034336823e-06, + "loss": 0.2391, + "step": 1059 + }, + { + "epoch": 4.3576567317574515, + "grad_norm": 0.18058019470972567, + "learning_rate": 1.9522146511914265e-06, + "loss": 0.2322, + "step": 1060 + }, + { + "epoch": 4.361767728674203, + "grad_norm": 0.17826352479236454, + "learning_rate": 1.927517400801746e-06, + "loss": 0.2422, + "step": 1061 + }, + { + "epoch": 4.365878725590956, + "grad_norm": 0.16893750635969274, + "learning_rate": 1.902969456301076e-06, + "loss": 0.2332, + "step": 1062 + }, + { + "epoch": 4.369989722507708, + "grad_norm": 0.1782753702044134, + "learning_rate": 1.8785710204923612e-06, + "loss": 0.2385, + "step": 1063 + }, + { + "epoch": 4.374100719424461, + "grad_norm": 0.18141238302838078, + "learning_rate": 1.8543222949433736e-06, + "loss": 0.2463, + "step": 1064 + }, + { + "epoch": 4.378211716341212, + "grad_norm": 0.17681464500466545, + "learning_rate": 1.8302234799850671e-06, + "loss": 0.2441, + "step": 1065 + }, + { + "epoch": 4.382322713257965, + "grad_norm": 0.17491562407701997, + "learning_rate": 1.8062747747098974e-06, + "loss": 0.2359, + "step": 1066 + }, + { + "epoch": 4.3864337101747175, + "grad_norm": 0.17582489650428018, + "learning_rate": 1.782476376970188e-06, + "loss": 0.2518, + "step": 1067 + }, + { + "epoch": 4.39054470709147, + "grad_norm": 0.179309298657522, + "learning_rate": 1.7588284833765024e-06, + "loss": 0.2509, + "step": 1068 + }, + { + "epoch": 4.394655704008222, + "grad_norm": 0.17355362198390345, + "learning_rate": 1.7353312892960095e-06, + "loss": 0.2396, + "step": 1069 + }, + { + "epoch": 4.398766700924974, + "grad_norm": 0.20669657827730264, + "learning_rate": 1.7119849888508766e-06, + "loss": 0.2401, + "step": 1070 + }, + { + "epoch": 4.402877697841727, + "grad_norm": 0.1759615590766294, + "learning_rate": 1.6887897749166548e-06, + "loss": 0.239, + "step": 1071 + }, + { + "epoch": 4.406988694758479, + "grad_norm": 0.1867129692924266, + "learning_rate": 1.6657458391207049e-06, + "loss": 0.24, + "step": 1072 + }, + { + "epoch": 4.411099691675231, + "grad_norm": 0.17724290686658428, + "learning_rate": 1.6428533718405914e-06, + "loss": 0.2485, + "step": 1073 + }, + { + "epoch": 4.4152106885919835, + "grad_norm": 0.1769255669225784, + "learning_rate": 1.6201125622025315e-06, + "loss": 0.2343, + "step": 1074 + }, + { + "epoch": 4.419321685508736, + "grad_norm": 0.17424978161900648, + "learning_rate": 1.5975235980798153e-06, + "loss": 0.2299, + "step": 1075 + }, + { + "epoch": 4.423432682425489, + "grad_norm": 0.17407143372977305, + "learning_rate": 1.5750866660912634e-06, + "loss": 0.2294, + "step": 1076 + }, + { + "epoch": 4.42754367934224, + "grad_norm": 0.17791660590457703, + "learning_rate": 1.5528019515996783e-06, + "loss": 0.2425, + "step": 1077 + }, + { + "epoch": 4.431654676258993, + "grad_norm": 0.18301382705782807, + "learning_rate": 1.5306696387103227e-06, + "loss": 0.2343, + "step": 1078 + }, + { + "epoch": 4.435765673175745, + "grad_norm": 0.17589070591387826, + "learning_rate": 1.5086899102693875e-06, + "loss": 0.2469, + "step": 1079 + }, + { + "epoch": 4.439876670092497, + "grad_norm": 0.17198527500762634, + "learning_rate": 1.486862947862493e-06, + "loss": 0.2463, + "step": 1080 + }, + { + "epoch": 4.4439876670092495, + "grad_norm": 0.17792281862140422, + "learning_rate": 1.465188931813175e-06, + "loss": 0.2301, + "step": 1081 + }, + { + "epoch": 4.448098663926002, + "grad_norm": 0.17628369792032114, + "learning_rate": 1.4436680411814097e-06, + "loss": 0.2399, + "step": 1082 + }, + { + "epoch": 4.452209660842755, + "grad_norm": 0.17439136560526375, + "learning_rate": 1.42230045376212e-06, + "loss": 0.237, + "step": 1083 + }, + { + "epoch": 4.456320657759507, + "grad_norm": 0.17943067929919523, + "learning_rate": 1.4010863460837132e-06, + "loss": 0.2405, + "step": 1084 + }, + { + "epoch": 4.460431654676259, + "grad_norm": 0.17235533535415476, + "learning_rate": 1.380025893406638e-06, + "loss": 0.2397, + "step": 1085 + }, + { + "epoch": 4.464542651593011, + "grad_norm": 0.17886870223554543, + "learning_rate": 1.3591192697219003e-06, + "loss": 0.2409, + "step": 1086 + }, + { + "epoch": 4.468653648509764, + "grad_norm": 0.16738079998494204, + "learning_rate": 1.3383666477496627e-06, + "loss": 0.2387, + "step": 1087 + }, + { + "epoch": 4.4727646454265155, + "grad_norm": 0.16758787660813548, + "learning_rate": 1.3177681989377944e-06, + "loss": 0.2417, + "step": 1088 + }, + { + "epoch": 4.476875642343268, + "grad_norm": 0.1752202184059869, + "learning_rate": 1.2973240934604658e-06, + "loss": 0.2274, + "step": 1089 + }, + { + "epoch": 4.480986639260021, + "grad_norm": 0.17463210365794904, + "learning_rate": 1.277034500216736e-06, + "loss": 0.226, + "step": 1090 + }, + { + "epoch": 4.485097636176773, + "grad_norm": 0.17601812963083546, + "learning_rate": 1.2568995868291656e-06, + "loss": 0.2491, + "step": 1091 + }, + { + "epoch": 4.489208633093525, + "grad_norm": 0.1775954820916016, + "learning_rate": 1.236919519642421e-06, + "loss": 0.2432, + "step": 1092 + }, + { + "epoch": 4.493319630010277, + "grad_norm": 0.17531188232428954, + "learning_rate": 1.2170944637219106e-06, + "loss": 0.2417, + "step": 1093 + }, + { + "epoch": 4.49743062692703, + "grad_norm": 0.1752476486120662, + "learning_rate": 1.1974245828524156e-06, + "loss": 0.2274, + "step": 1094 + }, + { + "epoch": 4.501541623843782, + "grad_norm": 0.18283725978641932, + "learning_rate": 1.177910039536736e-06, + "loss": 0.2408, + "step": 1095 + }, + { + "epoch": 4.505652620760534, + "grad_norm": 0.17390337086901564, + "learning_rate": 1.1585509949943518e-06, + "loss": 0.2374, + "step": 1096 + }, + { + "epoch": 4.509763617677287, + "grad_norm": 0.1780578101655513, + "learning_rate": 1.1393476091600886e-06, + "loss": 0.2473, + "step": 1097 + }, + { + "epoch": 4.513874614594039, + "grad_norm": 0.17965538239208087, + "learning_rate": 1.120300040682798e-06, + "loss": 0.244, + "step": 1098 + }, + { + "epoch": 4.517985611510792, + "grad_norm": 0.17589641804084827, + "learning_rate": 1.1014084469240461e-06, + "loss": 0.2435, + "step": 1099 + }, + { + "epoch": 4.522096608427543, + "grad_norm": 0.17442573382633822, + "learning_rate": 1.0826729839568073e-06, + "loss": 0.2417, + "step": 1100 + }, + { + "epoch": 4.526207605344296, + "grad_norm": 0.1807965837015226, + "learning_rate": 1.0640938065641926e-06, + "loss": 0.2424, + "step": 1101 + }, + { + "epoch": 4.530318602261048, + "grad_norm": 0.17785902254626473, + "learning_rate": 1.0456710682381455e-06, + "loss": 0.2546, + "step": 1102 + }, + { + "epoch": 4.534429599177801, + "grad_norm": 0.17258906030475882, + "learning_rate": 1.0274049211781967e-06, + "loss": 0.2422, + "step": 1103 + }, + { + "epoch": 4.538540596094553, + "grad_norm": 1.2728190292585875, + "learning_rate": 1.009295516290194e-06, + "loss": 0.2608, + "step": 1104 + }, + { + "epoch": 4.542651593011305, + "grad_norm": 0.17125359621163755, + "learning_rate": 9.913430031850635e-07, + "loss": 0.2356, + "step": 1105 + }, + { + "epoch": 4.546762589928058, + "grad_norm": 0.17651600362640116, + "learning_rate": 9.735475301775632e-07, + "loss": 0.246, + "step": 1106 + }, + { + "epoch": 4.55087358684481, + "grad_norm": 0.16931432337364227, + "learning_rate": 9.559092442850671e-07, + "loss": 0.2289, + "step": 1107 + }, + { + "epoch": 4.554984583761562, + "grad_norm": 0.17727932117256406, + "learning_rate": 9.384282912263475e-07, + "loss": 0.2334, + "step": 1108 + }, + { + "epoch": 4.559095580678314, + "grad_norm": 0.17674518726323318, + "learning_rate": 9.211048154203661e-07, + "loss": 0.2512, + "step": 1109 + }, + { + "epoch": 4.563206577595067, + "grad_norm": 0.17332686615916262, + "learning_rate": 9.039389599850912e-07, + "loss": 0.2339, + "step": 1110 + }, + { + "epoch": 4.567317574511819, + "grad_norm": 0.1651967177583125, + "learning_rate": 8.869308667363063e-07, + "loss": 0.241, + "step": 1111 + }, + { + "epoch": 4.571428571428571, + "grad_norm": 0.17416950220257496, + "learning_rate": 8.700806761864466e-07, + "loss": 0.2329, + "step": 1112 + }, + { + "epoch": 4.575539568345324, + "grad_norm": 0.16574888125550336, + "learning_rate": 8.533885275434283e-07, + "loss": 0.2429, + "step": 1113 + }, + { + "epoch": 4.579650565262076, + "grad_norm": 0.1731537234968332, + "learning_rate": 8.368545587095056e-07, + "loss": 0.2414, + "step": 1114 + }, + { + "epoch": 4.583761562178829, + "grad_norm": 0.17116481210634382, + "learning_rate": 8.20478906280131e-07, + "loss": 0.2405, + "step": 1115 + }, + { + "epoch": 4.5878725590955804, + "grad_norm": 0.17008303524989962, + "learning_rate": 8.042617055428215e-07, + "loss": 0.2313, + "step": 1116 + }, + { + "epoch": 4.591983556012333, + "grad_norm": 0.1724744667796713, + "learning_rate": 7.882030904760518e-07, + "loss": 0.238, + "step": 1117 + }, + { + "epoch": 4.5960945529290855, + "grad_norm": 0.17588735295940247, + "learning_rate": 7.723031937481318e-07, + "loss": 0.2497, + "step": 1118 + }, + { + "epoch": 4.600205549845837, + "grad_norm": 0.17565531889855016, + "learning_rate": 7.565621467161244e-07, + "loss": 0.2563, + "step": 1119 + }, + { + "epoch": 4.60431654676259, + "grad_norm": 0.1730755101477369, + "learning_rate": 7.409800794247557e-07, + "loss": 0.2337, + "step": 1120 + }, + { + "epoch": 4.608427543679342, + "grad_norm": 0.16854464186311885, + "learning_rate": 7.25557120605338e-07, + "loss": 0.2445, + "step": 1121 + }, + { + "epoch": 4.612538540596095, + "grad_norm": 0.17181736526067778, + "learning_rate": 7.102933976747084e-07, + "loss": 0.2356, + "step": 1122 + }, + { + "epoch": 4.616649537512847, + "grad_norm": 0.17224841346483227, + "learning_rate": 6.951890367341763e-07, + "loss": 0.2404, + "step": 1123 + }, + { + "epoch": 4.620760534429599, + "grad_norm": 0.174217100719342, + "learning_rate": 6.802441625684774e-07, + "loss": 0.2505, + "step": 1124 + }, + { + "epoch": 4.6248715313463515, + "grad_norm": 0.172147168668161, + "learning_rate": 6.654588986447597e-07, + "loss": 0.2387, + "step": 1125 + }, + { + "epoch": 4.628982528263104, + "grad_norm": 0.17025566837994996, + "learning_rate": 6.508333671115341e-07, + "loss": 0.2445, + "step": 1126 + }, + { + "epoch": 4.633093525179856, + "grad_norm": 0.1791903634128583, + "learning_rate": 6.363676887976944e-07, + "loss": 0.2458, + "step": 1127 + }, + { + "epoch": 4.637204522096608, + "grad_norm": 0.17050764877819538, + "learning_rate": 6.220619832114971e-07, + "loss": 0.2504, + "step": 1128 + }, + { + "epoch": 4.641315519013361, + "grad_norm": 0.17379476155710213, + "learning_rate": 6.079163685395917e-07, + "loss": 0.2426, + "step": 1129 + }, + { + "epoch": 4.645426515930113, + "grad_norm": 0.17731694259336384, + "learning_rate": 5.939309616460276e-07, + "loss": 0.2356, + "step": 1130 + }, + { + "epoch": 4.649537512846865, + "grad_norm": 0.17342657076328494, + "learning_rate": 5.801058780713021e-07, + "loss": 0.2454, + "step": 1131 + }, + { + "epoch": 4.6536485097636175, + "grad_norm": 0.17506716139804732, + "learning_rate": 5.664412320314027e-07, + "loss": 0.2466, + "step": 1132 + }, + { + "epoch": 4.65775950668037, + "grad_norm": 0.1760318065249804, + "learning_rate": 5.529371364168535e-07, + "loss": 0.2298, + "step": 1133 + }, + { + "epoch": 4.661870503597123, + "grad_norm": 0.17418792165950514, + "learning_rate": 5.395937027918008e-07, + "loss": 0.2352, + "step": 1134 + }, + { + "epoch": 4.665981500513874, + "grad_norm": 0.1706417249640066, + "learning_rate": 5.264110413930735e-07, + "loss": 0.2398, + "step": 1135 + }, + { + "epoch": 4.670092497430627, + "grad_norm": 0.17721194204664026, + "learning_rate": 5.133892611292846e-07, + "loss": 0.2378, + "step": 1136 + }, + { + "epoch": 4.674203494347379, + "grad_norm": 0.18022478809124595, + "learning_rate": 5.005284695799217e-07, + "loss": 0.2491, + "step": 1137 + }, + { + "epoch": 4.678314491264132, + "grad_norm": 0.17519368559887136, + "learning_rate": 4.878287729944697e-07, + "loss": 0.2438, + "step": 1138 + }, + { + "epoch": 4.6824254881808836, + "grad_norm": 0.1777158927515907, + "learning_rate": 4.7529027629152234e-07, + "loss": 0.2364, + "step": 1139 + }, + { + "epoch": 4.686536485097636, + "grad_norm": 0.1707755427569748, + "learning_rate": 4.6291308305792315e-07, + "loss": 0.2453, + "step": 1140 + }, + { + "epoch": 4.690647482014389, + "grad_norm": 0.16668258759849372, + "learning_rate": 4.5069729554790386e-07, + "loss": 0.2402, + "step": 1141 + }, + { + "epoch": 4.694758478931141, + "grad_norm": 0.16459971581816116, + "learning_rate": 4.386430146822429e-07, + "loss": 0.2483, + "step": 1142 + }, + { + "epoch": 4.698869475847893, + "grad_norm": 0.17875849392738621, + "learning_rate": 4.2675034004743045e-07, + "loss": 0.241, + "step": 1143 + }, + { + "epoch": 4.702980472764645, + "grad_norm": 0.17296505254178202, + "learning_rate": 4.150193698948468e-07, + "loss": 0.2465, + "step": 1144 + }, + { + "epoch": 4.707091469681398, + "grad_norm": 0.17224757284909492, + "learning_rate": 4.034502011399499e-07, + "loss": 0.2385, + "step": 1145 + }, + { + "epoch": 4.7112024665981505, + "grad_norm": 0.17411551470001055, + "learning_rate": 3.92042929361478e-07, + "loss": 0.2362, + "step": 1146 + }, + { + "epoch": 4.715313463514902, + "grad_norm": 0.17065619759470402, + "learning_rate": 3.8079764880064817e-07, + "loss": 0.2367, + "step": 1147 + }, + { + "epoch": 4.719424460431655, + "grad_norm": 0.1692629166872625, + "learning_rate": 3.6971445236039685e-07, + "loss": 0.2441, + "step": 1148 + }, + { + "epoch": 4.723535457348407, + "grad_norm": 0.1727903716287266, + "learning_rate": 3.587934316045938e-07, + "loss": 0.2332, + "step": 1149 + }, + { + "epoch": 4.727646454265159, + "grad_norm": 0.16765805252498014, + "learning_rate": 3.4803467675729843e-07, + "loss": 0.2436, + "step": 1150 + }, + { + "epoch": 4.731757451181911, + "grad_norm": 0.1685165614846259, + "learning_rate": 3.374382767020068e-07, + "loss": 0.2462, + "step": 1151 + }, + { + "epoch": 4.735868448098664, + "grad_norm": 0.17176323835480908, + "learning_rate": 3.270043189809213e-07, + "loss": 0.2475, + "step": 1152 + }, + { + "epoch": 4.7399794450154165, + "grad_norm": 0.17054983226303438, + "learning_rate": 3.167328897942268e-07, + "loss": 0.2396, + "step": 1153 + }, + { + "epoch": 4.744090441932169, + "grad_norm": 0.17377255003391895, + "learning_rate": 3.0662407399937757e-07, + "loss": 0.2414, + "step": 1154 + }, + { + "epoch": 4.748201438848921, + "grad_norm": 0.1707561407227221, + "learning_rate": 2.96677955110396e-07, + "loss": 0.2374, + "step": 1155 + }, + { + "epoch": 4.752312435765673, + "grad_norm": 0.1720410967594104, + "learning_rate": 2.8689461529718634e-07, + "loss": 0.2439, + "step": 1156 + }, + { + "epoch": 4.756423432682426, + "grad_norm": 0.18296516134186272, + "learning_rate": 2.7727413538484625e-07, + "loss": 0.2361, + "step": 1157 + }, + { + "epoch": 4.760534429599177, + "grad_norm": 0.17896247286039071, + "learning_rate": 2.678165948530143e-07, + "loss": 0.2356, + "step": 1158 + }, + { + "epoch": 4.76464542651593, + "grad_norm": 0.17407845197351318, + "learning_rate": 2.5852207183519885e-07, + "loss": 0.2251, + "step": 1159 + }, + { + "epoch": 4.7687564234326825, + "grad_norm": 0.17062892002775074, + "learning_rate": 2.493906431181392e-07, + "loss": 0.2438, + "step": 1160 + }, + { + "epoch": 4.772867420349435, + "grad_norm": 0.1672404807221335, + "learning_rate": 2.4042238414117016e-07, + "loss": 0.2261, + "step": 1161 + }, + { + "epoch": 4.7769784172661875, + "grad_norm": 0.17275226054969903, + "learning_rate": 2.3161736899560249e-07, + "loss": 0.2394, + "step": 1162 + }, + { + "epoch": 4.781089414182939, + "grad_norm": 0.17356630567530312, + "learning_rate": 2.2297567042410372e-07, + "loss": 0.2345, + "step": 1163 + }, + { + "epoch": 4.785200411099692, + "grad_norm": 0.17592520932271608, + "learning_rate": 2.1449735982010278e-07, + "loss": 0.2431, + "step": 1164 + }, + { + "epoch": 4.789311408016444, + "grad_norm": 0.17039915938031028, + "learning_rate": 2.0618250722719501e-07, + "loss": 0.2431, + "step": 1165 + }, + { + "epoch": 4.793422404933196, + "grad_norm": 0.1730081549716055, + "learning_rate": 1.9803118133857157e-07, + "loss": 0.2486, + "step": 1166 + }, + { + "epoch": 4.7975334018499485, + "grad_norm": 0.17038132685762578, + "learning_rate": 1.9004344949644425e-07, + "loss": 0.2409, + "step": 1167 + }, + { + "epoch": 4.801644398766701, + "grad_norm": 0.1743699418678854, + "learning_rate": 1.8221937769149045e-07, + "loss": 0.2365, + "step": 1168 + }, + { + "epoch": 4.805755395683454, + "grad_norm": 0.16868706789230756, + "learning_rate": 1.745590305623157e-07, + "loss": 0.2415, + "step": 1169 + }, + { + "epoch": 4.809866392600205, + "grad_norm": 0.17209405727243507, + "learning_rate": 1.6706247139490318e-07, + "loss": 0.2434, + "step": 1170 + }, + { + "epoch": 4.813977389516958, + "grad_norm": 0.17768294930126322, + "learning_rate": 1.5972976212211388e-07, + "loss": 0.2333, + "step": 1171 + }, + { + "epoch": 4.81808838643371, + "grad_norm": 0.1692714430601308, + "learning_rate": 1.525609633231495e-07, + "loss": 0.247, + "step": 1172 + }, + { + "epoch": 4.822199383350463, + "grad_norm": 0.16825815560990823, + "learning_rate": 1.455561342230749e-07, + "loss": 0.249, + "step": 1173 + }, + { + "epoch": 4.8263103802672145, + "grad_norm": 0.17141290386405647, + "learning_rate": 1.3871533269231187e-07, + "loss": 0.2547, + "step": 1174 + }, + { + "epoch": 4.830421377183967, + "grad_norm": 0.17040562779973573, + "learning_rate": 1.3203861524617278e-07, + "loss": 0.2519, + "step": 1175 + }, + { + "epoch": 4.83453237410072, + "grad_norm": 0.1677164586091278, + "learning_rate": 1.2552603704438115e-07, + "loss": 0.2334, + "step": 1176 + }, + { + "epoch": 4.838643371017472, + "grad_norm": 0.1695089968957749, + "learning_rate": 1.1917765189063402e-07, + "loss": 0.243, + "step": 1177 + }, + { + "epoch": 4.842754367934224, + "grad_norm": 0.17083241390302353, + "learning_rate": 1.1299351223214017e-07, + "loss": 0.2349, + "step": 1178 + }, + { + "epoch": 4.846865364850976, + "grad_norm": 0.17067355512311883, + "learning_rate": 1.069736691591916e-07, + "loss": 0.2392, + "step": 1179 + }, + { + "epoch": 4.850976361767729, + "grad_norm": 0.16913426842634588, + "learning_rate": 1.0111817240475052e-07, + "loss": 0.23, + "step": 1180 + }, + { + "epoch": 4.8550873586844805, + "grad_norm": 0.17062118220717767, + "learning_rate": 9.542707034402299e-08, + "loss": 0.2358, + "step": 1181 + }, + { + "epoch": 4.859198355601233, + "grad_norm": 0.17029660865240914, + "learning_rate": 8.990040999407701e-08, + "loss": 0.2302, + "step": 1182 + }, + { + "epoch": 4.863309352517986, + "grad_norm": 0.16773211882510938, + "learning_rate": 8.453823701343622e-08, + "loss": 0.245, + "step": 1183 + }, + { + "epoch": 4.867420349434738, + "grad_norm": 0.16985639544523204, + "learning_rate": 7.93405957017157e-08, + "loss": 0.2345, + "step": 1184 + }, + { + "epoch": 4.871531346351491, + "grad_norm": 0.16908464736483494, + "learning_rate": 7.430752899924898e-08, + "loss": 0.2413, + "step": 1185 + }, + { + "epoch": 4.875642343268242, + "grad_norm": 0.1759346258041749, + "learning_rate": 6.943907848673937e-08, + "loss": 0.2427, + "step": 1186 + }, + { + "epoch": 4.879753340184995, + "grad_norm": 0.17174700886638744, + "learning_rate": 6.473528438490916e-08, + "loss": 0.2439, + "step": 1187 + }, + { + "epoch": 4.883864337101747, + "grad_norm": 0.19693454366502605, + "learning_rate": 6.019618555417328e-08, + "loss": 0.2377, + "step": 1188 + }, + { + "epoch": 4.887975334018499, + "grad_norm": 0.17754029524221127, + "learning_rate": 5.58218194943172e-08, + "loss": 0.2293, + "step": 1189 + }, + { + "epoch": 4.892086330935252, + "grad_norm": 0.1700686268287155, + "learning_rate": 5.161222234418173e-08, + "loss": 0.2416, + "step": 1190 + }, + { + "epoch": 4.896197327852004, + "grad_norm": 0.1778073111241822, + "learning_rate": 4.756742888136989e-08, + "loss": 0.245, + "step": 1191 + }, + { + "epoch": 4.900308324768757, + "grad_norm": 0.17085644813691567, + "learning_rate": 4.3687472521962704e-08, + "loss": 0.2386, + "step": 1192 + }, + { + "epoch": 4.904419321685509, + "grad_norm": 0.17614275321803505, + "learning_rate": 3.997238532023273e-08, + "loss": 0.2378, + "step": 1193 + }, + { + "epoch": 4.908530318602261, + "grad_norm": 0.16573489640277006, + "learning_rate": 3.642219796839097e-08, + "loss": 0.2386, + "step": 1194 + }, + { + "epoch": 4.912641315519013, + "grad_norm": 0.16678550464295586, + "learning_rate": 3.303693979632039e-08, + "loss": 0.2411, + "step": 1195 + }, + { + "epoch": 4.916752312435766, + "grad_norm": 0.1683860743917499, + "learning_rate": 2.981663877134944e-08, + "loss": 0.2443, + "step": 1196 + }, + { + "epoch": 4.920863309352518, + "grad_norm": 0.16597729886542864, + "learning_rate": 2.6761321498005587e-08, + "loss": 0.2408, + "step": 1197 + }, + { + "epoch": 4.92497430626927, + "grad_norm": 0.1718127781713061, + "learning_rate": 2.3871013217806605e-08, + "loss": 0.2412, + "step": 1198 + }, + { + "epoch": 4.929085303186023, + "grad_norm": 0.17097349823279034, + "learning_rate": 2.1145737809045162e-08, + "loss": 0.2421, + "step": 1199 + }, + { + "epoch": 4.933196300102775, + "grad_norm": 0.17427216115981084, + "learning_rate": 1.8585517786597894e-08, + "loss": 0.2381, + "step": 1200 + }, + { + "epoch": 4.937307297019527, + "grad_norm": 0.1744216439196038, + "learning_rate": 1.6190374301727762e-08, + "loss": 0.2282, + "step": 1201 + }, + { + "epoch": 4.941418293936279, + "grad_norm": 0.17445833543773084, + "learning_rate": 1.3960327141926411e-08, + "loss": 0.2299, + "step": 1202 + }, + { + "epoch": 4.945529290853032, + "grad_norm": 0.17503271001824838, + "learning_rate": 1.1895394730738751e-08, + "loss": 0.2333, + "step": 1203 + }, + { + "epoch": 4.9496402877697845, + "grad_norm": 0.1689882913201602, + "learning_rate": 9.995594127607534e-09, + "loss": 0.2426, + "step": 1204 + }, + { + "epoch": 4.953751284686536, + "grad_norm": 0.17067106138398636, + "learning_rate": 8.260941027746772e-09, + "loss": 0.2477, + "step": 1205 + }, + { + "epoch": 4.957862281603289, + "grad_norm": 0.17523534599452864, + "learning_rate": 6.6914497619996465e-09, + "loss": 0.2362, + "step": 1206 + }, + { + "epoch": 4.961973278520041, + "grad_norm": 0.16958624262500685, + "learning_rate": 5.287133296723035e-09, + "loss": 0.2416, + "step": 1207 + }, + { + "epoch": 4.966084275436794, + "grad_norm": 0.17141643376988966, + "learning_rate": 4.048003233687592e-09, + "loss": 0.2319, + "step": 1208 + }, + { + "epoch": 4.970195272353545, + "grad_norm": 0.17444671360259928, + "learning_rate": 2.974069809964508e-09, + "loss": 0.2442, + "step": 1209 + }, + { + "epoch": 4.974306269270298, + "grad_norm": 0.16938004185341077, + "learning_rate": 2.065341897865558e-09, + "loss": 0.2482, + "step": 1210 + }, + { + "epoch": 4.9784172661870505, + "grad_norm": 0.16777811457050917, + "learning_rate": 1.32182700484762e-09, + "loss": 0.2465, + "step": 1211 + }, + { + "epoch": 4.982528263103803, + "grad_norm": 0.17657705108779342, + "learning_rate": 7.435312734593858e-10, + "loss": 0.2395, + "step": 1212 + }, + { + "epoch": 4.986639260020555, + "grad_norm": 0.16972832488616352, + "learning_rate": 3.304594812991724e-10, + "loss": 0.2413, + "step": 1213 + }, + { + "epoch": 4.990750256937307, + "grad_norm": 0.16507566272358404, + "learning_rate": 8.261504095496976e-11, + "loss": 0.2336, + "step": 1214 + }, + { + "epoch": 4.99486125385406, + "grad_norm": 0.17100744078272323, + "learning_rate": 0.0, + "loss": 0.2357, + "step": 1215 + }, + { + "epoch": 4.99486125385406, + "step": 1215, + "total_flos": 4.757804886857613e+18, + "train_loss": 0.34405366748939326, + "train_runtime": 28473.715, + "train_samples_per_second": 5.463, + "train_steps_per_second": 0.043 + } + ], + "logging_steps": 1, + "max_steps": 1215, + "num_input_tokens_seen": 0, + "num_train_epochs": 5, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 4.757804886857613e+18, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}