d1_math_long_paragraphs / trainer_state.json
neginr's picture
End of training
08a31e0 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.99486125385406,
"eval_steps": 500,
"global_step": 1215,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0041109969167523125,
"grad_norm": 6.356808310456779,
"learning_rate": 3.278688524590164e-07,
"loss": 0.82,
"step": 1
},
{
"epoch": 0.008221993833504625,
"grad_norm": 6.417518537271285,
"learning_rate": 6.557377049180328e-07,
"loss": 0.8218,
"step": 2
},
{
"epoch": 0.012332990750256937,
"grad_norm": 6.266225501769135,
"learning_rate": 9.836065573770493e-07,
"loss": 0.7983,
"step": 3
},
{
"epoch": 0.01644398766700925,
"grad_norm": 6.059283773388311,
"learning_rate": 1.3114754098360657e-06,
"loss": 0.7971,
"step": 4
},
{
"epoch": 0.020554984583761562,
"grad_norm": 5.8622860351950585,
"learning_rate": 1.6393442622950819e-06,
"loss": 0.811,
"step": 5
},
{
"epoch": 0.024665981500513873,
"grad_norm": 5.599946959024431,
"learning_rate": 1.9672131147540985e-06,
"loss": 0.7955,
"step": 6
},
{
"epoch": 0.02877697841726619,
"grad_norm": 4.404619948223283,
"learning_rate": 2.295081967213115e-06,
"loss": 0.7786,
"step": 7
},
{
"epoch": 0.0328879753340185,
"grad_norm": 3.922409083964933,
"learning_rate": 2.6229508196721314e-06,
"loss": 0.7401,
"step": 8
},
{
"epoch": 0.03699897225077081,
"grad_norm": 2.2688697860924765,
"learning_rate": 2.9508196721311478e-06,
"loss": 0.7309,
"step": 9
},
{
"epoch": 0.041109969167523124,
"grad_norm": 2.066008891071192,
"learning_rate": 3.2786885245901638e-06,
"loss": 0.719,
"step": 10
},
{
"epoch": 0.045220966084275435,
"grad_norm": 1.9201877271634715,
"learning_rate": 3.6065573770491806e-06,
"loss": 0.7258,
"step": 11
},
{
"epoch": 0.04933196300102775,
"grad_norm": 3.898309985774545,
"learning_rate": 3.934426229508197e-06,
"loss": 0.7205,
"step": 12
},
{
"epoch": 0.05344295991778006,
"grad_norm": 4.0353502189938855,
"learning_rate": 4.2622950819672135e-06,
"loss": 0.7192,
"step": 13
},
{
"epoch": 0.05755395683453238,
"grad_norm": 4.121703890013303,
"learning_rate": 4.59016393442623e-06,
"loss": 0.7244,
"step": 14
},
{
"epoch": 0.06166495375128469,
"grad_norm": 3.928913042484364,
"learning_rate": 4.918032786885246e-06,
"loss": 0.6995,
"step": 15
},
{
"epoch": 0.065775950668037,
"grad_norm": 3.142521725483005,
"learning_rate": 5.245901639344263e-06,
"loss": 0.6643,
"step": 16
},
{
"epoch": 0.0698869475847893,
"grad_norm": 2.847209107159321,
"learning_rate": 5.573770491803278e-06,
"loss": 0.6531,
"step": 17
},
{
"epoch": 0.07399794450154162,
"grad_norm": 2.2003209693474126,
"learning_rate": 5.9016393442622956e-06,
"loss": 0.66,
"step": 18
},
{
"epoch": 0.07810894141829394,
"grad_norm": 1.336519700469157,
"learning_rate": 6.229508196721312e-06,
"loss": 0.6164,
"step": 19
},
{
"epoch": 0.08221993833504625,
"grad_norm": 1.2400729726657767,
"learning_rate": 6.5573770491803276e-06,
"loss": 0.6071,
"step": 20
},
{
"epoch": 0.08633093525179857,
"grad_norm": 1.4350908797178215,
"learning_rate": 6.885245901639345e-06,
"loss": 0.6066,
"step": 21
},
{
"epoch": 0.09044193216855087,
"grad_norm": 1.4014758782073495,
"learning_rate": 7.213114754098361e-06,
"loss": 0.5995,
"step": 22
},
{
"epoch": 0.09455292908530319,
"grad_norm": 1.1237552991193,
"learning_rate": 7.540983606557377e-06,
"loss": 0.5895,
"step": 23
},
{
"epoch": 0.0986639260020555,
"grad_norm": 0.8497080531691873,
"learning_rate": 7.868852459016394e-06,
"loss": 0.588,
"step": 24
},
{
"epoch": 0.10277492291880781,
"grad_norm": 0.9384127005244138,
"learning_rate": 8.19672131147541e-06,
"loss": 0.5767,
"step": 25
},
{
"epoch": 0.10688591983556012,
"grad_norm": 0.8190287043666049,
"learning_rate": 8.524590163934427e-06,
"loss": 0.5698,
"step": 26
},
{
"epoch": 0.11099691675231244,
"grad_norm": 0.6808839486547595,
"learning_rate": 8.852459016393443e-06,
"loss": 0.5795,
"step": 27
},
{
"epoch": 0.11510791366906475,
"grad_norm": 0.7939592915072008,
"learning_rate": 9.18032786885246e-06,
"loss": 0.5642,
"step": 28
},
{
"epoch": 0.11921891058581706,
"grad_norm": 0.7673957275771759,
"learning_rate": 9.508196721311476e-06,
"loss": 0.5505,
"step": 29
},
{
"epoch": 0.12332990750256938,
"grad_norm": 0.542256505411903,
"learning_rate": 9.836065573770493e-06,
"loss": 0.5525,
"step": 30
},
{
"epoch": 0.12744090441932168,
"grad_norm": 0.5835090756188929,
"learning_rate": 1.0163934426229509e-05,
"loss": 0.5394,
"step": 31
},
{
"epoch": 0.131551901336074,
"grad_norm": 2.7254227477748034,
"learning_rate": 1.0491803278688525e-05,
"loss": 0.587,
"step": 32
},
{
"epoch": 0.13566289825282632,
"grad_norm": 0.869479260973271,
"learning_rate": 1.0819672131147544e-05,
"loss": 0.5492,
"step": 33
},
{
"epoch": 0.1397738951695786,
"grad_norm": 0.5177682668145469,
"learning_rate": 1.1147540983606557e-05,
"loss": 0.5325,
"step": 34
},
{
"epoch": 0.14388489208633093,
"grad_norm": 0.511981609333191,
"learning_rate": 1.1475409836065575e-05,
"loss": 0.5486,
"step": 35
},
{
"epoch": 0.14799588900308325,
"grad_norm": 0.5836498681132752,
"learning_rate": 1.1803278688524591e-05,
"loss": 0.5391,
"step": 36
},
{
"epoch": 0.15210688591983557,
"grad_norm": 0.6259670640713604,
"learning_rate": 1.2131147540983608e-05,
"loss": 0.5342,
"step": 37
},
{
"epoch": 0.15621788283658788,
"grad_norm": 0.4863673175391185,
"learning_rate": 1.2459016393442624e-05,
"loss": 0.5202,
"step": 38
},
{
"epoch": 0.16032887975334018,
"grad_norm": 0.46984524173771686,
"learning_rate": 1.2786885245901642e-05,
"loss": 0.5275,
"step": 39
},
{
"epoch": 0.1644398766700925,
"grad_norm": 0.4373961614640998,
"learning_rate": 1.3114754098360655e-05,
"loss": 0.5259,
"step": 40
},
{
"epoch": 0.1685508735868448,
"grad_norm": 0.5382826043776142,
"learning_rate": 1.3442622950819673e-05,
"loss": 0.5289,
"step": 41
},
{
"epoch": 0.17266187050359713,
"grad_norm": 0.49858273312303336,
"learning_rate": 1.377049180327869e-05,
"loss": 0.5034,
"step": 42
},
{
"epoch": 0.17677286742034942,
"grad_norm": 0.4565898955129958,
"learning_rate": 1.4098360655737706e-05,
"loss": 0.4992,
"step": 43
},
{
"epoch": 0.18088386433710174,
"grad_norm": 0.4250478862858475,
"learning_rate": 1.4426229508196722e-05,
"loss": 0.5099,
"step": 44
},
{
"epoch": 0.18499486125385406,
"grad_norm": 0.5450828551582885,
"learning_rate": 1.4754098360655739e-05,
"loss": 0.5133,
"step": 45
},
{
"epoch": 0.18910585817060638,
"grad_norm": 0.5082095741416903,
"learning_rate": 1.5081967213114754e-05,
"loss": 0.507,
"step": 46
},
{
"epoch": 0.1932168550873587,
"grad_norm": 0.399668921358076,
"learning_rate": 1.5409836065573772e-05,
"loss": 0.499,
"step": 47
},
{
"epoch": 0.197327852004111,
"grad_norm": 0.5857374299092792,
"learning_rate": 1.5737704918032788e-05,
"loss": 0.5182,
"step": 48
},
{
"epoch": 0.2014388489208633,
"grad_norm": 0.4337541604951673,
"learning_rate": 1.6065573770491805e-05,
"loss": 0.5048,
"step": 49
},
{
"epoch": 0.20554984583761562,
"grad_norm": 0.675490041268254,
"learning_rate": 1.639344262295082e-05,
"loss": 0.5091,
"step": 50
},
{
"epoch": 0.20966084275436794,
"grad_norm": 0.44682409800475936,
"learning_rate": 1.6721311475409837e-05,
"loss": 0.4948,
"step": 51
},
{
"epoch": 0.21377183967112023,
"grad_norm": 0.5243379991172152,
"learning_rate": 1.7049180327868854e-05,
"loss": 0.4959,
"step": 52
},
{
"epoch": 0.21788283658787255,
"grad_norm": 0.48058870125487607,
"learning_rate": 1.737704918032787e-05,
"loss": 0.493,
"step": 53
},
{
"epoch": 0.22199383350462487,
"grad_norm": 0.4009755414381969,
"learning_rate": 1.7704918032786887e-05,
"loss": 0.5005,
"step": 54
},
{
"epoch": 0.2261048304213772,
"grad_norm": 0.4487072583979547,
"learning_rate": 1.8032786885245903e-05,
"loss": 0.5,
"step": 55
},
{
"epoch": 0.2302158273381295,
"grad_norm": 0.43431903464010596,
"learning_rate": 1.836065573770492e-05,
"loss": 0.4822,
"step": 56
},
{
"epoch": 0.2343268242548818,
"grad_norm": 0.4223425144399419,
"learning_rate": 1.8688524590163936e-05,
"loss": 0.4884,
"step": 57
},
{
"epoch": 0.23843782117163412,
"grad_norm": 0.40422238831771906,
"learning_rate": 1.9016393442622952e-05,
"loss": 0.5064,
"step": 58
},
{
"epoch": 0.24254881808838644,
"grad_norm": 0.4353031683109967,
"learning_rate": 1.934426229508197e-05,
"loss": 0.4844,
"step": 59
},
{
"epoch": 0.24665981500513876,
"grad_norm": 0.5063299442881862,
"learning_rate": 1.9672131147540985e-05,
"loss": 0.4871,
"step": 60
},
{
"epoch": 0.25077081192189105,
"grad_norm": 0.6405429501414496,
"learning_rate": 2e-05,
"loss": 0.4916,
"step": 61
},
{
"epoch": 0.25488180883864336,
"grad_norm": 0.7398107392403913,
"learning_rate": 2.0327868852459018e-05,
"loss": 0.4967,
"step": 62
},
{
"epoch": 0.2589928057553957,
"grad_norm": 0.6066259496387154,
"learning_rate": 2.0655737704918034e-05,
"loss": 0.4933,
"step": 63
},
{
"epoch": 0.263103802672148,
"grad_norm": 0.6888615660905145,
"learning_rate": 2.098360655737705e-05,
"loss": 0.4849,
"step": 64
},
{
"epoch": 0.2672147995889003,
"grad_norm": 0.6046305786161926,
"learning_rate": 2.1311475409836067e-05,
"loss": 0.4997,
"step": 65
},
{
"epoch": 0.27132579650565264,
"grad_norm": 0.4755750596713722,
"learning_rate": 2.1639344262295087e-05,
"loss": 0.484,
"step": 66
},
{
"epoch": 0.27543679342240496,
"grad_norm": 0.4901884477105443,
"learning_rate": 2.1967213114754104e-05,
"loss": 0.4714,
"step": 67
},
{
"epoch": 0.2795477903391572,
"grad_norm": 0.5180862601664822,
"learning_rate": 2.2295081967213113e-05,
"loss": 0.4743,
"step": 68
},
{
"epoch": 0.28365878725590954,
"grad_norm": 0.6341799796360953,
"learning_rate": 2.2622950819672133e-05,
"loss": 0.4837,
"step": 69
},
{
"epoch": 0.28776978417266186,
"grad_norm": 0.7050713511862262,
"learning_rate": 2.295081967213115e-05,
"loss": 0.4732,
"step": 70
},
{
"epoch": 0.2918807810894142,
"grad_norm": 0.46520327730925665,
"learning_rate": 2.3278688524590166e-05,
"loss": 0.4763,
"step": 71
},
{
"epoch": 0.2959917780061665,
"grad_norm": 0.46570649065351716,
"learning_rate": 2.3606557377049182e-05,
"loss": 0.4729,
"step": 72
},
{
"epoch": 0.3001027749229188,
"grad_norm": 0.5435122355995184,
"learning_rate": 2.39344262295082e-05,
"loss": 0.4673,
"step": 73
},
{
"epoch": 0.30421377183967113,
"grad_norm": 0.7024832057525984,
"learning_rate": 2.4262295081967215e-05,
"loss": 0.4685,
"step": 74
},
{
"epoch": 0.30832476875642345,
"grad_norm": 0.5982496336902186,
"learning_rate": 2.459016393442623e-05,
"loss": 0.4683,
"step": 75
},
{
"epoch": 0.31243576567317577,
"grad_norm": 0.5579092038957036,
"learning_rate": 2.4918032786885248e-05,
"loss": 0.4818,
"step": 76
},
{
"epoch": 0.31654676258992803,
"grad_norm": 0.75454502368708,
"learning_rate": 2.5245901639344264e-05,
"loss": 0.4745,
"step": 77
},
{
"epoch": 0.32065775950668035,
"grad_norm": 0.9103711158770255,
"learning_rate": 2.5573770491803284e-05,
"loss": 0.4732,
"step": 78
},
{
"epoch": 0.32476875642343267,
"grad_norm": 0.7230510862281725,
"learning_rate": 2.59016393442623e-05,
"loss": 0.4726,
"step": 79
},
{
"epoch": 0.328879753340185,
"grad_norm": 0.661725190625586,
"learning_rate": 2.622950819672131e-05,
"loss": 0.4691,
"step": 80
},
{
"epoch": 0.3329907502569373,
"grad_norm": 1.3040040424420736,
"learning_rate": 2.655737704918033e-05,
"loss": 0.4596,
"step": 81
},
{
"epoch": 0.3371017471736896,
"grad_norm": 0.9247546415389841,
"learning_rate": 2.6885245901639346e-05,
"loss": 0.4687,
"step": 82
},
{
"epoch": 0.34121274409044194,
"grad_norm": 0.6690753031478268,
"learning_rate": 2.7213114754098363e-05,
"loss": 0.475,
"step": 83
},
{
"epoch": 0.34532374100719426,
"grad_norm": 0.8875577066120585,
"learning_rate": 2.754098360655738e-05,
"loss": 0.4779,
"step": 84
},
{
"epoch": 0.3494347379239466,
"grad_norm": 0.9913820671901682,
"learning_rate": 2.7868852459016396e-05,
"loss": 0.4585,
"step": 85
},
{
"epoch": 0.35354573484069884,
"grad_norm": 1.0406026178086218,
"learning_rate": 2.8196721311475412e-05,
"loss": 0.4777,
"step": 86
},
{
"epoch": 0.35765673175745116,
"grad_norm": 0.8568856680996076,
"learning_rate": 2.852459016393443e-05,
"loss": 0.476,
"step": 87
},
{
"epoch": 0.3617677286742035,
"grad_norm": 0.8578450824032388,
"learning_rate": 2.8852459016393445e-05,
"loss": 0.477,
"step": 88
},
{
"epoch": 0.3658787255909558,
"grad_norm": 1.0997371612060205,
"learning_rate": 2.918032786885246e-05,
"loss": 0.4694,
"step": 89
},
{
"epoch": 0.3699897225077081,
"grad_norm": 0.6710185323141514,
"learning_rate": 2.9508196721311478e-05,
"loss": 0.4664,
"step": 90
},
{
"epoch": 0.37410071942446044,
"grad_norm": 0.8753359919001613,
"learning_rate": 2.9836065573770498e-05,
"loss": 0.4675,
"step": 91
},
{
"epoch": 0.37821171634121276,
"grad_norm": 0.9060750756011728,
"learning_rate": 3.0163934426229507e-05,
"loss": 0.4577,
"step": 92
},
{
"epoch": 0.3823227132579651,
"grad_norm": 0.8327145117934229,
"learning_rate": 3.0491803278688527e-05,
"loss": 0.4798,
"step": 93
},
{
"epoch": 0.3864337101747174,
"grad_norm": 0.9953249905867948,
"learning_rate": 3.0819672131147544e-05,
"loss": 0.4588,
"step": 94
},
{
"epoch": 0.39054470709146966,
"grad_norm": 0.790890207442512,
"learning_rate": 3.1147540983606557e-05,
"loss": 0.4813,
"step": 95
},
{
"epoch": 0.394655704008222,
"grad_norm": 0.7071469844879325,
"learning_rate": 3.1475409836065576e-05,
"loss": 0.4715,
"step": 96
},
{
"epoch": 0.3987667009249743,
"grad_norm": 0.683447139315226,
"learning_rate": 3.180327868852459e-05,
"loss": 0.4568,
"step": 97
},
{
"epoch": 0.4028776978417266,
"grad_norm": 0.6863994738211686,
"learning_rate": 3.213114754098361e-05,
"loss": 0.4516,
"step": 98
},
{
"epoch": 0.40698869475847893,
"grad_norm": 0.6443321732944037,
"learning_rate": 3.245901639344263e-05,
"loss": 0.4467,
"step": 99
},
{
"epoch": 0.41109969167523125,
"grad_norm": 0.6015090752114448,
"learning_rate": 3.278688524590164e-05,
"loss": 0.4566,
"step": 100
},
{
"epoch": 0.41521068859198357,
"grad_norm": 0.648925234921687,
"learning_rate": 3.311475409836066e-05,
"loss": 0.4598,
"step": 101
},
{
"epoch": 0.4193216855087359,
"grad_norm": 0.5740497039935356,
"learning_rate": 3.3442622950819675e-05,
"loss": 0.4514,
"step": 102
},
{
"epoch": 0.4234326824254882,
"grad_norm": 0.7433508320080534,
"learning_rate": 3.3770491803278695e-05,
"loss": 0.4555,
"step": 103
},
{
"epoch": 0.42754367934224047,
"grad_norm": 0.9786371138605869,
"learning_rate": 3.409836065573771e-05,
"loss": 0.4724,
"step": 104
},
{
"epoch": 0.4316546762589928,
"grad_norm": 1.16381322551552,
"learning_rate": 3.442622950819672e-05,
"loss": 0.4665,
"step": 105
},
{
"epoch": 0.4357656731757451,
"grad_norm": 0.7033574666436274,
"learning_rate": 3.475409836065574e-05,
"loss": 0.4741,
"step": 106
},
{
"epoch": 0.4398766700924974,
"grad_norm": 1.256476593209221,
"learning_rate": 3.5081967213114754e-05,
"loss": 0.476,
"step": 107
},
{
"epoch": 0.44398766700924974,
"grad_norm": 0.5933957475473355,
"learning_rate": 3.5409836065573773e-05,
"loss": 0.4653,
"step": 108
},
{
"epoch": 0.44809866392600206,
"grad_norm": 1.025564753787377,
"learning_rate": 3.5737704918032786e-05,
"loss": 0.47,
"step": 109
},
{
"epoch": 0.4522096608427544,
"grad_norm": 1.0088674998209484,
"learning_rate": 3.6065573770491806e-05,
"loss": 0.4681,
"step": 110
},
{
"epoch": 0.4563206577595067,
"grad_norm": 0.9216004942062503,
"learning_rate": 3.6393442622950826e-05,
"loss": 0.4546,
"step": 111
},
{
"epoch": 0.460431654676259,
"grad_norm": 1.1054709646558805,
"learning_rate": 3.672131147540984e-05,
"loss": 0.4669,
"step": 112
},
{
"epoch": 0.4645426515930113,
"grad_norm": 0.6642218594282759,
"learning_rate": 3.704918032786886e-05,
"loss": 0.4533,
"step": 113
},
{
"epoch": 0.4686536485097636,
"grad_norm": 0.8356269646157981,
"learning_rate": 3.737704918032787e-05,
"loss": 0.4599,
"step": 114
},
{
"epoch": 0.4727646454265159,
"grad_norm": 1.1650429141300205,
"learning_rate": 3.770491803278689e-05,
"loss": 0.448,
"step": 115
},
{
"epoch": 0.47687564234326824,
"grad_norm": 0.6212175962293394,
"learning_rate": 3.8032786885245905e-05,
"loss": 0.4638,
"step": 116
},
{
"epoch": 0.48098663926002055,
"grad_norm": 1.1965895951813037,
"learning_rate": 3.836065573770492e-05,
"loss": 0.4619,
"step": 117
},
{
"epoch": 0.4850976361767729,
"grad_norm": 0.8457976781943612,
"learning_rate": 3.868852459016394e-05,
"loss": 0.4555,
"step": 118
},
{
"epoch": 0.4892086330935252,
"grad_norm": 0.8463559301031214,
"learning_rate": 3.901639344262295e-05,
"loss": 0.4713,
"step": 119
},
{
"epoch": 0.4933196300102775,
"grad_norm": 0.6654609587793014,
"learning_rate": 3.934426229508197e-05,
"loss": 0.4461,
"step": 120
},
{
"epoch": 0.49743062692702983,
"grad_norm": 0.7698090467763701,
"learning_rate": 3.9672131147540983e-05,
"loss": 0.4627,
"step": 121
},
{
"epoch": 0.5015416238437821,
"grad_norm": 0.5716155461137187,
"learning_rate": 4e-05,
"loss": 0.4576,
"step": 122
},
{
"epoch": 0.5056526207605344,
"grad_norm": 0.5151376433722467,
"learning_rate": 3.999991738495905e-05,
"loss": 0.4485,
"step": 123
},
{
"epoch": 0.5097636176772867,
"grad_norm": 0.561427237450996,
"learning_rate": 3.9999669540518704e-05,
"loss": 0.454,
"step": 124
},
{
"epoch": 0.513874614594039,
"grad_norm": 0.6553799163893537,
"learning_rate": 3.999925646872655e-05,
"loss": 0.4523,
"step": 125
},
{
"epoch": 0.5179856115107914,
"grad_norm": 0.7909652053854684,
"learning_rate": 3.9998678172995157e-05,
"loss": 0.4544,
"step": 126
},
{
"epoch": 0.5220966084275437,
"grad_norm": 0.8388721187199466,
"learning_rate": 3.999793465810214e-05,
"loss": 0.4408,
"step": 127
},
{
"epoch": 0.526207605344296,
"grad_norm": 0.7373151231076792,
"learning_rate": 3.999702593019004e-05,
"loss": 0.4596,
"step": 128
},
{
"epoch": 0.5303186022610483,
"grad_norm": 0.9546826007376602,
"learning_rate": 3.9995951996766316e-05,
"loss": 0.459,
"step": 129
},
{
"epoch": 0.5344295991778006,
"grad_norm": 0.9027549638128062,
"learning_rate": 3.999471286670328e-05,
"loss": 0.4537,
"step": 130
},
{
"epoch": 0.538540596094553,
"grad_norm": 1.0477189023005884,
"learning_rate": 3.9993308550238e-05,
"loss": 0.4455,
"step": 131
},
{
"epoch": 0.5426515930113053,
"grad_norm": 1.1744648733550076,
"learning_rate": 3.999173905897226e-05,
"loss": 0.4579,
"step": 132
},
{
"epoch": 0.5467625899280576,
"grad_norm": 0.7693181453420259,
"learning_rate": 3.99900044058724e-05,
"loss": 0.4381,
"step": 133
},
{
"epoch": 0.5508735868448099,
"grad_norm": 0.8932998819929917,
"learning_rate": 3.998810460526927e-05,
"loss": 0.4663,
"step": 134
},
{
"epoch": 0.5549845837615622,
"grad_norm": 1.1927529473551686,
"learning_rate": 3.998603967285808e-05,
"loss": 0.456,
"step": 135
},
{
"epoch": 0.5590955806783144,
"grad_norm": 0.7303627266940724,
"learning_rate": 3.998380962569828e-05,
"loss": 0.463,
"step": 136
},
{
"epoch": 0.5632065775950668,
"grad_norm": 1.211411577139644,
"learning_rate": 3.9981414482213405e-05,
"loss": 0.4649,
"step": 137
},
{
"epoch": 0.5673175745118191,
"grad_norm": 0.8811081756810052,
"learning_rate": 3.997885426219096e-05,
"loss": 0.4637,
"step": 138
},
{
"epoch": 0.5714285714285714,
"grad_norm": 0.8713984959209414,
"learning_rate": 3.99761289867822e-05,
"loss": 0.4546,
"step": 139
},
{
"epoch": 0.5755395683453237,
"grad_norm": 0.9788586712835476,
"learning_rate": 3.9973238678501996e-05,
"loss": 0.4475,
"step": 140
},
{
"epoch": 0.579650565262076,
"grad_norm": 0.7533682906447463,
"learning_rate": 3.997018336122866e-05,
"loss": 0.4428,
"step": 141
},
{
"epoch": 0.5837615621788284,
"grad_norm": 0.7461312001689725,
"learning_rate": 3.9966963060203684e-05,
"loss": 0.4494,
"step": 142
},
{
"epoch": 0.5878725590955807,
"grad_norm": 0.6174187621354268,
"learning_rate": 3.996357780203161e-05,
"loss": 0.4504,
"step": 143
},
{
"epoch": 0.591983556012333,
"grad_norm": 0.49003513931944637,
"learning_rate": 3.9960027614679766e-05,
"loss": 0.4427,
"step": 144
},
{
"epoch": 0.5960945529290853,
"grad_norm": 0.5605653135696967,
"learning_rate": 3.995631252747804e-05,
"loss": 0.448,
"step": 145
},
{
"epoch": 0.6002055498458376,
"grad_norm": 0.44488928569515496,
"learning_rate": 3.9952432571118634e-05,
"loss": 0.4467,
"step": 146
},
{
"epoch": 0.60431654676259,
"grad_norm": 0.5026250568866101,
"learning_rate": 3.994838777765582e-05,
"loss": 0.4477,
"step": 147
},
{
"epoch": 0.6084275436793423,
"grad_norm": 0.5172580934087974,
"learning_rate": 3.9944178180505685e-05,
"loss": 0.4523,
"step": 148
},
{
"epoch": 0.6125385405960946,
"grad_norm": 0.42653534399915305,
"learning_rate": 3.993980381444583e-05,
"loss": 0.4461,
"step": 149
},
{
"epoch": 0.6166495375128469,
"grad_norm": 0.5471101554510113,
"learning_rate": 3.993526471561509e-05,
"loss": 0.4434,
"step": 150
},
{
"epoch": 0.6207605344295992,
"grad_norm": 0.44006741961102114,
"learning_rate": 3.993056092151326e-05,
"loss": 0.4433,
"step": 151
},
{
"epoch": 0.6248715313463515,
"grad_norm": 0.54547408589461,
"learning_rate": 3.9925692471000755e-05,
"loss": 0.4512,
"step": 152
},
{
"epoch": 0.6289825282631039,
"grad_norm": 0.5069055596939723,
"learning_rate": 3.9920659404298285e-05,
"loss": 0.4407,
"step": 153
},
{
"epoch": 0.6330935251798561,
"grad_norm": 0.4361985691247893,
"learning_rate": 3.991546176298657e-05,
"loss": 0.443,
"step": 154
},
{
"epoch": 0.6372045220966084,
"grad_norm": 0.6492040966314581,
"learning_rate": 3.991009959000593e-05,
"loss": 0.4643,
"step": 155
},
{
"epoch": 0.6413155190133607,
"grad_norm": 0.662599709889411,
"learning_rate": 3.990457292965598e-05,
"loss": 0.4421,
"step": 156
},
{
"epoch": 0.645426515930113,
"grad_norm": 0.7496440633893157,
"learning_rate": 3.9898881827595255e-05,
"loss": 0.4475,
"step": 157
},
{
"epoch": 0.6495375128468653,
"grad_norm": 0.6565888966700488,
"learning_rate": 3.989302633084081e-05,
"loss": 0.4478,
"step": 158
},
{
"epoch": 0.6536485097636177,
"grad_norm": 0.4914282437943478,
"learning_rate": 3.988700648776786e-05,
"loss": 0.4485,
"step": 159
},
{
"epoch": 0.65775950668037,
"grad_norm": 0.5765645279846272,
"learning_rate": 3.9880822348109365e-05,
"loss": 0.4406,
"step": 160
},
{
"epoch": 0.6618705035971223,
"grad_norm": 0.54508458239329,
"learning_rate": 3.9874473962955625e-05,
"loss": 0.4377,
"step": 161
},
{
"epoch": 0.6659815005138746,
"grad_norm": 0.5467537029004771,
"learning_rate": 3.986796138475383e-05,
"loss": 0.4404,
"step": 162
},
{
"epoch": 0.6700924974306269,
"grad_norm": 0.5927993483965615,
"learning_rate": 3.986128466730769e-05,
"loss": 0.4397,
"step": 163
},
{
"epoch": 0.6742034943473793,
"grad_norm": 0.549384325928676,
"learning_rate": 3.985444386577693e-05,
"loss": 0.4447,
"step": 164
},
{
"epoch": 0.6783144912641316,
"grad_norm": 0.7853091838958088,
"learning_rate": 3.984743903667685e-05,
"loss": 0.4323,
"step": 165
},
{
"epoch": 0.6824254881808839,
"grad_norm": 0.7452346665717267,
"learning_rate": 3.984027023787789e-05,
"loss": 0.4418,
"step": 166
},
{
"epoch": 0.6865364850976362,
"grad_norm": 0.6354950658254404,
"learning_rate": 3.98329375286051e-05,
"loss": 0.4462,
"step": 167
},
{
"epoch": 0.6906474820143885,
"grad_norm": 0.6039082662120046,
"learning_rate": 3.982544096943769e-05,
"loss": 0.4387,
"step": 168
},
{
"epoch": 0.6947584789311408,
"grad_norm": 0.658037323729766,
"learning_rate": 3.9817780622308515e-05,
"loss": 0.4442,
"step": 169
},
{
"epoch": 0.6988694758478932,
"grad_norm": 0.4757882993447632,
"learning_rate": 3.980995655050356e-05,
"loss": 0.4432,
"step": 170
},
{
"epoch": 0.7029804727646455,
"grad_norm": 0.44532932025468364,
"learning_rate": 3.980196881866143e-05,
"loss": 0.4414,
"step": 171
},
{
"epoch": 0.7070914696813977,
"grad_norm": 0.45179948654666446,
"learning_rate": 3.9793817492772806e-05,
"loss": 0.4509,
"step": 172
},
{
"epoch": 0.71120246659815,
"grad_norm": 0.4699683428704349,
"learning_rate": 3.9785502640179905e-05,
"loss": 0.4278,
"step": 173
},
{
"epoch": 0.7153134635149023,
"grad_norm": 0.5201025709025265,
"learning_rate": 3.97770243295759e-05,
"loss": 0.4335,
"step": 174
},
{
"epoch": 0.7194244604316546,
"grad_norm": 0.47266696898911464,
"learning_rate": 3.9768382631004405e-05,
"loss": 0.4501,
"step": 175
},
{
"epoch": 0.723535457348407,
"grad_norm": 0.4422465802779614,
"learning_rate": 3.975957761585883e-05,
"loss": 0.446,
"step": 176
},
{
"epoch": 0.7276464542651593,
"grad_norm": 0.5391358485913682,
"learning_rate": 3.9750609356881865e-05,
"loss": 0.4512,
"step": 177
},
{
"epoch": 0.7317574511819116,
"grad_norm": 0.46555411315299916,
"learning_rate": 3.974147792816481e-05,
"loss": 0.4374,
"step": 178
},
{
"epoch": 0.7358684480986639,
"grad_norm": 0.5347959929577083,
"learning_rate": 3.9732183405146984e-05,
"loss": 0.4368,
"step": 179
},
{
"epoch": 0.7399794450154162,
"grad_norm": 0.5543063250157177,
"learning_rate": 3.9722725864615156e-05,
"loss": 0.4468,
"step": 180
},
{
"epoch": 0.7440904419321686,
"grad_norm": 0.39521704775907723,
"learning_rate": 3.971310538470282e-05,
"loss": 0.4338,
"step": 181
},
{
"epoch": 0.7482014388489209,
"grad_norm": 0.47237629534672426,
"learning_rate": 3.9703322044889605e-05,
"loss": 0.4369,
"step": 182
},
{
"epoch": 0.7523124357656732,
"grad_norm": 0.434146415819749,
"learning_rate": 3.969337592600062e-05,
"loss": 0.4458,
"step": 183
},
{
"epoch": 0.7564234326824255,
"grad_norm": 0.38836391572812273,
"learning_rate": 3.968326711020578e-05,
"loss": 0.4546,
"step": 184
},
{
"epoch": 0.7605344295991778,
"grad_norm": 0.34969919974995534,
"learning_rate": 3.967299568101908e-05,
"loss": 0.4459,
"step": 185
},
{
"epoch": 0.7646454265159301,
"grad_norm": 0.41064464728289324,
"learning_rate": 3.9662561723298e-05,
"loss": 0.4326,
"step": 186
},
{
"epoch": 0.7687564234326825,
"grad_norm": 0.4616126051202659,
"learning_rate": 3.9651965323242704e-05,
"loss": 0.4492,
"step": 187
},
{
"epoch": 0.7728674203494348,
"grad_norm": 0.49195669527847435,
"learning_rate": 3.964120656839541e-05,
"loss": 0.4276,
"step": 188
},
{
"epoch": 0.7769784172661871,
"grad_norm": 0.3807633073682157,
"learning_rate": 3.963028554763961e-05,
"loss": 0.4428,
"step": 189
},
{
"epoch": 0.7810894141829393,
"grad_norm": 0.3811255626131261,
"learning_rate": 3.9619202351199356e-05,
"loss": 0.4337,
"step": 190
},
{
"epoch": 0.7852004110996916,
"grad_norm": 0.3612639948436137,
"learning_rate": 3.960795707063852e-05,
"loss": 0.4363,
"step": 191
},
{
"epoch": 0.789311408016444,
"grad_norm": 0.4353027404982674,
"learning_rate": 3.959654979886005e-05,
"loss": 0.4365,
"step": 192
},
{
"epoch": 0.7934224049331963,
"grad_norm": 0.37923924344854587,
"learning_rate": 3.958498063010516e-05,
"loss": 0.4277,
"step": 193
},
{
"epoch": 0.7975334018499486,
"grad_norm": 0.49016416134919827,
"learning_rate": 3.957324965995257e-05,
"loss": 0.4189,
"step": 194
},
{
"epoch": 0.8016443987667009,
"grad_norm": 0.3808642318097945,
"learning_rate": 3.956135698531777e-05,
"loss": 0.428,
"step": 195
},
{
"epoch": 0.8057553956834532,
"grad_norm": 0.4706420424359872,
"learning_rate": 3.9549302704452104e-05,
"loss": 0.4355,
"step": 196
},
{
"epoch": 0.8098663926002055,
"grad_norm": 0.5558683512038307,
"learning_rate": 3.953708691694208e-05,
"loss": 0.4219,
"step": 197
},
{
"epoch": 0.8139773895169579,
"grad_norm": 0.5188467581658631,
"learning_rate": 3.952470972370848e-05,
"loss": 0.4369,
"step": 198
},
{
"epoch": 0.8180883864337102,
"grad_norm": 0.4485136574531589,
"learning_rate": 3.951217122700554e-05,
"loss": 0.4206,
"step": 199
},
{
"epoch": 0.8221993833504625,
"grad_norm": 0.4872982826961068,
"learning_rate": 3.9499471530420086e-05,
"loss": 0.4434,
"step": 200
},
{
"epoch": 0.8263103802672148,
"grad_norm": 0.5704413227159343,
"learning_rate": 3.9486610738870726e-05,
"loss": 0.4332,
"step": 201
},
{
"epoch": 0.8304213771839671,
"grad_norm": 0.6576571504037381,
"learning_rate": 3.947358895860693e-05,
"loss": 0.4282,
"step": 202
},
{
"epoch": 0.8345323741007195,
"grad_norm": 0.5236083635603117,
"learning_rate": 3.9460406297208204e-05,
"loss": 0.4418,
"step": 203
},
{
"epoch": 0.8386433710174718,
"grad_norm": 0.4856398721711883,
"learning_rate": 3.944706286358315e-05,
"loss": 0.4446,
"step": 204
},
{
"epoch": 0.8427543679342241,
"grad_norm": 0.553946219409764,
"learning_rate": 3.94335587679686e-05,
"loss": 0.4421,
"step": 205
},
{
"epoch": 0.8468653648509764,
"grad_norm": 0.559411380541318,
"learning_rate": 3.94198941219287e-05,
"loss": 0.4628,
"step": 206
},
{
"epoch": 0.8509763617677287,
"grad_norm": 0.4879763317857753,
"learning_rate": 3.940606903835398e-05,
"loss": 0.442,
"step": 207
},
{
"epoch": 0.8550873586844809,
"grad_norm": 0.5054384831570833,
"learning_rate": 3.939208363146041e-05,
"loss": 0.4262,
"step": 208
},
{
"epoch": 0.8591983556012333,
"grad_norm": 0.5553954849786898,
"learning_rate": 3.937793801678851e-05,
"loss": 0.427,
"step": 209
},
{
"epoch": 0.8633093525179856,
"grad_norm": 0.5872415310529557,
"learning_rate": 3.936363231120231e-05,
"loss": 0.4413,
"step": 210
},
{
"epoch": 0.8674203494347379,
"grad_norm": 0.5889656491169154,
"learning_rate": 3.934916663288847e-05,
"loss": 0.4374,
"step": 211
},
{
"epoch": 0.8715313463514902,
"grad_norm": 0.5289928076892064,
"learning_rate": 3.9334541101355244e-05,
"loss": 0.4393,
"step": 212
},
{
"epoch": 0.8756423432682425,
"grad_norm": 0.5133117516646354,
"learning_rate": 3.931975583743152e-05,
"loss": 0.4207,
"step": 213
},
{
"epoch": 0.8797533401849948,
"grad_norm": 0.48037331045870174,
"learning_rate": 3.930481096326583e-05,
"loss": 0.4175,
"step": 214
},
{
"epoch": 0.8838643371017472,
"grad_norm": 0.5410217736097758,
"learning_rate": 3.92897066023253e-05,
"loss": 0.431,
"step": 215
},
{
"epoch": 0.8879753340184995,
"grad_norm": 0.41649001377169803,
"learning_rate": 3.927444287939467e-05,
"loss": 0.4484,
"step": 216
},
{
"epoch": 0.8920863309352518,
"grad_norm": 0.45628332224884727,
"learning_rate": 3.925901992057525e-05,
"loss": 0.4305,
"step": 217
},
{
"epoch": 0.8961973278520041,
"grad_norm": 0.5227314578776049,
"learning_rate": 3.924343785328388e-05,
"loss": 0.4393,
"step": 218
},
{
"epoch": 0.9003083247687564,
"grad_norm": 0.4530459458277021,
"learning_rate": 3.9227696806251875e-05,
"loss": 0.4382,
"step": 219
},
{
"epoch": 0.9044193216855088,
"grad_norm": 0.4488315318208515,
"learning_rate": 3.9211796909523953e-05,
"loss": 0.4209,
"step": 220
},
{
"epoch": 0.9085303186022611,
"grad_norm": 0.4369045769060924,
"learning_rate": 3.9195738294457186e-05,
"loss": 0.4357,
"step": 221
},
{
"epoch": 0.9126413155190134,
"grad_norm": 0.3980678441937295,
"learning_rate": 3.9179521093719876e-05,
"loss": 0.4142,
"step": 222
},
{
"epoch": 0.9167523124357657,
"grad_norm": 0.5003747978502763,
"learning_rate": 3.91631454412905e-05,
"loss": 0.4484,
"step": 223
},
{
"epoch": 0.920863309352518,
"grad_norm": 0.43942976248272747,
"learning_rate": 3.914661147245657e-05,
"loss": 0.434,
"step": 224
},
{
"epoch": 0.9249743062692704,
"grad_norm": 0.4174753367400882,
"learning_rate": 3.912991932381355e-05,
"loss": 0.4282,
"step": 225
},
{
"epoch": 0.9290853031860226,
"grad_norm": 0.36920457252907907,
"learning_rate": 3.91130691332637e-05,
"loss": 0.4347,
"step": 226
},
{
"epoch": 0.9331963001027749,
"grad_norm": 0.45392411540078437,
"learning_rate": 3.9096061040014914e-05,
"loss": 0.4135,
"step": 227
},
{
"epoch": 0.9373072970195272,
"grad_norm": 0.4203872157822759,
"learning_rate": 3.907889518457964e-05,
"loss": 0.4422,
"step": 228
},
{
"epoch": 0.9414182939362795,
"grad_norm": 0.391547280290097,
"learning_rate": 3.9061571708773656e-05,
"loss": 0.428,
"step": 229
},
{
"epoch": 0.9455292908530318,
"grad_norm": 0.5746907556468481,
"learning_rate": 3.9044090755714935e-05,
"loss": 0.4273,
"step": 230
},
{
"epoch": 0.9496402877697842,
"grad_norm": 0.5021218433821051,
"learning_rate": 3.9026452469822435e-05,
"loss": 0.4318,
"step": 231
},
{
"epoch": 0.9537512846865365,
"grad_norm": 0.5118619524543895,
"learning_rate": 3.900865699681494e-05,
"loss": 0.4565,
"step": 232
},
{
"epoch": 0.9578622816032888,
"grad_norm": 0.4269764449835691,
"learning_rate": 3.899070448370981e-05,
"loss": 0.4242,
"step": 233
},
{
"epoch": 0.9619732785200411,
"grad_norm": 0.544830400097823,
"learning_rate": 3.897259507882181e-05,
"loss": 0.4308,
"step": 234
},
{
"epoch": 0.9660842754367934,
"grad_norm": 0.5029148596149111,
"learning_rate": 3.895432893176186e-05,
"loss": 0.4283,
"step": 235
},
{
"epoch": 0.9701952723535457,
"grad_norm": 0.6013585416586662,
"learning_rate": 3.8935906193435814e-05,
"loss": 0.4231,
"step": 236
},
{
"epoch": 0.9743062692702981,
"grad_norm": 0.501625593569375,
"learning_rate": 3.89173270160432e-05,
"loss": 0.4335,
"step": 237
},
{
"epoch": 0.9784172661870504,
"grad_norm": 0.6586654174152249,
"learning_rate": 3.889859155307596e-05,
"loss": 0.4365,
"step": 238
},
{
"epoch": 0.9825282631038027,
"grad_norm": 0.6491985191825143,
"learning_rate": 3.8879699959317204e-05,
"loss": 0.428,
"step": 239
},
{
"epoch": 0.986639260020555,
"grad_norm": 0.36412537479982626,
"learning_rate": 3.8860652390839915e-05,
"loss": 0.4258,
"step": 240
},
{
"epoch": 0.9907502569373073,
"grad_norm": 0.562496507066076,
"learning_rate": 3.884144900500565e-05,
"loss": 0.4352,
"step": 241
},
{
"epoch": 0.9948612538540597,
"grad_norm": 0.4838097185277804,
"learning_rate": 3.882208996046327e-05,
"loss": 0.4422,
"step": 242
},
{
"epoch": 0.998972250770812,
"grad_norm": 0.4422917967441169,
"learning_rate": 3.880257541714759e-05,
"loss": 0.4273,
"step": 243
},
{
"epoch": 1.0030832476875642,
"grad_norm": 0.5334045773924255,
"learning_rate": 3.878290553627809e-05,
"loss": 0.3969,
"step": 244
},
{
"epoch": 1.0071942446043165,
"grad_norm": 0.5378794632121926,
"learning_rate": 3.876308048035758e-05,
"loss": 0.3903,
"step": 245
},
{
"epoch": 1.0113052415210688,
"grad_norm": 0.5944247982125659,
"learning_rate": 3.874310041317084e-05,
"loss": 0.3866,
"step": 246
},
{
"epoch": 1.0154162384378211,
"grad_norm": 0.5659631885785738,
"learning_rate": 3.8722965499783265e-05,
"loss": 0.3859,
"step": 247
},
{
"epoch": 1.0195272353545735,
"grad_norm": 0.6678922530928978,
"learning_rate": 3.8702675906539536e-05,
"loss": 0.3975,
"step": 248
},
{
"epoch": 1.0236382322713258,
"grad_norm": 0.6092071387321932,
"learning_rate": 3.868223180106221e-05,
"loss": 0.3805,
"step": 249
},
{
"epoch": 1.027749229188078,
"grad_norm": 0.48801873476109786,
"learning_rate": 3.866163335225034e-05,
"loss": 0.3924,
"step": 250
},
{
"epoch": 1.0318602261048304,
"grad_norm": 0.5338205820825612,
"learning_rate": 3.8640880730278105e-05,
"loss": 0.4015,
"step": 251
},
{
"epoch": 1.0359712230215827,
"grad_norm": 0.47770709705325853,
"learning_rate": 3.8619974106593365e-05,
"loss": 0.3979,
"step": 252
},
{
"epoch": 1.040082219938335,
"grad_norm": 0.6103179105115757,
"learning_rate": 3.859891365391628e-05,
"loss": 0.388,
"step": 253
},
{
"epoch": 1.0441932168550874,
"grad_norm": 0.5427245439232725,
"learning_rate": 3.8577699546237886e-05,
"loss": 0.3811,
"step": 254
},
{
"epoch": 1.0483042137718397,
"grad_norm": 0.532814479999278,
"learning_rate": 3.8556331958818596e-05,
"loss": 0.3872,
"step": 255
},
{
"epoch": 1.052415210688592,
"grad_norm": 0.5339131788688589,
"learning_rate": 3.853481106818683e-05,
"loss": 0.3914,
"step": 256
},
{
"epoch": 1.0565262076053443,
"grad_norm": 0.5060398381577083,
"learning_rate": 3.851313705213751e-05,
"loss": 0.3876,
"step": 257
},
{
"epoch": 1.0606372045220966,
"grad_norm": 0.562896010283109,
"learning_rate": 3.8491310089730614e-05,
"loss": 0.3946,
"step": 258
},
{
"epoch": 1.064748201438849,
"grad_norm": 0.40638744368816154,
"learning_rate": 3.846933036128968e-05,
"loss": 0.3809,
"step": 259
},
{
"epoch": 1.0688591983556013,
"grad_norm": 0.5821981657729004,
"learning_rate": 3.8447198048400325e-05,
"loss": 0.4041,
"step": 260
},
{
"epoch": 1.0729701952723536,
"grad_norm": 0.5613586250111681,
"learning_rate": 3.8424913333908744e-05,
"loss": 0.3834,
"step": 261
},
{
"epoch": 1.077081192189106,
"grad_norm": 0.5210705488884988,
"learning_rate": 3.840247640192019e-05,
"loss": 0.4053,
"step": 262
},
{
"epoch": 1.0811921891058582,
"grad_norm": 0.5223026076271566,
"learning_rate": 3.837988743779747e-05,
"loss": 0.4057,
"step": 263
},
{
"epoch": 1.0853031860226106,
"grad_norm": 0.40771864088972815,
"learning_rate": 3.8357146628159415e-05,
"loss": 0.3759,
"step": 264
},
{
"epoch": 1.0894141829393629,
"grad_norm": 0.5106166010616134,
"learning_rate": 3.8334254160879296e-05,
"loss": 0.3927,
"step": 265
},
{
"epoch": 1.0935251798561152,
"grad_norm": 0.39265167645057447,
"learning_rate": 3.8311210225083347e-05,
"loss": 0.3772,
"step": 266
},
{
"epoch": 1.0976361767728675,
"grad_norm": 0.5406659317819649,
"learning_rate": 3.8288015011149126e-05,
"loss": 0.3877,
"step": 267
},
{
"epoch": 1.1017471736896198,
"grad_norm": 0.4396566495996877,
"learning_rate": 3.826466871070399e-05,
"loss": 0.3919,
"step": 268
},
{
"epoch": 1.1058581706063721,
"grad_norm": 0.4071828889239751,
"learning_rate": 3.82411715166235e-05,
"loss": 0.3929,
"step": 269
},
{
"epoch": 1.1099691675231242,
"grad_norm": 0.4408302571199858,
"learning_rate": 3.821752362302982e-05,
"loss": 0.3984,
"step": 270
},
{
"epoch": 1.1140801644398768,
"grad_norm": 0.3944864874139757,
"learning_rate": 3.8193725225290105e-05,
"loss": 0.3791,
"step": 271
},
{
"epoch": 1.1181911613566289,
"grad_norm": 0.5086637552588018,
"learning_rate": 3.8169776520014935e-05,
"loss": 0.3981,
"step": 272
},
{
"epoch": 1.1223021582733812,
"grad_norm": 0.37495702811326503,
"learning_rate": 3.814567770505663e-05,
"loss": 0.399,
"step": 273
},
{
"epoch": 1.1264131551901335,
"grad_norm": 0.5606532197558952,
"learning_rate": 3.812142897950765e-05,
"loss": 0.3919,
"step": 274
},
{
"epoch": 1.1305241521068858,
"grad_norm": 0.5021460420776965,
"learning_rate": 3.809703054369893e-05,
"loss": 0.3884,
"step": 275
},
{
"epoch": 1.1346351490236382,
"grad_norm": 0.42349030253760284,
"learning_rate": 3.807248259919826e-05,
"loss": 0.3834,
"step": 276
},
{
"epoch": 1.1387461459403905,
"grad_norm": 0.4379650832741319,
"learning_rate": 3.804778534880858e-05,
"loss": 0.3907,
"step": 277
},
{
"epoch": 1.1428571428571428,
"grad_norm": 0.49289600771412606,
"learning_rate": 3.802293899656632e-05,
"loss": 0.3813,
"step": 278
},
{
"epoch": 1.1469681397738951,
"grad_norm": 0.42490353928620167,
"learning_rate": 3.7997943747739735e-05,
"loss": 0.3853,
"step": 279
},
{
"epoch": 1.1510791366906474,
"grad_norm": 0.518693269655709,
"learning_rate": 3.797279980882716e-05,
"loss": 0.3982,
"step": 280
},
{
"epoch": 1.1551901336073997,
"grad_norm": 0.43881434866880253,
"learning_rate": 3.794750738755536e-05,
"loss": 0.3926,
"step": 281
},
{
"epoch": 1.159301130524152,
"grad_norm": 0.43485781066948115,
"learning_rate": 3.792206669287776e-05,
"loss": 0.3922,
"step": 282
},
{
"epoch": 1.1634121274409044,
"grad_norm": 0.45793373109478,
"learning_rate": 3.789647793497279e-05,
"loss": 0.3949,
"step": 283
},
{
"epoch": 1.1675231243576567,
"grad_norm": 0.4344299254818994,
"learning_rate": 3.787074132524206e-05,
"loss": 0.3895,
"step": 284
},
{
"epoch": 1.171634121274409,
"grad_norm": 0.44080712302633035,
"learning_rate": 3.784485707630868e-05,
"loss": 0.3914,
"step": 285
},
{
"epoch": 1.1757451181911613,
"grad_norm": 0.37695184690701744,
"learning_rate": 3.781882540201547e-05,
"loss": 0.3875,
"step": 286
},
{
"epoch": 1.1798561151079137,
"grad_norm": 0.45884161347743313,
"learning_rate": 3.7792646517423236e-05,
"loss": 0.3744,
"step": 287
},
{
"epoch": 1.183967112024666,
"grad_norm": 0.4017320092584037,
"learning_rate": 3.7766320638808924e-05,
"loss": 0.3922,
"step": 288
},
{
"epoch": 1.1880781089414183,
"grad_norm": 0.4280615463958759,
"learning_rate": 3.773984798366389e-05,
"loss": 0.4006,
"step": 289
},
{
"epoch": 1.1921891058581706,
"grad_norm": 0.3513007234774324,
"learning_rate": 3.7713228770692084e-05,
"loss": 0.3819,
"step": 290
},
{
"epoch": 1.196300102774923,
"grad_norm": 0.4230975793009198,
"learning_rate": 3.768646321980824e-05,
"loss": 0.3819,
"step": 291
},
{
"epoch": 1.2004110996916753,
"grad_norm": 0.38816726480644864,
"learning_rate": 3.765955155213607e-05,
"loss": 0.391,
"step": 292
},
{
"epoch": 1.2045220966084276,
"grad_norm": 0.41570862488108373,
"learning_rate": 3.763249399000643e-05,
"loss": 0.3879,
"step": 293
},
{
"epoch": 1.20863309352518,
"grad_norm": 0.5386712701377521,
"learning_rate": 3.7605290756955476e-05,
"loss": 0.404,
"step": 294
},
{
"epoch": 1.2127440904419322,
"grad_norm": 0.4142838391786987,
"learning_rate": 3.757794207772283e-05,
"loss": 0.394,
"step": 295
},
{
"epoch": 1.2168550873586845,
"grad_norm": 0.3918702689073396,
"learning_rate": 3.755044817824971e-05,
"loss": 0.3833,
"step": 296
},
{
"epoch": 1.2209660842754368,
"grad_norm": 0.4790045020269064,
"learning_rate": 3.752280928567709e-05,
"loss": 0.3827,
"step": 297
},
{
"epoch": 1.2250770811921892,
"grad_norm": 0.4071852105252518,
"learning_rate": 3.749502562834379e-05,
"loss": 0.3972,
"step": 298
},
{
"epoch": 1.2291880781089415,
"grad_norm": 0.433522303940447,
"learning_rate": 3.746709743578462e-05,
"loss": 0.3985,
"step": 299
},
{
"epoch": 1.2332990750256938,
"grad_norm": 0.4222484903892338,
"learning_rate": 3.7439024938728435e-05,
"loss": 0.384,
"step": 300
},
{
"epoch": 1.2374100719424461,
"grad_norm": 0.42821966368019687,
"learning_rate": 3.74108083690963e-05,
"loss": 0.3908,
"step": 301
},
{
"epoch": 1.2415210688591984,
"grad_norm": 0.5269787553817297,
"learning_rate": 3.7382447959999514e-05,
"loss": 0.3869,
"step": 302
},
{
"epoch": 1.2456320657759508,
"grad_norm": 0.4206960432187445,
"learning_rate": 3.7353943945737716e-05,
"loss": 0.3984,
"step": 303
},
{
"epoch": 1.249743062692703,
"grad_norm": 0.3963715348953228,
"learning_rate": 3.7325296561796936e-05,
"loss": 0.3908,
"step": 304
},
{
"epoch": 1.2538540596094552,
"grad_norm": 0.5197873707406762,
"learning_rate": 3.729650604484766e-05,
"loss": 0.3789,
"step": 305
},
{
"epoch": 1.2579650565262077,
"grad_norm": 0.391975059464178,
"learning_rate": 3.7267572632742846e-05,
"loss": 0.39,
"step": 306
},
{
"epoch": 1.2620760534429598,
"grad_norm": 0.4297553917483092,
"learning_rate": 3.7238496564516006e-05,
"loss": 0.398,
"step": 307
},
{
"epoch": 1.2661870503597124,
"grad_norm": 0.32415884735671224,
"learning_rate": 3.720927808037921e-05,
"loss": 0.385,
"step": 308
},
{
"epoch": 1.2702980472764644,
"grad_norm": 0.4050462187721075,
"learning_rate": 3.717991742172106e-05,
"loss": 0.3801,
"step": 309
},
{
"epoch": 1.274409044193217,
"grad_norm": 0.44040991415716113,
"learning_rate": 3.7150414831104765e-05,
"loss": 0.3936,
"step": 310
},
{
"epoch": 1.278520041109969,
"grad_norm": 0.4117947843277416,
"learning_rate": 3.712077055226611e-05,
"loss": 0.3966,
"step": 311
},
{
"epoch": 1.2826310380267214,
"grad_norm": 0.4039039643321521,
"learning_rate": 3.7090984830111415e-05,
"loss": 0.3863,
"step": 312
},
{
"epoch": 1.2867420349434737,
"grad_norm": 0.39088426091872597,
"learning_rate": 3.7061057910715546e-05,
"loss": 0.4019,
"step": 313
},
{
"epoch": 1.290853031860226,
"grad_norm": 0.3364663722128402,
"learning_rate": 3.703099004131988e-05,
"loss": 0.389,
"step": 314
},
{
"epoch": 1.2949640287769784,
"grad_norm": 0.396387529395801,
"learning_rate": 3.700078147033023e-05,
"loss": 0.3826,
"step": 315
},
{
"epoch": 1.2990750256937307,
"grad_norm": 0.4034497950317108,
"learning_rate": 3.697043244731484e-05,
"loss": 0.387,
"step": 316
},
{
"epoch": 1.303186022610483,
"grad_norm": 0.45567545271356036,
"learning_rate": 3.693994322300228e-05,
"loss": 0.3903,
"step": 317
},
{
"epoch": 1.3072970195272353,
"grad_norm": 0.36949826512347733,
"learning_rate": 3.69093140492794e-05,
"loss": 0.3907,
"step": 318
},
{
"epoch": 1.3114080164439876,
"grad_norm": 0.3907383409192243,
"learning_rate": 3.687854517918926e-05,
"loss": 0.3884,
"step": 319
},
{
"epoch": 1.31551901336074,
"grad_norm": 0.400771927655429,
"learning_rate": 3.684763686692898e-05,
"loss": 0.3897,
"step": 320
},
{
"epoch": 1.3196300102774923,
"grad_norm": 0.28349821982969425,
"learning_rate": 3.681658936784773e-05,
"loss": 0.3819,
"step": 321
},
{
"epoch": 1.3237410071942446,
"grad_norm": 0.3707057575475429,
"learning_rate": 3.678540293844455e-05,
"loss": 0.4029,
"step": 322
},
{
"epoch": 1.327852004110997,
"grad_norm": 0.3571877683162145,
"learning_rate": 3.675407783636624e-05,
"loss": 0.3888,
"step": 323
},
{
"epoch": 1.3319630010277492,
"grad_norm": 0.35550987334717343,
"learning_rate": 3.672261432040527e-05,
"loss": 0.388,
"step": 324
},
{
"epoch": 1.3360739979445015,
"grad_norm": 0.33342159219384704,
"learning_rate": 3.6691012650497605e-05,
"loss": 0.3949,
"step": 325
},
{
"epoch": 1.3401849948612539,
"grad_norm": 0.33685885775370095,
"learning_rate": 3.665927308772057e-05,
"loss": 0.3801,
"step": 326
},
{
"epoch": 1.3442959917780062,
"grad_norm": 0.3507256755448898,
"learning_rate": 3.6627395894290685e-05,
"loss": 0.4011,
"step": 327
},
{
"epoch": 1.3484069886947585,
"grad_norm": 0.3729058607264493,
"learning_rate": 3.659538133356153e-05,
"loss": 0.3841,
"step": 328
},
{
"epoch": 1.3525179856115108,
"grad_norm": 0.35654106083716275,
"learning_rate": 3.656322967002151e-05,
"loss": 0.3798,
"step": 329
},
{
"epoch": 1.3566289825282631,
"grad_norm": 0.34899761497768883,
"learning_rate": 3.6530941169291744e-05,
"loss": 0.3769,
"step": 330
},
{
"epoch": 1.3607399794450155,
"grad_norm": 0.3500913238620904,
"learning_rate": 3.649851609812379e-05,
"loss": 0.4005,
"step": 331
},
{
"epoch": 1.3648509763617678,
"grad_norm": 0.4370742910901644,
"learning_rate": 3.646595472439753e-05,
"loss": 0.3812,
"step": 332
},
{
"epoch": 1.36896197327852,
"grad_norm": 0.4531455394409143,
"learning_rate": 3.643325731711888e-05,
"loss": 0.3949,
"step": 333
},
{
"epoch": 1.3730729701952724,
"grad_norm": 0.3382908051688983,
"learning_rate": 3.6400424146417604e-05,
"loss": 0.3951,
"step": 334
},
{
"epoch": 1.3771839671120247,
"grad_norm": 0.48124284386355537,
"learning_rate": 3.6367455483545066e-05,
"loss": 0.3886,
"step": 335
},
{
"epoch": 1.381294964028777,
"grad_norm": 0.4073484304811201,
"learning_rate": 3.633435160087202e-05,
"loss": 0.3833,
"step": 336
},
{
"epoch": 1.3854059609455294,
"grad_norm": 0.37602122616857575,
"learning_rate": 3.6301112771886315e-05,
"loss": 0.3947,
"step": 337
},
{
"epoch": 1.3895169578622815,
"grad_norm": 0.35827821253734476,
"learning_rate": 3.62677392711907e-05,
"loss": 0.39,
"step": 338
},
{
"epoch": 1.393627954779034,
"grad_norm": 0.40220244427058716,
"learning_rate": 3.623423137450046e-05,
"loss": 0.3912,
"step": 339
},
{
"epoch": 1.397738951695786,
"grad_norm": 0.3520064901329717,
"learning_rate": 3.620058935864123e-05,
"loss": 0.3902,
"step": 340
},
{
"epoch": 1.4018499486125386,
"grad_norm": 0.3470206706484027,
"learning_rate": 3.616681350154666e-05,
"loss": 0.3817,
"step": 341
},
{
"epoch": 1.4059609455292907,
"grad_norm": 0.3684394787845421,
"learning_rate": 3.613290408225615e-05,
"loss": 0.3827,
"step": 342
},
{
"epoch": 1.4100719424460433,
"grad_norm": 0.34749273169540446,
"learning_rate": 3.609886138091247e-05,
"loss": 0.3874,
"step": 343
},
{
"epoch": 1.4141829393627954,
"grad_norm": 0.36675012503855753,
"learning_rate": 3.606468567875957e-05,
"loss": 0.3863,
"step": 344
},
{
"epoch": 1.418293936279548,
"grad_norm": 0.3087501681001265,
"learning_rate": 3.603037725814014e-05,
"loss": 0.3878,
"step": 345
},
{
"epoch": 1.4224049331963,
"grad_norm": 0.44389782737034467,
"learning_rate": 3.599593640249334e-05,
"loss": 0.3775,
"step": 346
},
{
"epoch": 1.4265159301130523,
"grad_norm": 0.40184525645257135,
"learning_rate": 3.5961363396352435e-05,
"loss": 0.3878,
"step": 347
},
{
"epoch": 1.4306269270298047,
"grad_norm": 0.3739195862038827,
"learning_rate": 3.592665852534246e-05,
"loss": 0.3882,
"step": 348
},
{
"epoch": 1.434737923946557,
"grad_norm": 0.4667442608373335,
"learning_rate": 3.589182207617785e-05,
"loss": 0.3848,
"step": 349
},
{
"epoch": 1.4388489208633093,
"grad_norm": 0.2954516082957151,
"learning_rate": 3.5856854336660075e-05,
"loss": 0.3737,
"step": 350
},
{
"epoch": 1.4429599177800616,
"grad_norm": 0.3807234664590424,
"learning_rate": 3.582175559567524e-05,
"loss": 0.394,
"step": 351
},
{
"epoch": 1.447070914696814,
"grad_norm": 0.37991638726226773,
"learning_rate": 3.578652614319177e-05,
"loss": 0.3924,
"step": 352
},
{
"epoch": 1.4511819116135662,
"grad_norm": 0.4605647523732803,
"learning_rate": 3.575116627025791e-05,
"loss": 0.3895,
"step": 353
},
{
"epoch": 1.4552929085303186,
"grad_norm": 0.34460237531655397,
"learning_rate": 3.571567626899939e-05,
"loss": 0.3979,
"step": 354
},
{
"epoch": 1.4594039054470709,
"grad_norm": 0.37502366115502783,
"learning_rate": 3.568005643261701e-05,
"loss": 0.3865,
"step": 355
},
{
"epoch": 1.4635149023638232,
"grad_norm": 0.361240868061172,
"learning_rate": 3.5644307055384204e-05,
"loss": 0.3927,
"step": 356
},
{
"epoch": 1.4676258992805755,
"grad_norm": 0.36549527451613106,
"learning_rate": 3.5608428432644574e-05,
"loss": 0.3906,
"step": 357
},
{
"epoch": 1.4717368961973278,
"grad_norm": 0.3841131971215696,
"learning_rate": 3.557242086080953e-05,
"loss": 0.3882,
"step": 358
},
{
"epoch": 1.4758478931140802,
"grad_norm": 0.3053174499069298,
"learning_rate": 3.5536284637355766e-05,
"loss": 0.3882,
"step": 359
},
{
"epoch": 1.4799588900308325,
"grad_norm": 0.42930198135043723,
"learning_rate": 3.5500020060822844e-05,
"loss": 0.39,
"step": 360
},
{
"epoch": 1.4840698869475848,
"grad_norm": 0.3646026910744666,
"learning_rate": 3.54636274308107e-05,
"loss": 0.3919,
"step": 361
},
{
"epoch": 1.4881808838643371,
"grad_norm": 0.4584181730800767,
"learning_rate": 3.542710704797721e-05,
"loss": 0.3841,
"step": 362
},
{
"epoch": 1.4922918807810894,
"grad_norm": 0.3912766247821292,
"learning_rate": 3.539045921403566e-05,
"loss": 0.375,
"step": 363
},
{
"epoch": 1.4964028776978417,
"grad_norm": 0.39561662555483357,
"learning_rate": 3.5353684231752276e-05,
"loss": 0.3884,
"step": 364
},
{
"epoch": 1.500513874614594,
"grad_norm": 0.33669597693884484,
"learning_rate": 3.531678240494373e-05,
"loss": 0.3953,
"step": 365
},
{
"epoch": 1.5046248715313464,
"grad_norm": 0.4156836645972758,
"learning_rate": 3.5279754038474616e-05,
"loss": 0.3864,
"step": 366
},
{
"epoch": 1.5087358684480987,
"grad_norm": 0.3888603103920021,
"learning_rate": 3.524259943825493e-05,
"loss": 0.3864,
"step": 367
},
{
"epoch": 1.512846865364851,
"grad_norm": 0.34153109888601435,
"learning_rate": 3.5205318911237566e-05,
"loss": 0.3829,
"step": 368
},
{
"epoch": 1.5169578622816033,
"grad_norm": 0.4203599723923179,
"learning_rate": 3.516791276541574e-05,
"loss": 0.391,
"step": 369
},
{
"epoch": 1.5210688591983557,
"grad_norm": 0.39707036421576897,
"learning_rate": 3.5130381309820474e-05,
"loss": 0.3852,
"step": 370
},
{
"epoch": 1.5251798561151078,
"grad_norm": 0.35484540902249145,
"learning_rate": 3.509272485451806e-05,
"loss": 0.3813,
"step": 371
},
{
"epoch": 1.5292908530318603,
"grad_norm": 0.35726960151965814,
"learning_rate": 3.5054943710607435e-05,
"loss": 0.3943,
"step": 372
},
{
"epoch": 1.5334018499486124,
"grad_norm": 0.34918237917940137,
"learning_rate": 3.50170381902177e-05,
"loss": 0.3813,
"step": 373
},
{
"epoch": 1.537512846865365,
"grad_norm": 0.3225637816337971,
"learning_rate": 3.497900860650545e-05,
"loss": 0.3818,
"step": 374
},
{
"epoch": 1.541623843782117,
"grad_norm": 0.3243987867777615,
"learning_rate": 3.494085527365224e-05,
"loss": 0.3759,
"step": 375
},
{
"epoch": 1.5457348406988696,
"grad_norm": 0.3158935559652955,
"learning_rate": 3.4902578506861995e-05,
"loss": 0.3893,
"step": 376
},
{
"epoch": 1.5498458376156217,
"grad_norm": 0.3815644429337655,
"learning_rate": 3.486417862235839e-05,
"loss": 0.3905,
"step": 377
},
{
"epoch": 1.5539568345323742,
"grad_norm": 0.3118180182058997,
"learning_rate": 3.4825655937382216e-05,
"loss": 0.3865,
"step": 378
},
{
"epoch": 1.5580678314491263,
"grad_norm": 0.3841430312682266,
"learning_rate": 3.4787010770188795e-05,
"loss": 0.3932,
"step": 379
},
{
"epoch": 1.5621788283658788,
"grad_norm": 0.39242889501386036,
"learning_rate": 3.474824344004534e-05,
"loss": 0.3906,
"step": 380
},
{
"epoch": 1.566289825282631,
"grad_norm": 0.3632635332732287,
"learning_rate": 3.4709354267228294e-05,
"loss": 0.3783,
"step": 381
},
{
"epoch": 1.5704008221993835,
"grad_norm": 0.4216314417617418,
"learning_rate": 3.467034357302073e-05,
"loss": 0.3816,
"step": 382
},
{
"epoch": 1.5745118191161356,
"grad_norm": 0.410057885099804,
"learning_rate": 3.463121167970966e-05,
"loss": 0.3843,
"step": 383
},
{
"epoch": 1.5786228160328881,
"grad_norm": 0.30821430114214227,
"learning_rate": 3.4591958910583365e-05,
"loss": 0.3871,
"step": 384
},
{
"epoch": 1.5827338129496402,
"grad_norm": 0.3935547600639123,
"learning_rate": 3.455258558992877e-05,
"loss": 0.379,
"step": 385
},
{
"epoch": 1.5868448098663928,
"grad_norm": 0.3250409563547552,
"learning_rate": 3.451309204302873e-05,
"loss": 0.3801,
"step": 386
},
{
"epoch": 1.5909558067831449,
"grad_norm": 0.356548846946166,
"learning_rate": 3.447347859615933e-05,
"loss": 0.379,
"step": 387
},
{
"epoch": 1.5950668036998972,
"grad_norm": 0.361163416612143,
"learning_rate": 3.443374557658723e-05,
"loss": 0.3745,
"step": 388
},
{
"epoch": 1.5991778006166495,
"grad_norm": 0.3446242948127641,
"learning_rate": 3.439389331256694e-05,
"loss": 0.3807,
"step": 389
},
{
"epoch": 1.6032887975334018,
"grad_norm": 0.33333480141647187,
"learning_rate": 3.435392213333809e-05,
"loss": 0.3832,
"step": 390
},
{
"epoch": 1.6073997944501541,
"grad_norm": 0.38066181381339836,
"learning_rate": 3.431383236912275e-05,
"loss": 0.3692,
"step": 391
},
{
"epoch": 1.6115107913669064,
"grad_norm": 0.3255044440323713,
"learning_rate": 3.427362435112268e-05,
"loss": 0.3728,
"step": 392
},
{
"epoch": 1.6156217882836588,
"grad_norm": 0.41417790723734144,
"learning_rate": 3.423329841151656e-05,
"loss": 0.3868,
"step": 393
},
{
"epoch": 1.619732785200411,
"grad_norm": 0.32170041659499554,
"learning_rate": 3.4192854883457326e-05,
"loss": 0.3724,
"step": 394
},
{
"epoch": 1.6238437821171634,
"grad_norm": 0.3522389078445349,
"learning_rate": 3.4152294101069345e-05,
"loss": 0.3755,
"step": 395
},
{
"epoch": 1.6279547790339157,
"grad_norm": 0.3154196575435205,
"learning_rate": 3.411161639944568e-05,
"loss": 0.3866,
"step": 396
},
{
"epoch": 1.632065775950668,
"grad_norm": 0.3883625817054837,
"learning_rate": 3.407082211464534e-05,
"loss": 0.3842,
"step": 397
},
{
"epoch": 1.6361767728674204,
"grad_norm": 0.32478029230772587,
"learning_rate": 3.402991158369047e-05,
"loss": 0.3856,
"step": 398
},
{
"epoch": 1.6402877697841727,
"grad_norm": 0.33777536538509645,
"learning_rate": 3.39888851445636e-05,
"loss": 0.3738,
"step": 399
},
{
"epoch": 1.644398766700925,
"grad_norm": 0.3645535574440166,
"learning_rate": 3.394774313620481e-05,
"loss": 0.3768,
"step": 400
},
{
"epoch": 1.6485097636176773,
"grad_norm": 0.33553965225554366,
"learning_rate": 3.390648589850897e-05,
"loss": 0.3854,
"step": 401
},
{
"epoch": 1.6526207605344296,
"grad_norm": 0.35131544263569836,
"learning_rate": 3.386511377232293e-05,
"loss": 0.383,
"step": 402
},
{
"epoch": 1.656731757451182,
"grad_norm": 0.3083698384899604,
"learning_rate": 3.382362709944268e-05,
"loss": 0.3913,
"step": 403
},
{
"epoch": 1.6608427543679343,
"grad_norm": 0.3444920510980315,
"learning_rate": 3.3782026222610525e-05,
"loss": 0.3912,
"step": 404
},
{
"epoch": 1.6649537512846866,
"grad_norm": 0.3109066824781155,
"learning_rate": 3.374031148551229e-05,
"loss": 0.3785,
"step": 405
},
{
"epoch": 1.6690647482014387,
"grad_norm": 0.393332877111885,
"learning_rate": 3.3698483232774435e-05,
"loss": 0.3811,
"step": 406
},
{
"epoch": 1.6731757451181912,
"grad_norm": 0.35010985881480106,
"learning_rate": 3.365654180996126e-05,
"loss": 0.3765,
"step": 407
},
{
"epoch": 1.6772867420349433,
"grad_norm": 0.3994860261819717,
"learning_rate": 3.361448756357199e-05,
"loss": 0.3855,
"step": 408
},
{
"epoch": 1.6813977389516959,
"grad_norm": 0.4026873313554007,
"learning_rate": 3.3572320841037945e-05,
"loss": 0.3776,
"step": 409
},
{
"epoch": 1.685508735868448,
"grad_norm": 0.3888166731552757,
"learning_rate": 3.353004199071969e-05,
"loss": 0.389,
"step": 410
},
{
"epoch": 1.6896197327852005,
"grad_norm": 0.4229642214250034,
"learning_rate": 3.348765136190412e-05,
"loss": 0.3844,
"step": 411
},
{
"epoch": 1.6937307297019526,
"grad_norm": 0.3719493753316055,
"learning_rate": 3.344514930480158e-05,
"loss": 0.3718,
"step": 412
},
{
"epoch": 1.6978417266187051,
"grad_norm": 0.3750792470447336,
"learning_rate": 3.3402536170542985e-05,
"loss": 0.4017,
"step": 413
},
{
"epoch": 1.7019527235354572,
"grad_norm": 0.37953623181883855,
"learning_rate": 3.335981231117694e-05,
"loss": 0.3786,
"step": 414
},
{
"epoch": 1.7060637204522098,
"grad_norm": 0.42228613250314784,
"learning_rate": 3.331697807966676e-05,
"loss": 0.3902,
"step": 415
},
{
"epoch": 1.7101747173689619,
"grad_norm": 0.33605301616513616,
"learning_rate": 3.327403382988764e-05,
"loss": 0.382,
"step": 416
},
{
"epoch": 1.7142857142857144,
"grad_norm": 0.915407319860973,
"learning_rate": 3.3230979916623667e-05,
"loss": 0.3868,
"step": 417
},
{
"epoch": 1.7183967112024665,
"grad_norm": 0.4415883382317921,
"learning_rate": 3.318781669556493e-05,
"loss": 0.4025,
"step": 418
},
{
"epoch": 1.722507708119219,
"grad_norm": 0.29764556226533273,
"learning_rate": 3.3144544523304545e-05,
"loss": 0.3868,
"step": 419
},
{
"epoch": 1.7266187050359711,
"grad_norm": 0.3864981881512229,
"learning_rate": 3.310116375733575e-05,
"loss": 0.3848,
"step": 420
},
{
"epoch": 1.7307297019527237,
"grad_norm": 0.4237534589835872,
"learning_rate": 3.3057674756048906e-05,
"loss": 0.3884,
"step": 421
},
{
"epoch": 1.7348406988694758,
"grad_norm": 0.30622506767945284,
"learning_rate": 3.30140778787286e-05,
"loss": 0.3962,
"step": 422
},
{
"epoch": 1.7389516957862283,
"grad_norm": 0.3715870543554042,
"learning_rate": 3.297037348555059e-05,
"loss": 0.3804,
"step": 423
},
{
"epoch": 1.7430626927029804,
"grad_norm": 0.3158873451974222,
"learning_rate": 3.292656193757891e-05,
"loss": 0.3808,
"step": 424
},
{
"epoch": 1.7471736896197327,
"grad_norm": 3.5993500626700534,
"learning_rate": 3.2882643596762847e-05,
"loss": 0.3766,
"step": 425
},
{
"epoch": 1.751284686536485,
"grad_norm": 0.41799220024756045,
"learning_rate": 3.283861882593394e-05,
"loss": 0.3629,
"step": 426
},
{
"epoch": 1.7553956834532374,
"grad_norm": 0.4028165918419239,
"learning_rate": 3.2794487988803024e-05,
"loss": 0.3946,
"step": 427
},
{
"epoch": 1.7595066803699897,
"grad_norm": 0.45312099756724705,
"learning_rate": 3.275025144995719e-05,
"loss": 0.3826,
"step": 428
},
{
"epoch": 1.763617677286742,
"grad_norm": 0.3682320829470106,
"learning_rate": 3.270590957485678e-05,
"loss": 0.3822,
"step": 429
},
{
"epoch": 1.7677286742034943,
"grad_norm": 0.432471521500914,
"learning_rate": 3.266146272983238e-05,
"loss": 0.379,
"step": 430
},
{
"epoch": 1.7718396711202467,
"grad_norm": 0.49258814180632715,
"learning_rate": 3.261691128208178e-05,
"loss": 0.3781,
"step": 431
},
{
"epoch": 1.775950668036999,
"grad_norm": 0.42039354140050533,
"learning_rate": 3.2572255599666946e-05,
"loss": 0.3858,
"step": 432
},
{
"epoch": 1.7800616649537513,
"grad_norm": 0.4266657139962505,
"learning_rate": 3.252749605151099e-05,
"loss": 0.3889,
"step": 433
},
{
"epoch": 1.7841726618705036,
"grad_norm": 0.4111540760053901,
"learning_rate": 3.24826330073951e-05,
"loss": 0.3828,
"step": 434
},
{
"epoch": 1.788283658787256,
"grad_norm": 0.40054562650751135,
"learning_rate": 3.2437666837955495e-05,
"loss": 0.3821,
"step": 435
},
{
"epoch": 1.7923946557040082,
"grad_norm": 0.4049883565747011,
"learning_rate": 3.239259791468037e-05,
"loss": 0.3782,
"step": 436
},
{
"epoch": 1.7965056526207606,
"grad_norm": 0.3211989179680821,
"learning_rate": 3.234742660990681e-05,
"loss": 0.3886,
"step": 437
},
{
"epoch": 1.8006166495375129,
"grad_norm": 0.3415159428416263,
"learning_rate": 3.230215329681775e-05,
"loss": 0.3865,
"step": 438
},
{
"epoch": 1.8047276464542652,
"grad_norm": 0.30795596054473745,
"learning_rate": 3.225677834943884e-05,
"loss": 0.3798,
"step": 439
},
{
"epoch": 1.8088386433710175,
"grad_norm": 0.3527630027822489,
"learning_rate": 3.22113021426354e-05,
"loss": 0.371,
"step": 440
},
{
"epoch": 1.8129496402877698,
"grad_norm": 0.38597884530280835,
"learning_rate": 3.216572505210929e-05,
"loss": 0.386,
"step": 441
},
{
"epoch": 1.8170606372045222,
"grad_norm": 0.35477892953521534,
"learning_rate": 3.2120047454395845e-05,
"loss": 0.3837,
"step": 442
},
{
"epoch": 1.8211716341212743,
"grad_norm": 0.34702546052353167,
"learning_rate": 3.207426972686071e-05,
"loss": 0.3892,
"step": 443
},
{
"epoch": 1.8252826310380268,
"grad_norm": 0.30619045437996395,
"learning_rate": 3.202839224769678e-05,
"loss": 0.3911,
"step": 444
},
{
"epoch": 1.829393627954779,
"grad_norm": 0.308117763052393,
"learning_rate": 3.198241539592103e-05,
"loss": 0.388,
"step": 445
},
{
"epoch": 1.8335046248715314,
"grad_norm": 0.3813900684937835,
"learning_rate": 3.1936339551371416e-05,
"loss": 0.3733,
"step": 446
},
{
"epoch": 1.8376156217882835,
"grad_norm": 0.37451609765152405,
"learning_rate": 3.1890165094703704e-05,
"loss": 0.382,
"step": 447
},
{
"epoch": 1.841726618705036,
"grad_norm": 0.3343641229801653,
"learning_rate": 3.184389240738838e-05,
"loss": 0.3843,
"step": 448
},
{
"epoch": 1.8458376156217882,
"grad_norm": 0.3671589935937082,
"learning_rate": 3.179752187170741e-05,
"loss": 0.3914,
"step": 449
},
{
"epoch": 1.8499486125385407,
"grad_norm": 0.3997665963907156,
"learning_rate": 3.1751053870751184e-05,
"loss": 0.3843,
"step": 450
},
{
"epoch": 1.8540596094552928,
"grad_norm": 0.3253860699538578,
"learning_rate": 3.1704488788415274e-05,
"loss": 0.3855,
"step": 451
},
{
"epoch": 1.8581706063720453,
"grad_norm": 0.3876573196918091,
"learning_rate": 3.16578270093973e-05,
"loss": 0.386,
"step": 452
},
{
"epoch": 1.8622816032887974,
"grad_norm": 0.38773352168091224,
"learning_rate": 3.1611068919193756e-05,
"loss": 0.3783,
"step": 453
},
{
"epoch": 1.86639260020555,
"grad_norm": 0.31178745493034576,
"learning_rate": 3.1564214904096774e-05,
"loss": 0.385,
"step": 454
},
{
"epoch": 1.870503597122302,
"grad_norm": 0.5723565190756046,
"learning_rate": 3.1517265351191e-05,
"loss": 0.3841,
"step": 455
},
{
"epoch": 1.8746145940390546,
"grad_norm": 0.46777702023370726,
"learning_rate": 3.147022064835036e-05,
"loss": 0.385,
"step": 456
},
{
"epoch": 1.8787255909558067,
"grad_norm": 0.48665108942706403,
"learning_rate": 3.142308118423485e-05,
"loss": 0.3808,
"step": 457
},
{
"epoch": 1.8828365878725593,
"grad_norm": 0.5712910353884142,
"learning_rate": 3.1375847348287365e-05,
"loss": 0.3898,
"step": 458
},
{
"epoch": 1.8869475847893113,
"grad_norm": 0.4140740468743388,
"learning_rate": 3.132851953073041e-05,
"loss": 0.3823,
"step": 459
},
{
"epoch": 1.8910585817060637,
"grad_norm": 0.4981734090282241,
"learning_rate": 3.128109812256296e-05,
"loss": 0.379,
"step": 460
},
{
"epoch": 1.895169578622816,
"grad_norm": 0.3901725689405749,
"learning_rate": 3.1233583515557166e-05,
"loss": 0.3802,
"step": 461
},
{
"epoch": 1.8992805755395683,
"grad_norm": 0.39288710655716796,
"learning_rate": 3.118597610225514e-05,
"loss": 0.3648,
"step": 462
},
{
"epoch": 1.9033915724563206,
"grad_norm": 0.3943553998150945,
"learning_rate": 3.113827627596575e-05,
"loss": 0.3845,
"step": 463
},
{
"epoch": 1.907502569373073,
"grad_norm": 0.3815649604071033,
"learning_rate": 3.1090484430761275e-05,
"loss": 0.3968,
"step": 464
},
{
"epoch": 1.9116135662898253,
"grad_norm": 0.4341716993023021,
"learning_rate": 3.104260096147426e-05,
"loss": 0.3825,
"step": 465
},
{
"epoch": 1.9157245632065776,
"grad_norm": 0.2885815389134767,
"learning_rate": 3.099462626369418e-05,
"loss": 0.379,
"step": 466
},
{
"epoch": 1.91983556012333,
"grad_norm": 0.3706179172517124,
"learning_rate": 3.094656073376419e-05,
"loss": 0.3882,
"step": 467
},
{
"epoch": 1.9239465570400822,
"grad_norm": 0.3443004696246589,
"learning_rate": 3.0898404768777863e-05,
"loss": 0.3855,
"step": 468
},
{
"epoch": 1.9280575539568345,
"grad_norm": 0.29420490623628953,
"learning_rate": 3.0850158766575907e-05,
"loss": 0.3843,
"step": 469
},
{
"epoch": 1.9321685508735869,
"grad_norm": 0.3876924984247156,
"learning_rate": 3.080182312574286e-05,
"loss": 0.3746,
"step": 470
},
{
"epoch": 1.9362795477903392,
"grad_norm": 0.28747642038559285,
"learning_rate": 3.075339824560382e-05,
"loss": 0.3718,
"step": 471
},
{
"epoch": 1.9403905447070915,
"grad_norm": 0.32380146376848085,
"learning_rate": 3.070488452622113e-05,
"loss": 0.3934,
"step": 472
},
{
"epoch": 1.9445015416238438,
"grad_norm": 0.32465438985148803,
"learning_rate": 3.0656282368391086e-05,
"loss": 0.3729,
"step": 473
},
{
"epoch": 1.9486125385405961,
"grad_norm": 0.35563518327266175,
"learning_rate": 3.0607592173640615e-05,
"loss": 0.3795,
"step": 474
},
{
"epoch": 1.9527235354573484,
"grad_norm": 0.331866902145928,
"learning_rate": 3.055881434422395e-05,
"loss": 0.3981,
"step": 475
},
{
"epoch": 1.9568345323741008,
"grad_norm": 0.3342210067868538,
"learning_rate": 3.0509949283119348e-05,
"loss": 0.3717,
"step": 476
},
{
"epoch": 1.960945529290853,
"grad_norm": 0.33585492528175326,
"learning_rate": 3.0460997394025694e-05,
"loss": 0.3993,
"step": 477
},
{
"epoch": 1.9650565262076052,
"grad_norm": 0.31011270781830746,
"learning_rate": 3.0411959081359223e-05,
"loss": 0.3865,
"step": 478
},
{
"epoch": 1.9691675231243577,
"grad_norm": 0.35530598698818877,
"learning_rate": 3.036283475025016e-05,
"loss": 0.3784,
"step": 479
},
{
"epoch": 1.9732785200411098,
"grad_norm": 0.3734052740131826,
"learning_rate": 3.031362480653937e-05,
"loss": 0.3762,
"step": 480
},
{
"epoch": 1.9773895169578624,
"grad_norm": 0.3014940676108034,
"learning_rate": 3.0264329656775e-05,
"loss": 0.3757,
"step": 481
},
{
"epoch": 1.9815005138746145,
"grad_norm": 0.3512117145148321,
"learning_rate": 3.021494970820912e-05,
"loss": 0.3827,
"step": 482
},
{
"epoch": 1.985611510791367,
"grad_norm": 0.37355802443996994,
"learning_rate": 3.01654853687944e-05,
"loss": 0.3642,
"step": 483
},
{
"epoch": 1.989722507708119,
"grad_norm": 0.2861615252457176,
"learning_rate": 3.011593704718067e-05,
"loss": 0.3963,
"step": 484
},
{
"epoch": 1.9938335046248716,
"grad_norm": 0.3745753953644458,
"learning_rate": 3.0066305152711598e-05,
"loss": 0.3878,
"step": 485
},
{
"epoch": 1.9979445015416237,
"grad_norm": 0.26150625589651816,
"learning_rate": 3.0016590095421273e-05,
"loss": 0.3721,
"step": 486
},
{
"epoch": 2.0020554984583763,
"grad_norm": 0.3918210766291012,
"learning_rate": 2.9966792286030853e-05,
"loss": 0.3396,
"step": 487
},
{
"epoch": 2.0061664953751284,
"grad_norm": 0.315886174265335,
"learning_rate": 2.9916912135945147e-05,
"loss": 0.3326,
"step": 488
},
{
"epoch": 2.010277492291881,
"grad_norm": 0.5139005301093035,
"learning_rate": 2.986695005724921e-05,
"loss": 0.3331,
"step": 489
},
{
"epoch": 2.014388489208633,
"grad_norm": 0.4039956111942429,
"learning_rate": 2.9816906462704963e-05,
"loss": 0.3318,
"step": 490
},
{
"epoch": 2.0184994861253855,
"grad_norm": 0.3643964107370674,
"learning_rate": 2.9766781765747775e-05,
"loss": 0.331,
"step": 491
},
{
"epoch": 2.0226104830421376,
"grad_norm": 0.36816703584916016,
"learning_rate": 2.971657638048302e-05,
"loss": 0.3318,
"step": 492
},
{
"epoch": 2.02672147995889,
"grad_norm": 0.42271803167213406,
"learning_rate": 2.966629072168271e-05,
"loss": 0.3344,
"step": 493
},
{
"epoch": 2.0308324768756423,
"grad_norm": 0.3930653905455099,
"learning_rate": 2.9615925204782006e-05,
"loss": 0.3177,
"step": 494
},
{
"epoch": 2.034943473792395,
"grad_norm": 0.40048080993718765,
"learning_rate": 2.9565480245875845e-05,
"loss": 0.3358,
"step": 495
},
{
"epoch": 2.039054470709147,
"grad_norm": 0.3484759664627585,
"learning_rate": 2.9514956261715458e-05,
"loss": 0.3303,
"step": 496
},
{
"epoch": 2.0431654676258995,
"grad_norm": 1.2952724176128951,
"learning_rate": 2.9464353669704943e-05,
"loss": 0.3683,
"step": 497
},
{
"epoch": 2.0472764645426516,
"grad_norm": 0.3554075988337682,
"learning_rate": 2.9413672887897828e-05,
"loss": 0.34,
"step": 498
},
{
"epoch": 2.051387461459404,
"grad_norm": 0.5002880458311052,
"learning_rate": 2.936291433499359e-05,
"loss": 0.3304,
"step": 499
},
{
"epoch": 2.055498458376156,
"grad_norm": 0.37407564917246083,
"learning_rate": 2.9312078430334228e-05,
"loss": 0.3358,
"step": 500
},
{
"epoch": 2.0596094552929087,
"grad_norm": 0.4077464439245042,
"learning_rate": 2.926116559390078e-05,
"loss": 0.3232,
"step": 501
},
{
"epoch": 2.063720452209661,
"grad_norm": 0.3809714773736691,
"learning_rate": 2.921017624630984e-05,
"loss": 0.323,
"step": 502
},
{
"epoch": 2.0678314491264134,
"grad_norm": 0.311417329278231,
"learning_rate": 2.9159110808810125e-05,
"loss": 0.3192,
"step": 503
},
{
"epoch": 2.0719424460431655,
"grad_norm": 0.44030503896413653,
"learning_rate": 2.9107969703278952e-05,
"loss": 0.3354,
"step": 504
},
{
"epoch": 2.0760534429599176,
"grad_norm": 0.3036316440870094,
"learning_rate": 2.905675335221877e-05,
"loss": 0.3299,
"step": 505
},
{
"epoch": 2.08016443987667,
"grad_norm": 0.4247647928977746,
"learning_rate": 2.900546217875368e-05,
"loss": 0.3288,
"step": 506
},
{
"epoch": 2.084275436793422,
"grad_norm": 0.3390034407042314,
"learning_rate": 2.895409660662592e-05,
"loss": 0.3328,
"step": 507
},
{
"epoch": 2.0883864337101747,
"grad_norm": 0.4990365827984489,
"learning_rate": 2.8902657060192366e-05,
"loss": 0.3376,
"step": 508
},
{
"epoch": 2.092497430626927,
"grad_norm": 0.5173705351976455,
"learning_rate": 2.8851143964421048e-05,
"loss": 0.3356,
"step": 509
},
{
"epoch": 2.0966084275436794,
"grad_norm": 0.5151102205707064,
"learning_rate": 2.879955774488762e-05,
"loss": 0.332,
"step": 510
},
{
"epoch": 2.1007194244604315,
"grad_norm": 0.43417520836094964,
"learning_rate": 2.8747898827771846e-05,
"loss": 0.3389,
"step": 511
},
{
"epoch": 2.104830421377184,
"grad_norm": 0.5355654606933186,
"learning_rate": 2.8696167639854073e-05,
"loss": 0.341,
"step": 512
},
{
"epoch": 2.108941418293936,
"grad_norm": 0.4367393823993611,
"learning_rate": 2.864436460851173e-05,
"loss": 0.3299,
"step": 513
},
{
"epoch": 2.1130524152106887,
"grad_norm": 0.45783141095235763,
"learning_rate": 2.8592490161715768e-05,
"loss": 0.3191,
"step": 514
},
{
"epoch": 2.1171634121274407,
"grad_norm": 0.42545114058633565,
"learning_rate": 2.8540544728027145e-05,
"loss": 0.3145,
"step": 515
},
{
"epoch": 2.1212744090441933,
"grad_norm": 0.3661488589187853,
"learning_rate": 2.8488528736593278e-05,
"loss": 0.3275,
"step": 516
},
{
"epoch": 2.1253854059609454,
"grad_norm": 0.5060616601470208,
"learning_rate": 2.843644261714448e-05,
"loss": 0.3384,
"step": 517
},
{
"epoch": 2.129496402877698,
"grad_norm": 0.3580510131662911,
"learning_rate": 2.8384286799990452e-05,
"loss": 0.3296,
"step": 518
},
{
"epoch": 2.13360739979445,
"grad_norm": 0.45075270681673163,
"learning_rate": 2.8332061716016692e-05,
"loss": 0.32,
"step": 519
},
{
"epoch": 2.1377183967112026,
"grad_norm": 0.4708082264494772,
"learning_rate": 2.8279767796680934e-05,
"loss": 0.3332,
"step": 520
},
{
"epoch": 2.1418293936279547,
"grad_norm": 0.35417572710043976,
"learning_rate": 2.8227405474009616e-05,
"loss": 0.325,
"step": 521
},
{
"epoch": 2.145940390544707,
"grad_norm": 0.5178072041280041,
"learning_rate": 2.817497518059428e-05,
"loss": 0.3286,
"step": 522
},
{
"epoch": 2.1500513874614593,
"grad_norm": 0.333153745006992,
"learning_rate": 2.8122477349588005e-05,
"loss": 0.3247,
"step": 523
},
{
"epoch": 2.154162384378212,
"grad_norm": 0.5499040672396817,
"learning_rate": 2.8069912414701842e-05,
"loss": 0.3338,
"step": 524
},
{
"epoch": 2.158273381294964,
"grad_norm": 0.29956362280088755,
"learning_rate": 2.8017280810201213e-05,
"loss": 0.3307,
"step": 525
},
{
"epoch": 2.1623843782117165,
"grad_norm": 0.4276269537060341,
"learning_rate": 2.7964582970902338e-05,
"loss": 0.3263,
"step": 526
},
{
"epoch": 2.1664953751284686,
"grad_norm": 0.3311312720633184,
"learning_rate": 2.7911819332168627e-05,
"loss": 0.3302,
"step": 527
},
{
"epoch": 2.170606372045221,
"grad_norm": 0.32361165480350135,
"learning_rate": 2.78589903299071e-05,
"loss": 0.3307,
"step": 528
},
{
"epoch": 2.174717368961973,
"grad_norm": 0.33128441736832326,
"learning_rate": 2.7806096400564775e-05,
"loss": 0.3234,
"step": 529
},
{
"epoch": 2.1788283658787257,
"grad_norm": 0.2945513597575282,
"learning_rate": 2.7753137981125068e-05,
"loss": 0.3354,
"step": 530
},
{
"epoch": 2.182939362795478,
"grad_norm": 0.35769877925150756,
"learning_rate": 2.7700115509104176e-05,
"loss": 0.336,
"step": 531
},
{
"epoch": 2.1870503597122304,
"grad_norm": 0.3065613778661335,
"learning_rate": 2.7647029422547465e-05,
"loss": 0.3326,
"step": 532
},
{
"epoch": 2.1911613566289825,
"grad_norm": 0.31377341167653106,
"learning_rate": 2.7593880160025864e-05,
"loss": 0.3354,
"step": 533
},
{
"epoch": 2.195272353545735,
"grad_norm": 0.28252520009349796,
"learning_rate": 2.754066816063222e-05,
"loss": 0.3194,
"step": 534
},
{
"epoch": 2.199383350462487,
"grad_norm": 0.2792714767775337,
"learning_rate": 2.7487393863977687e-05,
"loss": 0.3369,
"step": 535
},
{
"epoch": 2.2034943473792397,
"grad_norm": 0.28232252447629436,
"learning_rate": 2.7434057710188077e-05,
"loss": 0.3157,
"step": 536
},
{
"epoch": 2.2076053442959918,
"grad_norm": 0.25752143372328223,
"learning_rate": 2.738066013990025e-05,
"loss": 0.3153,
"step": 537
},
{
"epoch": 2.2117163412127443,
"grad_norm": 0.297593757050134,
"learning_rate": 2.732720159425845e-05,
"loss": 0.3296,
"step": 538
},
{
"epoch": 2.2158273381294964,
"grad_norm": 0.2758026361391992,
"learning_rate": 2.7273682514910668e-05,
"loss": 0.3247,
"step": 539
},
{
"epoch": 2.2199383350462485,
"grad_norm": 0.3422530970797541,
"learning_rate": 2.7220103344004995e-05,
"loss": 0.3293,
"step": 540
},
{
"epoch": 2.224049331963001,
"grad_norm": 0.3668216989996492,
"learning_rate": 2.7166464524185977e-05,
"loss": 0.3419,
"step": 541
},
{
"epoch": 2.2281603288797536,
"grad_norm": 0.3353880633821636,
"learning_rate": 2.7112766498590944e-05,
"loss": 0.3277,
"step": 542
},
{
"epoch": 2.2322713257965057,
"grad_norm": 0.3990592979092236,
"learning_rate": 2.705900971084635e-05,
"loss": 0.3352,
"step": 543
},
{
"epoch": 2.2363823227132578,
"grad_norm": 0.34649020190108354,
"learning_rate": 2.7005194605064122e-05,
"loss": 0.3334,
"step": 544
},
{
"epoch": 2.2404933196300103,
"grad_norm": 0.30730771299144677,
"learning_rate": 2.6951321625837975e-05,
"loss": 0.3299,
"step": 545
},
{
"epoch": 2.2446043165467624,
"grad_norm": 0.3875753398426506,
"learning_rate": 2.6897391218239746e-05,
"loss": 0.3338,
"step": 546
},
{
"epoch": 2.248715313463515,
"grad_norm": 0.27365792996452604,
"learning_rate": 2.6843403827815714e-05,
"loss": 0.3353,
"step": 547
},
{
"epoch": 2.252826310380267,
"grad_norm": 0.3913041787492654,
"learning_rate": 2.6789359900582935e-05,
"loss": 0.3274,
"step": 548
},
{
"epoch": 2.2569373072970196,
"grad_norm": 0.25296864218831433,
"learning_rate": 2.673525988302553e-05,
"loss": 0.344,
"step": 549
},
{
"epoch": 2.2610483042137717,
"grad_norm": 0.3931054005221806,
"learning_rate": 2.6681104222091018e-05,
"loss": 0.3387,
"step": 550
},
{
"epoch": 2.265159301130524,
"grad_norm": 0.25058187158942646,
"learning_rate": 2.662689336518661e-05,
"loss": 0.3306,
"step": 551
},
{
"epoch": 2.2692702980472763,
"grad_norm": 0.34466530037047466,
"learning_rate": 2.6572627760175523e-05,
"loss": 0.334,
"step": 552
},
{
"epoch": 2.273381294964029,
"grad_norm": 0.27034275974079125,
"learning_rate": 2.6518307855373276e-05,
"loss": 0.3245,
"step": 553
},
{
"epoch": 2.277492291880781,
"grad_norm": 0.32305382508070213,
"learning_rate": 2.6463934099543992e-05,
"loss": 0.3337,
"step": 554
},
{
"epoch": 2.2816032887975335,
"grad_norm": 0.2943172520547782,
"learning_rate": 2.6409506941896665e-05,
"loss": 0.336,
"step": 555
},
{
"epoch": 2.2857142857142856,
"grad_norm": 0.3589575171976915,
"learning_rate": 2.6355026832081493e-05,
"loss": 0.331,
"step": 556
},
{
"epoch": 2.289825282631038,
"grad_norm": 0.32434981120796447,
"learning_rate": 2.6300494220186113e-05,
"loss": 0.3318,
"step": 557
},
{
"epoch": 2.2939362795477902,
"grad_norm": 0.3207602935494296,
"learning_rate": 2.6245909556731937e-05,
"loss": 0.3244,
"step": 558
},
{
"epoch": 2.2980472764645428,
"grad_norm": 0.2994113594865251,
"learning_rate": 2.6191273292670372e-05,
"loss": 0.3342,
"step": 559
},
{
"epoch": 2.302158273381295,
"grad_norm": 0.3188506407975691,
"learning_rate": 2.6136585879379145e-05,
"loss": 0.3394,
"step": 560
},
{
"epoch": 2.3062692702980474,
"grad_norm": 0.3098768532791999,
"learning_rate": 2.608184776865854e-05,
"loss": 0.3289,
"step": 561
},
{
"epoch": 2.3103802672147995,
"grad_norm": 0.30655390743691074,
"learning_rate": 2.602705941272769e-05,
"loss": 0.322,
"step": 562
},
{
"epoch": 2.314491264131552,
"grad_norm": 0.3063020702615511,
"learning_rate": 2.597222126422081e-05,
"loss": 0.3332,
"step": 563
},
{
"epoch": 2.318602261048304,
"grad_norm": 0.3156706802866886,
"learning_rate": 2.5917333776183503e-05,
"loss": 0.3228,
"step": 564
},
{
"epoch": 2.3227132579650567,
"grad_norm": 0.28723121088598647,
"learning_rate": 2.586239740206897e-05,
"loss": 0.3197,
"step": 565
},
{
"epoch": 2.3268242548818088,
"grad_norm": 0.30433476866497944,
"learning_rate": 2.5807412595734283e-05,
"loss": 0.3279,
"step": 566
},
{
"epoch": 2.3309352517985613,
"grad_norm": 0.25756394575456126,
"learning_rate": 2.5752379811436655e-05,
"loss": 0.324,
"step": 567
},
{
"epoch": 2.3350462487153134,
"grad_norm": 0.3098697810639567,
"learning_rate": 2.5697299503829657e-05,
"loss": 0.3347,
"step": 568
},
{
"epoch": 2.339157245632066,
"grad_norm": 0.30837147852538255,
"learning_rate": 2.5642172127959475e-05,
"loss": 0.3292,
"step": 569
},
{
"epoch": 2.343268242548818,
"grad_norm": 0.3069265524451522,
"learning_rate": 2.558699813926115e-05,
"loss": 0.3323,
"step": 570
},
{
"epoch": 2.3473792394655706,
"grad_norm": 0.32407359000068336,
"learning_rate": 2.5531777993554813e-05,
"loss": 0.3317,
"step": 571
},
{
"epoch": 2.3514902363823227,
"grad_norm": 0.3118278827236543,
"learning_rate": 2.5476512147041926e-05,
"loss": 0.3428,
"step": 572
},
{
"epoch": 2.3556012332990752,
"grad_norm": 0.3342461379651357,
"learning_rate": 2.5421201056301507e-05,
"loss": 0.3284,
"step": 573
},
{
"epoch": 2.3597122302158273,
"grad_norm": 0.29958642203118996,
"learning_rate": 2.5365845178286358e-05,
"loss": 0.3275,
"step": 574
},
{
"epoch": 2.3638232271325794,
"grad_norm": 0.3328808710382115,
"learning_rate": 2.5310444970319292e-05,
"loss": 0.3301,
"step": 575
},
{
"epoch": 2.367934224049332,
"grad_norm": 0.3782109254880134,
"learning_rate": 2.525500089008936e-05,
"loss": 0.3375,
"step": 576
},
{
"epoch": 2.3720452209660845,
"grad_norm": 0.3451832289715049,
"learning_rate": 2.5199513395648047e-05,
"loss": 0.3207,
"step": 577
},
{
"epoch": 2.3761562178828366,
"grad_norm": 0.34430102536898843,
"learning_rate": 2.5143982945405527e-05,
"loss": 0.3335,
"step": 578
},
{
"epoch": 2.3802672147995887,
"grad_norm": 0.3480033297872511,
"learning_rate": 2.5088409998126827e-05,
"loss": 0.3364,
"step": 579
},
{
"epoch": 2.3843782117163412,
"grad_norm": 0.2864218297613634,
"learning_rate": 2.5032795012928093e-05,
"loss": 0.3296,
"step": 580
},
{
"epoch": 2.3884892086330938,
"grad_norm": 0.27656367755763744,
"learning_rate": 2.4977138449272746e-05,
"loss": 0.3252,
"step": 581
},
{
"epoch": 2.392600205549846,
"grad_norm": 0.3210933436925842,
"learning_rate": 2.4921440766967718e-05,
"loss": 0.3292,
"step": 582
},
{
"epoch": 2.396711202466598,
"grad_norm": 0.2695136624951651,
"learning_rate": 2.4865702426159633e-05,
"loss": 0.3345,
"step": 583
},
{
"epoch": 2.4008221993833505,
"grad_norm": 0.2853367762196653,
"learning_rate": 2.4809923887331028e-05,
"loss": 0.3272,
"step": 584
},
{
"epoch": 2.4049331963001026,
"grad_norm": 0.26413255067697416,
"learning_rate": 2.4754105611296534e-05,
"loss": 0.3244,
"step": 585
},
{
"epoch": 2.409044193216855,
"grad_norm": 0.2788852049644498,
"learning_rate": 2.4698248059199056e-05,
"loss": 0.3211,
"step": 586
},
{
"epoch": 2.4131551901336072,
"grad_norm": 0.2839043038975584,
"learning_rate": 2.4642351692505998e-05,
"loss": 0.3227,
"step": 587
},
{
"epoch": 2.41726618705036,
"grad_norm": 0.26795507769344473,
"learning_rate": 2.4586416973005414e-05,
"loss": 0.3255,
"step": 588
},
{
"epoch": 2.421377183967112,
"grad_norm": 0.28061689093119546,
"learning_rate": 2.453044436280223e-05,
"loss": 0.3297,
"step": 589
},
{
"epoch": 2.4254881808838644,
"grad_norm": 0.2768666834694627,
"learning_rate": 2.4474434324314388e-05,
"loss": 0.3351,
"step": 590
},
{
"epoch": 2.4295991778006165,
"grad_norm": 0.2992581902997908,
"learning_rate": 2.4418387320269047e-05,
"loss": 0.3185,
"step": 591
},
{
"epoch": 2.433710174717369,
"grad_norm": 0.2582002464494716,
"learning_rate": 2.4362303813698766e-05,
"loss": 0.3262,
"step": 592
},
{
"epoch": 2.437821171634121,
"grad_norm": 0.288633230170238,
"learning_rate": 2.4306184267937654e-05,
"loss": 0.3317,
"step": 593
},
{
"epoch": 2.4419321685508737,
"grad_norm": 0.2642364711177551,
"learning_rate": 2.425002914661758e-05,
"loss": 0.3325,
"step": 594
},
{
"epoch": 2.446043165467626,
"grad_norm": 0.28079190224987655,
"learning_rate": 2.419383891366431e-05,
"loss": 0.3186,
"step": 595
},
{
"epoch": 2.4501541623843783,
"grad_norm": 0.2517139036005103,
"learning_rate": 2.4137614033293676e-05,
"loss": 0.3325,
"step": 596
},
{
"epoch": 2.4542651593011304,
"grad_norm": 0.3048176393508488,
"learning_rate": 2.408135497000776e-05,
"loss": 0.3258,
"step": 597
},
{
"epoch": 2.458376156217883,
"grad_norm": 0.2553179117187841,
"learning_rate": 2.4025062188591046e-05,
"loss": 0.3286,
"step": 598
},
{
"epoch": 2.462487153134635,
"grad_norm": 0.297522330860201,
"learning_rate": 2.3968736154106574e-05,
"loss": 0.3257,
"step": 599
},
{
"epoch": 2.4665981500513876,
"grad_norm": 0.26227969960383657,
"learning_rate": 2.3912377331892112e-05,
"loss": 0.3348,
"step": 600
},
{
"epoch": 2.4707091469681397,
"grad_norm": 0.2678339048494993,
"learning_rate": 2.3855986187556295e-05,
"loss": 0.3247,
"step": 601
},
{
"epoch": 2.4748201438848922,
"grad_norm": 0.2782462750099432,
"learning_rate": 2.3799563186974802e-05,
"loss": 0.3288,
"step": 602
},
{
"epoch": 2.4789311408016443,
"grad_norm": 0.26942196233507953,
"learning_rate": 2.374310879628647e-05,
"loss": 0.3343,
"step": 603
},
{
"epoch": 2.483042137718397,
"grad_norm": 0.25192857481609987,
"learning_rate": 2.3686623481889496e-05,
"loss": 0.3355,
"step": 604
},
{
"epoch": 2.487153134635149,
"grad_norm": 0.27024432726841424,
"learning_rate": 2.3630107710437526e-05,
"loss": 0.3296,
"step": 605
},
{
"epoch": 2.4912641315519015,
"grad_norm": 0.2491507090752715,
"learning_rate": 2.3573561948835836e-05,
"loss": 0.3421,
"step": 606
},
{
"epoch": 2.4953751284686536,
"grad_norm": 0.3126612318343971,
"learning_rate": 2.3516986664237474e-05,
"loss": 0.3254,
"step": 607
},
{
"epoch": 2.499486125385406,
"grad_norm": 0.2643267374371664,
"learning_rate": 2.3460382324039377e-05,
"loss": 0.3272,
"step": 608
},
{
"epoch": 2.5035971223021583,
"grad_norm": 0.2786020179741824,
"learning_rate": 2.3403749395878542e-05,
"loss": 0.3292,
"step": 609
},
{
"epoch": 2.5077081192189103,
"grad_norm": 0.2873861028514028,
"learning_rate": 2.3347088347628128e-05,
"loss": 0.3307,
"step": 610
},
{
"epoch": 2.511819116135663,
"grad_norm": 0.2772071301023664,
"learning_rate": 2.3290399647393628e-05,
"loss": 0.324,
"step": 611
},
{
"epoch": 2.5159301130524154,
"grad_norm": 0.27537902676408144,
"learning_rate": 2.3233683763508957e-05,
"loss": 0.3343,
"step": 612
},
{
"epoch": 2.5200411099691675,
"grad_norm": 0.2859207355422494,
"learning_rate": 2.317694116453263e-05,
"loss": 0.34,
"step": 613
},
{
"epoch": 2.5241521068859196,
"grad_norm": 0.2743835098944321,
"learning_rate": 2.3120172319243864e-05,
"loss": 0.3338,
"step": 614
},
{
"epoch": 2.528263103802672,
"grad_norm": 0.27595779721707764,
"learning_rate": 2.3063377696638707e-05,
"loss": 0.3311,
"step": 615
},
{
"epoch": 2.5323741007194247,
"grad_norm": 0.26843945933414415,
"learning_rate": 2.300655776592616e-05,
"loss": 0.3335,
"step": 616
},
{
"epoch": 2.536485097636177,
"grad_norm": 0.25648610540979605,
"learning_rate": 2.294971299652432e-05,
"loss": 0.3235,
"step": 617
},
{
"epoch": 2.540596094552929,
"grad_norm": 0.3013681074148862,
"learning_rate": 2.2892843858056474e-05,
"loss": 0.3321,
"step": 618
},
{
"epoch": 2.5447070914696814,
"grad_norm": 0.24919313896376655,
"learning_rate": 2.283595082034725e-05,
"loss": 0.3167,
"step": 619
},
{
"epoch": 2.548818088386434,
"grad_norm": 0.2688625414735968,
"learning_rate": 2.2779034353418707e-05,
"loss": 0.3324,
"step": 620
},
{
"epoch": 2.552929085303186,
"grad_norm": 0.26263149016674175,
"learning_rate": 2.2722094927486472e-05,
"loss": 0.3286,
"step": 621
},
{
"epoch": 2.557040082219938,
"grad_norm": 0.2823135658125824,
"learning_rate": 2.2665133012955844e-05,
"loss": 0.3383,
"step": 622
},
{
"epoch": 2.5611510791366907,
"grad_norm": 0.276217133090313,
"learning_rate": 2.2608149080417913e-05,
"loss": 0.3289,
"step": 623
},
{
"epoch": 2.565262076053443,
"grad_norm": 0.2850565964189695,
"learning_rate": 2.2551143600645672e-05,
"loss": 0.3244,
"step": 624
},
{
"epoch": 2.5693730729701953,
"grad_norm": 0.26362750208519725,
"learning_rate": 2.249411704459013e-05,
"loss": 0.3361,
"step": 625
},
{
"epoch": 2.5734840698869474,
"grad_norm": 0.2798643173147311,
"learning_rate": 2.2437069883376404e-05,
"loss": 0.3142,
"step": 626
},
{
"epoch": 2.5775950668037,
"grad_norm": 0.2563288966482464,
"learning_rate": 2.238000258829986e-05,
"loss": 0.324,
"step": 627
},
{
"epoch": 2.581706063720452,
"grad_norm": 0.30508227611107025,
"learning_rate": 2.2322915630822184e-05,
"loss": 0.3226,
"step": 628
},
{
"epoch": 2.5858170606372046,
"grad_norm": 0.25091522041815256,
"learning_rate": 2.226580948256751e-05,
"loss": 0.3315,
"step": 629
},
{
"epoch": 2.5899280575539567,
"grad_norm": 0.32219820646356984,
"learning_rate": 2.2208684615318515e-05,
"loss": 0.3291,
"step": 630
},
{
"epoch": 2.5940390544707093,
"grad_norm": 0.2642592817496665,
"learning_rate": 2.2151541501012526e-05,
"loss": 0.3348,
"step": 631
},
{
"epoch": 2.5981500513874614,
"grad_norm": 0.2696044140880529,
"learning_rate": 2.2094380611737615e-05,
"loss": 0.336,
"step": 632
},
{
"epoch": 2.602261048304214,
"grad_norm": 0.2778886911936094,
"learning_rate": 2.20372024197287e-05,
"loss": 0.3221,
"step": 633
},
{
"epoch": 2.606372045220966,
"grad_norm": 0.26840999460258913,
"learning_rate": 2.1980007397363653e-05,
"loss": 0.3283,
"step": 634
},
{
"epoch": 2.6104830421377185,
"grad_norm": 0.30677147336816346,
"learning_rate": 2.1922796017159382e-05,
"loss": 0.3391,
"step": 635
},
{
"epoch": 2.6145940390544706,
"grad_norm": 0.26204192651719005,
"learning_rate": 2.186556875176794e-05,
"loss": 0.3181,
"step": 636
},
{
"epoch": 2.618705035971223,
"grad_norm": 0.29330464889106106,
"learning_rate": 2.1808326073972618e-05,
"loss": 0.3334,
"step": 637
},
{
"epoch": 2.6228160328879753,
"grad_norm": 0.3611585390826276,
"learning_rate": 2.1751068456684026e-05,
"loss": 0.3328,
"step": 638
},
{
"epoch": 2.626927029804728,
"grad_norm": 0.32081188768018193,
"learning_rate": 2.1693796372936207e-05,
"loss": 0.3348,
"step": 639
},
{
"epoch": 2.63103802672148,
"grad_norm": 0.3466957904122417,
"learning_rate": 2.1636510295882723e-05,
"loss": 0.3287,
"step": 640
},
{
"epoch": 2.635149023638232,
"grad_norm": 0.3325190809270464,
"learning_rate": 2.1579210698792724e-05,
"loss": 0.3357,
"step": 641
},
{
"epoch": 2.6392600205549845,
"grad_norm": 0.323613824705376,
"learning_rate": 2.1521898055047065e-05,
"loss": 0.3254,
"step": 642
},
{
"epoch": 2.643371017471737,
"grad_norm": 0.31700835111056935,
"learning_rate": 2.1464572838134393e-05,
"loss": 0.3405,
"step": 643
},
{
"epoch": 2.647482014388489,
"grad_norm": 0.31194064100135144,
"learning_rate": 2.1407235521647216e-05,
"loss": 0.3337,
"step": 644
},
{
"epoch": 2.6515930113052413,
"grad_norm": 0.291054868309333,
"learning_rate": 2.134988657927802e-05,
"loss": 0.3223,
"step": 645
},
{
"epoch": 2.655704008221994,
"grad_norm": 0.28960930247219024,
"learning_rate": 2.129252648481532e-05,
"loss": 0.3399,
"step": 646
},
{
"epoch": 2.6598150051387464,
"grad_norm": 0.262272292175284,
"learning_rate": 2.123515571213977e-05,
"loss": 0.3199,
"step": 647
},
{
"epoch": 2.6639260020554985,
"grad_norm": 0.3430422990168527,
"learning_rate": 2.1177774735220246e-05,
"loss": 0.3211,
"step": 648
},
{
"epoch": 2.6680369989722506,
"grad_norm": 0.24490577578554293,
"learning_rate": 2.1120384028109928e-05,
"loss": 0.3347,
"step": 649
},
{
"epoch": 2.672147995889003,
"grad_norm": 0.3135561697948168,
"learning_rate": 2.106298406494237e-05,
"loss": 0.337,
"step": 650
},
{
"epoch": 2.6762589928057556,
"grad_norm": 0.2536708220913538,
"learning_rate": 2.1005575319927606e-05,
"loss": 0.3286,
"step": 651
},
{
"epoch": 2.6803699897225077,
"grad_norm": 0.2905534330712754,
"learning_rate": 2.094815826734822e-05,
"loss": 0.3344,
"step": 652
},
{
"epoch": 2.68448098663926,
"grad_norm": 0.255577529722107,
"learning_rate": 2.089073338155542e-05,
"loss": 0.3347,
"step": 653
},
{
"epoch": 2.6885919835560124,
"grad_norm": 0.3169225043435795,
"learning_rate": 2.0833301136965138e-05,
"loss": 0.3368,
"step": 654
},
{
"epoch": 2.692702980472765,
"grad_norm": 0.24523301662966585,
"learning_rate": 2.0775862008054102e-05,
"loss": 0.3317,
"step": 655
},
{
"epoch": 2.696813977389517,
"grad_norm": 0.3377639592657221,
"learning_rate": 2.0718416469355917e-05,
"loss": 0.3327,
"step": 656
},
{
"epoch": 2.700924974306269,
"grad_norm": 0.2760670088077706,
"learning_rate": 2.066096499545712e-05,
"loss": 0.3254,
"step": 657
},
{
"epoch": 2.7050359712230216,
"grad_norm": 0.33508069878850794,
"learning_rate": 2.0603508060993306e-05,
"loss": 0.3324,
"step": 658
},
{
"epoch": 2.7091469681397737,
"grad_norm": 0.2888367467526053,
"learning_rate": 2.0546046140645178e-05,
"loss": 0.33,
"step": 659
},
{
"epoch": 2.7132579650565263,
"grad_norm": 0.2851449912230599,
"learning_rate": 2.0488579709134623e-05,
"loss": 0.3375,
"step": 660
},
{
"epoch": 2.7173689619732784,
"grad_norm": 0.28857625298935113,
"learning_rate": 2.04311092412208e-05,
"loss": 0.3324,
"step": 661
},
{
"epoch": 2.721479958890031,
"grad_norm": 0.3099583754195003,
"learning_rate": 2.0373635211696214e-05,
"loss": 0.331,
"step": 662
},
{
"epoch": 2.725590955806783,
"grad_norm": 0.3033491487781276,
"learning_rate": 2.0316158095382797e-05,
"loss": 0.3354,
"step": 663
},
{
"epoch": 2.7297019527235356,
"grad_norm": 0.2619645136131476,
"learning_rate": 2.0258678367127972e-05,
"loss": 0.3236,
"step": 664
},
{
"epoch": 2.7338129496402876,
"grad_norm": 0.30087165164939694,
"learning_rate": 2.0201196501800768e-05,
"loss": 0.3232,
"step": 665
},
{
"epoch": 2.73792394655704,
"grad_norm": 0.2738871408955049,
"learning_rate": 2.0143712974287838e-05,
"loss": 0.3277,
"step": 666
},
{
"epoch": 2.7420349434737923,
"grad_norm": 0.29720770878473823,
"learning_rate": 2.0086228259489578e-05,
"loss": 0.3419,
"step": 667
},
{
"epoch": 2.746145940390545,
"grad_norm": 0.24745189549975016,
"learning_rate": 2.0028742832316202e-05,
"loss": 0.3241,
"step": 668
},
{
"epoch": 2.750256937307297,
"grad_norm": 0.26535109416407787,
"learning_rate": 1.99712571676838e-05,
"loss": 0.3206,
"step": 669
},
{
"epoch": 2.7543679342240495,
"grad_norm": 0.28264530686991374,
"learning_rate": 1.9913771740510426e-05,
"loss": 0.3441,
"step": 670
},
{
"epoch": 2.7584789311408016,
"grad_norm": 0.2439511505193991,
"learning_rate": 1.9856287025712172e-05,
"loss": 0.3327,
"step": 671
},
{
"epoch": 2.762589928057554,
"grad_norm": 0.25559437564690174,
"learning_rate": 1.979880349819924e-05,
"loss": 0.3325,
"step": 672
},
{
"epoch": 2.766700924974306,
"grad_norm": 0.22699680175413017,
"learning_rate": 1.974132163287203e-05,
"loss": 0.329,
"step": 673
},
{
"epoch": 2.7708119218910587,
"grad_norm": 0.2582116792070818,
"learning_rate": 1.9683841904617217e-05,
"loss": 0.3319,
"step": 674
},
{
"epoch": 2.774922918807811,
"grad_norm": 0.24489677360477968,
"learning_rate": 1.9626364788303796e-05,
"loss": 0.3313,
"step": 675
},
{
"epoch": 2.779033915724563,
"grad_norm": 0.29973564906267575,
"learning_rate": 1.956889075877921e-05,
"loss": 0.3359,
"step": 676
},
{
"epoch": 2.7831449126413155,
"grad_norm": 0.24725710379682117,
"learning_rate": 1.9511420290865387e-05,
"loss": 0.3269,
"step": 677
},
{
"epoch": 2.787255909558068,
"grad_norm": 0.254114192213977,
"learning_rate": 1.945395385935483e-05,
"loss": 0.3315,
"step": 678
},
{
"epoch": 2.79136690647482,
"grad_norm": 0.2500901168277256,
"learning_rate": 1.9396491939006693e-05,
"loss": 0.317,
"step": 679
},
{
"epoch": 2.795477903391572,
"grad_norm": 0.2506907960447071,
"learning_rate": 1.9339035004542883e-05,
"loss": 0.3355,
"step": 680
},
{
"epoch": 2.7995889003083247,
"grad_norm": 0.254060820338398,
"learning_rate": 1.9281583530644087e-05,
"loss": 0.3274,
"step": 681
},
{
"epoch": 2.8036998972250773,
"grad_norm": 0.22470917674479732,
"learning_rate": 1.9224137991945898e-05,
"loss": 0.3161,
"step": 682
},
{
"epoch": 2.8078108941418294,
"grad_norm": 0.25617294354628883,
"learning_rate": 1.9166698863034865e-05,
"loss": 0.3326,
"step": 683
},
{
"epoch": 2.8119218910585815,
"grad_norm": 0.24637096854415516,
"learning_rate": 1.910926661844459e-05,
"loss": 0.3306,
"step": 684
},
{
"epoch": 2.816032887975334,
"grad_norm": 0.23065590461427085,
"learning_rate": 1.905184173265179e-05,
"loss": 0.3285,
"step": 685
},
{
"epoch": 2.8201438848920866,
"grad_norm": 0.25223738900179504,
"learning_rate": 1.89944246800724e-05,
"loss": 0.3315,
"step": 686
},
{
"epoch": 2.8242548818088387,
"grad_norm": 0.2813788401987118,
"learning_rate": 1.8937015935057637e-05,
"loss": 0.343,
"step": 687
},
{
"epoch": 2.8283658787255908,
"grad_norm": 0.23658155464390826,
"learning_rate": 1.887961597189008e-05,
"loss": 0.3361,
"step": 688
},
{
"epoch": 2.8324768756423433,
"grad_norm": 0.2560263043866784,
"learning_rate": 1.8822225264779757e-05,
"loss": 0.336,
"step": 689
},
{
"epoch": 2.836587872559096,
"grad_norm": 0.24171456841261904,
"learning_rate": 1.8764844287860235e-05,
"loss": 0.3155,
"step": 690
},
{
"epoch": 2.840698869475848,
"grad_norm": 0.2709130278349106,
"learning_rate": 1.8707473515184686e-05,
"loss": 0.3347,
"step": 691
},
{
"epoch": 2.8448098663926,
"grad_norm": 0.2389464904458257,
"learning_rate": 1.8650113420721985e-05,
"loss": 0.3261,
"step": 692
},
{
"epoch": 2.8489208633093526,
"grad_norm": 0.23853438478287736,
"learning_rate": 1.8592764478352788e-05,
"loss": 0.3269,
"step": 693
},
{
"epoch": 2.8530318602261047,
"grad_norm": 0.24002347978417551,
"learning_rate": 1.8535427161865617e-05,
"loss": 0.3273,
"step": 694
},
{
"epoch": 2.857142857142857,
"grad_norm": 0.24648736679259559,
"learning_rate": 1.8478101944952946e-05,
"loss": 0.3336,
"step": 695
},
{
"epoch": 2.8612538540596093,
"grad_norm": 0.2539305109029615,
"learning_rate": 1.842078930120729e-05,
"loss": 0.3285,
"step": 696
},
{
"epoch": 2.865364850976362,
"grad_norm": 0.25402455868598073,
"learning_rate": 1.836348970411729e-05,
"loss": 0.3415,
"step": 697
},
{
"epoch": 2.869475847893114,
"grad_norm": 0.2557721072489133,
"learning_rate": 1.8306203627063803e-05,
"loss": 0.3324,
"step": 698
},
{
"epoch": 2.8735868448098665,
"grad_norm": 0.24149579536024937,
"learning_rate": 1.8248931543315974e-05,
"loss": 0.3327,
"step": 699
},
{
"epoch": 2.8776978417266186,
"grad_norm": 0.2618118707433648,
"learning_rate": 1.8191673926027386e-05,
"loss": 0.336,
"step": 700
},
{
"epoch": 2.881808838643371,
"grad_norm": 0.22607821223026145,
"learning_rate": 1.813443124823206e-05,
"loss": 0.3366,
"step": 701
},
{
"epoch": 2.885919835560123,
"grad_norm": 0.2939881318908373,
"learning_rate": 1.807720398284062e-05,
"loss": 0.3279,
"step": 702
},
{
"epoch": 2.8900308324768758,
"grad_norm": 0.2260123918569214,
"learning_rate": 1.801999260263635e-05,
"loss": 0.3337,
"step": 703
},
{
"epoch": 2.894141829393628,
"grad_norm": 0.24275524474075025,
"learning_rate": 1.7962797580271303e-05,
"loss": 0.3244,
"step": 704
},
{
"epoch": 2.8982528263103804,
"grad_norm": 0.2665250980386111,
"learning_rate": 1.790561938826239e-05,
"loss": 0.3341,
"step": 705
},
{
"epoch": 2.9023638232271325,
"grad_norm": 0.2439607446787407,
"learning_rate": 1.784845849898748e-05,
"loss": 0.3177,
"step": 706
},
{
"epoch": 2.906474820143885,
"grad_norm": 0.22414006570173825,
"learning_rate": 1.7791315384681488e-05,
"loss": 0.3199,
"step": 707
},
{
"epoch": 2.910585817060637,
"grad_norm": 0.29428160100150474,
"learning_rate": 1.7734190517432498e-05,
"loss": 0.3276,
"step": 708
},
{
"epoch": 2.9146968139773897,
"grad_norm": 0.23652581393109087,
"learning_rate": 1.7677084369177823e-05,
"loss": 0.3252,
"step": 709
},
{
"epoch": 2.9188078108941418,
"grad_norm": 0.2638103273242896,
"learning_rate": 1.7619997411700146e-05,
"loss": 0.3269,
"step": 710
},
{
"epoch": 2.9229188078108943,
"grad_norm": 0.23905327112200858,
"learning_rate": 1.7562930116623602e-05,
"loss": 0.3221,
"step": 711
},
{
"epoch": 2.9270298047276464,
"grad_norm": 0.24681045434883284,
"learning_rate": 1.750588295540988e-05,
"loss": 0.3265,
"step": 712
},
{
"epoch": 2.931140801644399,
"grad_norm": 0.22345235272171315,
"learning_rate": 1.7448856399354335e-05,
"loss": 0.331,
"step": 713
},
{
"epoch": 2.935251798561151,
"grad_norm": 0.2620096260034816,
"learning_rate": 1.7391850919582097e-05,
"loss": 0.3133,
"step": 714
},
{
"epoch": 2.939362795477903,
"grad_norm": 0.24255411379013975,
"learning_rate": 1.733486698704417e-05,
"loss": 0.3345,
"step": 715
},
{
"epoch": 2.9434737923946557,
"grad_norm": 0.2677764953102307,
"learning_rate": 1.7277905072513538e-05,
"loss": 0.3125,
"step": 716
},
{
"epoch": 2.947584789311408,
"grad_norm": 0.23338472896647094,
"learning_rate": 1.7220965646581304e-05,
"loss": 0.3329,
"step": 717
},
{
"epoch": 2.9516957862281603,
"grad_norm": 0.256626369930556,
"learning_rate": 1.7164049179652762e-05,
"loss": 0.342,
"step": 718
},
{
"epoch": 2.9558067831449124,
"grad_norm": 0.24204486389961907,
"learning_rate": 1.7107156141943536e-05,
"loss": 0.3317,
"step": 719
},
{
"epoch": 2.959917780061665,
"grad_norm": 0.23154113899503126,
"learning_rate": 1.7050287003475684e-05,
"loss": 0.338,
"step": 720
},
{
"epoch": 2.9640287769784175,
"grad_norm": 0.2515108652826035,
"learning_rate": 1.699344223407384e-05,
"loss": 0.3221,
"step": 721
},
{
"epoch": 2.9681397738951696,
"grad_norm": 0.2508115805000718,
"learning_rate": 1.6936622303361292e-05,
"loss": 0.3272,
"step": 722
},
{
"epoch": 2.9722507708119217,
"grad_norm": 0.24166143586067093,
"learning_rate": 1.6879827680756132e-05,
"loss": 0.3283,
"step": 723
},
{
"epoch": 2.9763617677286742,
"grad_norm": 0.25924430328001846,
"learning_rate": 1.682305883546737e-05,
"loss": 0.3297,
"step": 724
},
{
"epoch": 2.9804727646454268,
"grad_norm": 0.26555305739163787,
"learning_rate": 1.6766316236491046e-05,
"loss": 0.3314,
"step": 725
},
{
"epoch": 2.984583761562179,
"grad_norm": 0.24257798147799245,
"learning_rate": 1.6709600352606382e-05,
"loss": 0.3238,
"step": 726
},
{
"epoch": 2.988694758478931,
"grad_norm": 0.2772710049751061,
"learning_rate": 1.665291165237188e-05,
"loss": 0.3274,
"step": 727
},
{
"epoch": 2.9928057553956835,
"grad_norm": 0.22542030051008188,
"learning_rate": 1.6596250604121468e-05,
"loss": 0.3328,
"step": 728
},
{
"epoch": 2.996916752312436,
"grad_norm": 0.27500500626350044,
"learning_rate": 1.653961767596063e-05,
"loss": 0.3335,
"step": 729
},
{
"epoch": 3.001027749229188,
"grad_norm": 0.270309023452525,
"learning_rate": 1.6483013335762536e-05,
"loss": 0.2946,
"step": 730
},
{
"epoch": 3.0051387461459402,
"grad_norm": 0.31524306413172215,
"learning_rate": 1.6426438051164168e-05,
"loss": 0.2781,
"step": 731
},
{
"epoch": 3.0092497430626928,
"grad_norm": 0.5781264673297727,
"learning_rate": 1.636989228956248e-05,
"loss": 0.2843,
"step": 732
},
{
"epoch": 3.013360739979445,
"grad_norm": 0.3024860387001426,
"learning_rate": 1.631337651811051e-05,
"loss": 0.2747,
"step": 733
},
{
"epoch": 3.0174717368961974,
"grad_norm": 0.3934556205957313,
"learning_rate": 1.6256891203713533e-05,
"loss": 0.2728,
"step": 734
},
{
"epoch": 3.0215827338129495,
"grad_norm": 0.3129040186390879,
"learning_rate": 1.6200436813025208e-05,
"loss": 0.2736,
"step": 735
},
{
"epoch": 3.025693730729702,
"grad_norm": 0.35042448311035457,
"learning_rate": 1.6144013812443712e-05,
"loss": 0.288,
"step": 736
},
{
"epoch": 3.029804727646454,
"grad_norm": 0.32431756468327383,
"learning_rate": 1.60876226681079e-05,
"loss": 0.2675,
"step": 737
},
{
"epoch": 3.0339157245632067,
"grad_norm": 0.3106864020345642,
"learning_rate": 1.6031263845893436e-05,
"loss": 0.2696,
"step": 738
},
{
"epoch": 3.038026721479959,
"grad_norm": 0.2918561704720298,
"learning_rate": 1.5974937811408964e-05,
"loss": 0.2806,
"step": 739
},
{
"epoch": 3.0421377183967113,
"grad_norm": 0.2824200303583143,
"learning_rate": 1.5918645029992237e-05,
"loss": 0.2669,
"step": 740
},
{
"epoch": 3.0462487153134634,
"grad_norm": 0.2751071529215866,
"learning_rate": 1.5862385966706324e-05,
"loss": 0.2827,
"step": 741
},
{
"epoch": 3.050359712230216,
"grad_norm": 0.2659741638733503,
"learning_rate": 1.580616108633569e-05,
"loss": 0.2772,
"step": 742
},
{
"epoch": 3.054470709146968,
"grad_norm": 0.2705999112071291,
"learning_rate": 1.5749970853382416e-05,
"loss": 0.2813,
"step": 743
},
{
"epoch": 3.0585817060637206,
"grad_norm": 0.2678804686149958,
"learning_rate": 1.5693815732062346e-05,
"loss": 0.2786,
"step": 744
},
{
"epoch": 3.0626927029804727,
"grad_norm": 0.28891827034365974,
"learning_rate": 1.563769618630124e-05,
"loss": 0.2781,
"step": 745
},
{
"epoch": 3.0668036998972252,
"grad_norm": 0.2608278243848426,
"learning_rate": 1.558161267973096e-05,
"loss": 0.2811,
"step": 746
},
{
"epoch": 3.0709146968139773,
"grad_norm": 0.28779231459872495,
"learning_rate": 1.552556567568562e-05,
"loss": 0.2731,
"step": 747
},
{
"epoch": 3.07502569373073,
"grad_norm": 0.27172400888062603,
"learning_rate": 1.5469555637197775e-05,
"loss": 0.273,
"step": 748
},
{
"epoch": 3.079136690647482,
"grad_norm": 0.284714544394066,
"learning_rate": 1.541358302699459e-05,
"loss": 0.2737,
"step": 749
},
{
"epoch": 3.0832476875642345,
"grad_norm": 0.27108204101689876,
"learning_rate": 1.535764830749401e-05,
"loss": 0.2719,
"step": 750
},
{
"epoch": 3.0873586844809866,
"grad_norm": 0.26218248343837663,
"learning_rate": 1.5301751940800947e-05,
"loss": 0.2702,
"step": 751
},
{
"epoch": 3.091469681397739,
"grad_norm": 0.27396785993274086,
"learning_rate": 1.5245894388703473e-05,
"loss": 0.2746,
"step": 752
},
{
"epoch": 3.0955806783144912,
"grad_norm": 0.26457645017367387,
"learning_rate": 1.5190076112668975e-05,
"loss": 0.2741,
"step": 753
},
{
"epoch": 3.099691675231244,
"grad_norm": 0.2669417946440861,
"learning_rate": 1.5134297573840373e-05,
"loss": 0.2609,
"step": 754
},
{
"epoch": 3.103802672147996,
"grad_norm": 0.24350309961263825,
"learning_rate": 1.507855923303229e-05,
"loss": 0.2683,
"step": 755
},
{
"epoch": 3.1079136690647484,
"grad_norm": 0.24218499055629,
"learning_rate": 1.5022861550727261e-05,
"loss": 0.2753,
"step": 756
},
{
"epoch": 3.1120246659815005,
"grad_norm": 0.23884760385788692,
"learning_rate": 1.4967204987071916e-05,
"loss": 0.2674,
"step": 757
},
{
"epoch": 3.1161356628982526,
"grad_norm": 0.24427494625864407,
"learning_rate": 1.491159000187318e-05,
"loss": 0.2766,
"step": 758
},
{
"epoch": 3.120246659815005,
"grad_norm": 0.22462887698775066,
"learning_rate": 1.4856017054594487e-05,
"loss": 0.2817,
"step": 759
},
{
"epoch": 3.1243576567317572,
"grad_norm": 0.22935229598945833,
"learning_rate": 1.4800486604351953e-05,
"loss": 0.2692,
"step": 760
},
{
"epoch": 3.12846865364851,
"grad_norm": 0.2283641464528615,
"learning_rate": 1.4744999109910642e-05,
"loss": 0.2881,
"step": 761
},
{
"epoch": 3.132579650565262,
"grad_norm": 0.2402319884401938,
"learning_rate": 1.4689555029680706e-05,
"loss": 0.2811,
"step": 762
},
{
"epoch": 3.1366906474820144,
"grad_norm": 0.24689907618158027,
"learning_rate": 1.4634154821713642e-05,
"loss": 0.2748,
"step": 763
},
{
"epoch": 3.1408016443987665,
"grad_norm": 0.21908840749036268,
"learning_rate": 1.4578798943698495e-05,
"loss": 0.2775,
"step": 764
},
{
"epoch": 3.144912641315519,
"grad_norm": 0.2706839692520901,
"learning_rate": 1.4523487852958078e-05,
"loss": 0.274,
"step": 765
},
{
"epoch": 3.149023638232271,
"grad_norm": 0.21829989516446477,
"learning_rate": 1.4468222006445194e-05,
"loss": 0.2846,
"step": 766
},
{
"epoch": 3.1531346351490237,
"grad_norm": 0.254462615428386,
"learning_rate": 1.4413001860738857e-05,
"loss": 0.2751,
"step": 767
},
{
"epoch": 3.157245632065776,
"grad_norm": 0.22996784925457855,
"learning_rate": 1.4357827872040533e-05,
"loss": 0.2763,
"step": 768
},
{
"epoch": 3.1613566289825283,
"grad_norm": 0.24916315195392996,
"learning_rate": 1.4302700496170348e-05,
"loss": 0.273,
"step": 769
},
{
"epoch": 3.1654676258992804,
"grad_norm": 0.2394151755505642,
"learning_rate": 1.424762018856335e-05,
"loss": 0.2733,
"step": 770
},
{
"epoch": 3.169578622816033,
"grad_norm": 0.24554198081740938,
"learning_rate": 1.4192587404265723e-05,
"loss": 0.2739,
"step": 771
},
{
"epoch": 3.173689619732785,
"grad_norm": 0.23659011712793626,
"learning_rate": 1.4137602597931039e-05,
"loss": 0.2819,
"step": 772
},
{
"epoch": 3.1778006166495376,
"grad_norm": 0.22564337444058383,
"learning_rate": 1.4082666223816503e-05,
"loss": 0.2792,
"step": 773
},
{
"epoch": 3.1819116135662897,
"grad_norm": 0.2317146590014487,
"learning_rate": 1.4027778735779194e-05,
"loss": 0.2777,
"step": 774
},
{
"epoch": 3.1860226104830422,
"grad_norm": 0.2150695765539657,
"learning_rate": 1.397294058727232e-05,
"loss": 0.2765,
"step": 775
},
{
"epoch": 3.1901336073997943,
"grad_norm": 0.23401579346325868,
"learning_rate": 1.3918152231341466e-05,
"loss": 0.2859,
"step": 776
},
{
"epoch": 3.194244604316547,
"grad_norm": 0.22190869981693315,
"learning_rate": 1.3863414120620866e-05,
"loss": 0.2739,
"step": 777
},
{
"epoch": 3.198355601233299,
"grad_norm": 0.24505629782483931,
"learning_rate": 1.3808726707329636e-05,
"loss": 0.2854,
"step": 778
},
{
"epoch": 3.2024665981500515,
"grad_norm": 0.23527894624102066,
"learning_rate": 1.3754090443268073e-05,
"loss": 0.2739,
"step": 779
},
{
"epoch": 3.2065775950668036,
"grad_norm": 0.250451185169838,
"learning_rate": 1.3699505779813885e-05,
"loss": 0.2779,
"step": 780
},
{
"epoch": 3.210688591983556,
"grad_norm": 0.24199976474098944,
"learning_rate": 1.3644973167918509e-05,
"loss": 0.2819,
"step": 781
},
{
"epoch": 3.2147995889003083,
"grad_norm": 0.2295279753606739,
"learning_rate": 1.3590493058103334e-05,
"loss": 0.2912,
"step": 782
},
{
"epoch": 3.218910585817061,
"grad_norm": 0.24479637435880175,
"learning_rate": 1.353606590045601e-05,
"loss": 0.2625,
"step": 783
},
{
"epoch": 3.223021582733813,
"grad_norm": 0.22366855597040158,
"learning_rate": 1.3481692144626723e-05,
"loss": 0.2716,
"step": 784
},
{
"epoch": 3.2271325796505654,
"grad_norm": 0.23386804780243653,
"learning_rate": 1.3427372239824478e-05,
"loss": 0.2833,
"step": 785
},
{
"epoch": 3.2312435765673175,
"grad_norm": 0.21138476617701588,
"learning_rate": 1.3373106634813395e-05,
"loss": 0.2815,
"step": 786
},
{
"epoch": 3.23535457348407,
"grad_norm": 0.21868921231797736,
"learning_rate": 1.3318895777908989e-05,
"loss": 0.2737,
"step": 787
},
{
"epoch": 3.239465570400822,
"grad_norm": 0.22091301457511603,
"learning_rate": 1.3264740116974477e-05,
"loss": 0.2784,
"step": 788
},
{
"epoch": 3.2435765673175747,
"grad_norm": 0.21177976498652176,
"learning_rate": 1.3210640099417071e-05,
"loss": 0.2677,
"step": 789
},
{
"epoch": 3.247687564234327,
"grad_norm": 0.22075747796505304,
"learning_rate": 1.3156596172184291e-05,
"loss": 0.2843,
"step": 790
},
{
"epoch": 3.2517985611510793,
"grad_norm": 0.2093957071643158,
"learning_rate": 1.3102608781760262e-05,
"loss": 0.2783,
"step": 791
},
{
"epoch": 3.2559095580678314,
"grad_norm": 0.2502487297475507,
"learning_rate": 1.3048678374162033e-05,
"loss": 0.2764,
"step": 792
},
{
"epoch": 3.2600205549845835,
"grad_norm": 0.21828424241121014,
"learning_rate": 1.2994805394935883e-05,
"loss": 0.2783,
"step": 793
},
{
"epoch": 3.264131551901336,
"grad_norm": 0.2225059796962467,
"learning_rate": 1.2940990289153654e-05,
"loss": 0.2818,
"step": 794
},
{
"epoch": 3.2682425488180886,
"grad_norm": 0.2384933128418085,
"learning_rate": 1.2887233501409062e-05,
"loss": 0.2715,
"step": 795
},
{
"epoch": 3.2723535457348407,
"grad_norm": 0.23881527823885554,
"learning_rate": 1.283353547581403e-05,
"loss": 0.2815,
"step": 796
},
{
"epoch": 3.276464542651593,
"grad_norm": 0.21990598524463273,
"learning_rate": 1.2779896655995012e-05,
"loss": 0.2649,
"step": 797
},
{
"epoch": 3.2805755395683454,
"grad_norm": 0.22014058980246703,
"learning_rate": 1.2726317485089345e-05,
"loss": 0.2857,
"step": 798
},
{
"epoch": 3.2846865364850975,
"grad_norm": 0.2128741761301097,
"learning_rate": 1.2672798405741565e-05,
"loss": 0.2744,
"step": 799
},
{
"epoch": 3.28879753340185,
"grad_norm": 0.2093902141995586,
"learning_rate": 1.261933986009976e-05,
"loss": 0.2714,
"step": 800
},
{
"epoch": 3.292908530318602,
"grad_norm": 0.2317393277270657,
"learning_rate": 1.2565942289811926e-05,
"loss": 0.2821,
"step": 801
},
{
"epoch": 3.2970195272353546,
"grad_norm": 0.2124614991624517,
"learning_rate": 1.2512606136022316e-05,
"loss": 0.2684,
"step": 802
},
{
"epoch": 3.3011305241521067,
"grad_norm": 0.2237964393839327,
"learning_rate": 1.245933183936778e-05,
"loss": 0.28,
"step": 803
},
{
"epoch": 3.3052415210688593,
"grad_norm": 0.20179137845865386,
"learning_rate": 1.2406119839974137e-05,
"loss": 0.2791,
"step": 804
},
{
"epoch": 3.3093525179856114,
"grad_norm": 0.21344599872935055,
"learning_rate": 1.2352970577452536e-05,
"loss": 0.282,
"step": 805
},
{
"epoch": 3.313463514902364,
"grad_norm": 0.21405309956045562,
"learning_rate": 1.2299884490895829e-05,
"loss": 0.2705,
"step": 806
},
{
"epoch": 3.317574511819116,
"grad_norm": 0.20836540998453448,
"learning_rate": 1.2246862018874937e-05,
"loss": 0.2675,
"step": 807
},
{
"epoch": 3.3216855087358685,
"grad_norm": 0.21917814502090704,
"learning_rate": 1.2193903599435229e-05,
"loss": 0.2867,
"step": 808
},
{
"epoch": 3.3257965056526206,
"grad_norm": 0.21478503443145303,
"learning_rate": 1.2141009670092905e-05,
"loss": 0.263,
"step": 809
},
{
"epoch": 3.329907502569373,
"grad_norm": 0.24017325608140172,
"learning_rate": 1.2088180667831378e-05,
"loss": 0.285,
"step": 810
},
{
"epoch": 3.3340184994861253,
"grad_norm": 0.21263315635103802,
"learning_rate": 1.2035417029097669e-05,
"loss": 0.2794,
"step": 811
},
{
"epoch": 3.338129496402878,
"grad_norm": 0.2208436673519513,
"learning_rate": 1.198271918979879e-05,
"loss": 0.2661,
"step": 812
},
{
"epoch": 3.34224049331963,
"grad_norm": 0.21410801362761014,
"learning_rate": 1.1930087585298163e-05,
"loss": 0.2691,
"step": 813
},
{
"epoch": 3.3463514902363825,
"grad_norm": 0.2189540505149734,
"learning_rate": 1.1877522650412002e-05,
"loss": 0.2777,
"step": 814
},
{
"epoch": 3.3504624871531345,
"grad_norm": 0.2235412920660751,
"learning_rate": 1.1825024819405728e-05,
"loss": 0.2829,
"step": 815
},
{
"epoch": 3.354573484069887,
"grad_norm": 0.22891833469755685,
"learning_rate": 1.177259452599039e-05,
"loss": 0.2883,
"step": 816
},
{
"epoch": 3.358684480986639,
"grad_norm": 0.20951091444066108,
"learning_rate": 1.1720232203319072e-05,
"loss": 0.2703,
"step": 817
},
{
"epoch": 3.3627954779033917,
"grad_norm": 0.2291000642315933,
"learning_rate": 1.1667938283983318e-05,
"loss": 0.2818,
"step": 818
},
{
"epoch": 3.366906474820144,
"grad_norm": 0.24820675241373585,
"learning_rate": 1.1615713200009555e-05,
"loss": 0.2894,
"step": 819
},
{
"epoch": 3.3710174717368964,
"grad_norm": 0.2112186174561992,
"learning_rate": 1.1563557382855527e-05,
"loss": 0.2765,
"step": 820
},
{
"epoch": 3.3751284686536485,
"grad_norm": 0.23516315367694957,
"learning_rate": 1.1511471263406727e-05,
"loss": 0.2783,
"step": 821
},
{
"epoch": 3.379239465570401,
"grad_norm": 0.20429288664608256,
"learning_rate": 1.1459455271972855e-05,
"loss": 0.2826,
"step": 822
},
{
"epoch": 3.383350462487153,
"grad_norm": 0.22120456160119745,
"learning_rate": 1.1407509838284234e-05,
"loss": 0.2702,
"step": 823
},
{
"epoch": 3.3874614594039056,
"grad_norm": 0.22196158784290934,
"learning_rate": 1.1355635391488273e-05,
"loss": 0.2816,
"step": 824
},
{
"epoch": 3.3915724563206577,
"grad_norm": 0.23198563181248005,
"learning_rate": 1.130383236014593e-05,
"loss": 0.2807,
"step": 825
},
{
"epoch": 3.3956834532374103,
"grad_norm": 0.21328959797566183,
"learning_rate": 1.1252101172228161e-05,
"loss": 0.2812,
"step": 826
},
{
"epoch": 3.3997944501541624,
"grad_norm": 0.20829955768200162,
"learning_rate": 1.1200442255112382e-05,
"loss": 0.2781,
"step": 827
},
{
"epoch": 3.4039054470709145,
"grad_norm": 0.210021918847506,
"learning_rate": 1.1148856035578954e-05,
"loss": 0.2793,
"step": 828
},
{
"epoch": 3.408016443987667,
"grad_norm": 0.21953053255099053,
"learning_rate": 1.1097342939807639e-05,
"loss": 0.2826,
"step": 829
},
{
"epoch": 3.4121274409044196,
"grad_norm": 0.21197251533168365,
"learning_rate": 1.1045903393374088e-05,
"loss": 0.2678,
"step": 830
},
{
"epoch": 3.4162384378211716,
"grad_norm": 0.22402861818250405,
"learning_rate": 1.0994537821246322e-05,
"loss": 0.2768,
"step": 831
},
{
"epoch": 3.4203494347379237,
"grad_norm": 0.20866347607213415,
"learning_rate": 1.0943246647781231e-05,
"loss": 0.2822,
"step": 832
},
{
"epoch": 3.4244604316546763,
"grad_norm": 0.20588546575745492,
"learning_rate": 1.0892030296721053e-05,
"loss": 0.274,
"step": 833
},
{
"epoch": 3.4285714285714284,
"grad_norm": 0.21933896518445742,
"learning_rate": 1.0840889191189881e-05,
"loss": 0.2815,
"step": 834
},
{
"epoch": 3.432682425488181,
"grad_norm": 0.21986260521948456,
"learning_rate": 1.0789823753690165e-05,
"loss": 0.265,
"step": 835
},
{
"epoch": 3.436793422404933,
"grad_norm": 0.2472526595417136,
"learning_rate": 1.073883440609923e-05,
"loss": 0.2819,
"step": 836
},
{
"epoch": 3.4409044193216856,
"grad_norm": 0.23044008878105163,
"learning_rate": 1.0687921569665778e-05,
"loss": 0.2743,
"step": 837
},
{
"epoch": 3.4450154162384377,
"grad_norm": 0.2127401189830073,
"learning_rate": 1.0637085665006416e-05,
"loss": 0.2757,
"step": 838
},
{
"epoch": 3.44912641315519,
"grad_norm": 0.23011524871297998,
"learning_rate": 1.058632711210218e-05,
"loss": 0.2867,
"step": 839
},
{
"epoch": 3.4532374100719423,
"grad_norm": 0.2143448487687264,
"learning_rate": 1.0535646330295064e-05,
"loss": 0.2775,
"step": 840
},
{
"epoch": 3.457348406988695,
"grad_norm": 0.2157327739595805,
"learning_rate": 1.0485043738284543e-05,
"loss": 0.2772,
"step": 841
},
{
"epoch": 3.461459403905447,
"grad_norm": 0.21901388123050422,
"learning_rate": 1.0434519754124155e-05,
"loss": 0.2883,
"step": 842
},
{
"epoch": 3.4655704008221995,
"grad_norm": 0.20706260357694797,
"learning_rate": 1.0384074795217995e-05,
"loss": 0.2729,
"step": 843
},
{
"epoch": 3.4696813977389516,
"grad_norm": 0.20562057118619545,
"learning_rate": 1.0333709278317295e-05,
"loss": 0.2794,
"step": 844
},
{
"epoch": 3.473792394655704,
"grad_norm": 0.2053621886084836,
"learning_rate": 1.0283423619516984e-05,
"loss": 0.2831,
"step": 845
},
{
"epoch": 3.477903391572456,
"grad_norm": 0.21559967106224392,
"learning_rate": 1.0233218234252233e-05,
"loss": 0.2798,
"step": 846
},
{
"epoch": 3.4820143884892087,
"grad_norm": 0.20693716417643127,
"learning_rate": 1.0183093537295038e-05,
"loss": 0.2834,
"step": 847
},
{
"epoch": 3.486125385405961,
"grad_norm": 0.196045141198551,
"learning_rate": 1.0133049942750794e-05,
"loss": 0.2815,
"step": 848
},
{
"epoch": 3.4902363823227134,
"grad_norm": 0.22184037078133786,
"learning_rate": 1.0083087864054862e-05,
"loss": 0.2782,
"step": 849
},
{
"epoch": 3.4943473792394655,
"grad_norm": 0.21680925373572774,
"learning_rate": 1.0033207713969152e-05,
"loss": 0.2668,
"step": 850
},
{
"epoch": 3.498458376156218,
"grad_norm": 0.20929159215700033,
"learning_rate": 9.983409904578732e-06,
"loss": 0.2771,
"step": 851
},
{
"epoch": 3.50256937307297,
"grad_norm": 0.20085747960075442,
"learning_rate": 9.93369484728841e-06,
"loss": 0.2769,
"step": 852
},
{
"epoch": 3.5066803699897227,
"grad_norm": 0.21740453796251422,
"learning_rate": 9.884062952819336e-06,
"loss": 0.2809,
"step": 853
},
{
"epoch": 3.5107913669064748,
"grad_norm": 0.19826634602771384,
"learning_rate": 9.834514631205607e-06,
"loss": 0.2826,
"step": 854
},
{
"epoch": 3.5149023638232273,
"grad_norm": 0.19929040918628962,
"learning_rate": 9.785050291790886e-06,
"loss": 0.27,
"step": 855
},
{
"epoch": 3.5190133607399794,
"grad_norm": 0.20544687803262818,
"learning_rate": 9.735670343225015e-06,
"loss": 0.2759,
"step": 856
},
{
"epoch": 3.523124357656732,
"grad_norm": 0.20100075338402584,
"learning_rate": 9.68637519346064e-06,
"loss": 0.2842,
"step": 857
},
{
"epoch": 3.527235354573484,
"grad_norm": 0.19998157251828666,
"learning_rate": 9.637165249749847e-06,
"loss": 0.2677,
"step": 858
},
{
"epoch": 3.531346351490236,
"grad_norm": 0.20946212814759255,
"learning_rate": 9.588040918640784e-06,
"loss": 0.2819,
"step": 859
},
{
"epoch": 3.5354573484069887,
"grad_norm": 0.19305869769870324,
"learning_rate": 9.539002605974315e-06,
"loss": 0.2762,
"step": 860
},
{
"epoch": 3.539568345323741,
"grad_norm": 0.22246584009743214,
"learning_rate": 9.490050716880652e-06,
"loss": 0.2761,
"step": 861
},
{
"epoch": 3.5436793422404933,
"grad_norm": 0.2106791686837925,
"learning_rate": 9.441185655776044e-06,
"loss": 0.2836,
"step": 862
},
{
"epoch": 3.5477903391572454,
"grad_norm": 0.20735417375234855,
"learning_rate": 9.392407826359386e-06,
"loss": 0.2797,
"step": 863
},
{
"epoch": 3.551901336073998,
"grad_norm": 0.22319834142117814,
"learning_rate": 9.343717631608913e-06,
"loss": 0.2805,
"step": 864
},
{
"epoch": 3.5560123329907505,
"grad_norm": 0.21387661139677305,
"learning_rate": 9.295115473778871e-06,
"loss": 0.2737,
"step": 865
},
{
"epoch": 3.5601233299075026,
"grad_norm": 0.19614268534753534,
"learning_rate": 9.246601754396184e-06,
"loss": 0.2775,
"step": 866
},
{
"epoch": 3.5642343268242547,
"grad_norm": 0.21426258046660832,
"learning_rate": 9.198176874257147e-06,
"loss": 0.2801,
"step": 867
},
{
"epoch": 3.568345323741007,
"grad_norm": 0.20833350511079968,
"learning_rate": 9.149841233424102e-06,
"loss": 0.2903,
"step": 868
},
{
"epoch": 3.5724563206577598,
"grad_norm": 0.2054636105867438,
"learning_rate": 9.101595231222142e-06,
"loss": 0.2714,
"step": 869
},
{
"epoch": 3.576567317574512,
"grad_norm": 0.21677433378750463,
"learning_rate": 9.053439266235817e-06,
"loss": 0.2747,
"step": 870
},
{
"epoch": 3.580678314491264,
"grad_norm": 0.21258620908795176,
"learning_rate": 9.005373736305827e-06,
"loss": 0.2866,
"step": 871
},
{
"epoch": 3.5847893114080165,
"grad_norm": 0.21696485235415786,
"learning_rate": 8.957399038525742e-06,
"loss": 0.2768,
"step": 872
},
{
"epoch": 3.588900308324769,
"grad_norm": 0.2048645071808934,
"learning_rate": 8.909515569238727e-06,
"loss": 0.2805,
"step": 873
},
{
"epoch": 3.593011305241521,
"grad_norm": 0.2041872125610518,
"learning_rate": 8.861723724034256e-06,
"loss": 0.281,
"step": 874
},
{
"epoch": 3.597122302158273,
"grad_norm": 0.22105486900940344,
"learning_rate": 8.814023897744861e-06,
"loss": 0.2722,
"step": 875
},
{
"epoch": 3.6012332990750258,
"grad_norm": 0.20870597023983126,
"learning_rate": 8.766416484442845e-06,
"loss": 0.288,
"step": 876
},
{
"epoch": 3.605344295991778,
"grad_norm": 0.22305369395665908,
"learning_rate": 8.71890187743705e-06,
"loss": 0.2833,
"step": 877
},
{
"epoch": 3.6094552929085304,
"grad_norm": 0.20984704638631244,
"learning_rate": 8.6714804692696e-06,
"loss": 0.2815,
"step": 878
},
{
"epoch": 3.6135662898252825,
"grad_norm": 0.205661449222605,
"learning_rate": 8.624152651712647e-06,
"loss": 0.2796,
"step": 879
},
{
"epoch": 3.617677286742035,
"grad_norm": 0.21670437077691945,
"learning_rate": 8.576918815765155e-06,
"loss": 0.276,
"step": 880
},
{
"epoch": 3.621788283658787,
"grad_norm": 0.21657223103082457,
"learning_rate": 8.52977935164965e-06,
"loss": 0.2793,
"step": 881
},
{
"epoch": 3.6258992805755397,
"grad_norm": 0.19481374974543536,
"learning_rate": 8.482734648808998e-06,
"loss": 0.2828,
"step": 882
},
{
"epoch": 3.6300102774922918,
"grad_norm": 0.22945439732292053,
"learning_rate": 8.435785095903226e-06,
"loss": 0.2767,
"step": 883
},
{
"epoch": 3.6341212744090443,
"grad_norm": 0.21026587275904124,
"learning_rate": 8.388931080806244e-06,
"loss": 0.277,
"step": 884
},
{
"epoch": 3.6382322713257964,
"grad_norm": 0.1905115883548281,
"learning_rate": 8.342172990602692e-06,
"loss": 0.2743,
"step": 885
},
{
"epoch": 3.642343268242549,
"grad_norm": 0.21168502442048126,
"learning_rate": 8.295511211584726e-06,
"loss": 0.2684,
"step": 886
},
{
"epoch": 3.646454265159301,
"grad_norm": 0.21498006850055293,
"learning_rate": 8.248946129248821e-06,
"loss": 0.2762,
"step": 887
},
{
"epoch": 3.6505652620760536,
"grad_norm": 0.19489762757982362,
"learning_rate": 8.202478128292594e-06,
"loss": 0.279,
"step": 888
},
{
"epoch": 3.6546762589928057,
"grad_norm": 0.21734478601458554,
"learning_rate": 8.15610759261163e-06,
"loss": 0.2743,
"step": 889
},
{
"epoch": 3.6587872559095582,
"grad_norm": 0.2067502254964237,
"learning_rate": 8.109834905296296e-06,
"loss": 0.2687,
"step": 890
},
{
"epoch": 3.6628982528263103,
"grad_norm": 0.20837277018256964,
"learning_rate": 8.06366044862859e-06,
"loss": 0.2776,
"step": 891
},
{
"epoch": 3.667009249743063,
"grad_norm": 0.2024301743577271,
"learning_rate": 8.017584604078974e-06,
"loss": 0.2801,
"step": 892
},
{
"epoch": 3.671120246659815,
"grad_norm": 0.21530744818182257,
"learning_rate": 7.971607752303226e-06,
"loss": 0.28,
"step": 893
},
{
"epoch": 3.675231243576567,
"grad_norm": 0.2139811361890938,
"learning_rate": 7.925730273139294e-06,
"loss": 0.2712,
"step": 894
},
{
"epoch": 3.6793422404933196,
"grad_norm": 0.20799988041239068,
"learning_rate": 7.879952545604163e-06,
"loss": 0.2926,
"step": 895
},
{
"epoch": 3.683453237410072,
"grad_norm": 0.20418864938595824,
"learning_rate": 7.834274947890715e-06,
"loss": 0.2798,
"step": 896
},
{
"epoch": 3.6875642343268242,
"grad_norm": 0.20416263025450562,
"learning_rate": 7.78869785736461e-06,
"loss": 0.2694,
"step": 897
},
{
"epoch": 3.6916752312435763,
"grad_norm": 0.19066134679044647,
"learning_rate": 7.74322165056117e-06,
"loss": 0.2667,
"step": 898
},
{
"epoch": 3.695786228160329,
"grad_norm": 0.2121667278903765,
"learning_rate": 7.697846703182262e-06,
"loss": 0.2784,
"step": 899
},
{
"epoch": 3.6998972250770814,
"grad_norm": 0.2071705071263635,
"learning_rate": 7.652573390093199e-06,
"loss": 0.285,
"step": 900
},
{
"epoch": 3.7040082219938335,
"grad_norm": 0.2035973761053005,
"learning_rate": 7.607402085319644e-06,
"loss": 0.2759,
"step": 901
},
{
"epoch": 3.7081192189105856,
"grad_norm": 0.2083326930999411,
"learning_rate": 7.562333162044508e-06,
"loss": 0.2775,
"step": 902
},
{
"epoch": 3.712230215827338,
"grad_norm": 0.21563075052521988,
"learning_rate": 7.517366992604902e-06,
"loss": 0.2767,
"step": 903
},
{
"epoch": 3.7163412127440907,
"grad_norm": 0.20432779262539,
"learning_rate": 7.4725039484890094e-06,
"loss": 0.2874,
"step": 904
},
{
"epoch": 3.720452209660843,
"grad_norm": 0.2047844251053815,
"learning_rate": 7.427744400333053e-06,
"loss": 0.2789,
"step": 905
},
{
"epoch": 3.724563206577595,
"grad_norm": 0.2055231569256932,
"learning_rate": 7.383088717918223e-06,
"loss": 0.2748,
"step": 906
},
{
"epoch": 3.7286742034943474,
"grad_norm": 0.20467879963763858,
"learning_rate": 7.338537270167625e-06,
"loss": 0.277,
"step": 907
},
{
"epoch": 3.7327852004111,
"grad_norm": 0.21544746620927177,
"learning_rate": 7.294090425143225e-06,
"loss": 0.273,
"step": 908
},
{
"epoch": 3.736896197327852,
"grad_norm": 0.2000666684512926,
"learning_rate": 7.249748550042817e-06,
"loss": 0.2806,
"step": 909
},
{
"epoch": 3.741007194244604,
"grad_norm": 0.20770589378766816,
"learning_rate": 7.20551201119698e-06,
"loss": 0.2705,
"step": 910
},
{
"epoch": 3.7451181911613567,
"grad_norm": 0.20437780757014407,
"learning_rate": 7.161381174066065e-06,
"loss": 0.2829,
"step": 911
},
{
"epoch": 3.749229188078109,
"grad_norm": 0.19567720371080252,
"learning_rate": 7.117356403237161e-06,
"loss": 0.2813,
"step": 912
},
{
"epoch": 3.7533401849948613,
"grad_norm": 0.19171574936304334,
"learning_rate": 7.073438062421094e-06,
"loss": 0.2782,
"step": 913
},
{
"epoch": 3.7574511819116134,
"grad_norm": 0.20924848866916773,
"learning_rate": 7.029626514449414e-06,
"loss": 0.27,
"step": 914
},
{
"epoch": 3.761562178828366,
"grad_norm": 0.20438696705099926,
"learning_rate": 6.985922121271409e-06,
"loss": 0.2728,
"step": 915
},
{
"epoch": 3.765673175745118,
"grad_norm": 0.2084495335702813,
"learning_rate": 6.942325243951098e-06,
"loss": 0.2824,
"step": 916
},
{
"epoch": 3.7697841726618706,
"grad_norm": 0.1993990523612008,
"learning_rate": 6.898836242664262e-06,
"loss": 0.282,
"step": 917
},
{
"epoch": 3.7738951695786227,
"grad_norm": 0.19347775656849484,
"learning_rate": 6.855455476695465e-06,
"loss": 0.2706,
"step": 918
},
{
"epoch": 3.7780061664953752,
"grad_norm": 0.20109622486576145,
"learning_rate": 6.812183304435083e-06,
"loss": 0.2801,
"step": 919
},
{
"epoch": 3.7821171634121273,
"grad_norm": 0.18886838722656143,
"learning_rate": 6.769020083376341e-06,
"loss": 0.2721,
"step": 920
},
{
"epoch": 3.78622816032888,
"grad_norm": 0.208430820513582,
"learning_rate": 6.725966170112368e-06,
"loss": 0.2686,
"step": 921
},
{
"epoch": 3.790339157245632,
"grad_norm": 0.1967578418393911,
"learning_rate": 6.6830219203332415e-06,
"loss": 0.2721,
"step": 922
},
{
"epoch": 3.7944501541623845,
"grad_norm": 0.2015892872246403,
"learning_rate": 6.640187688823065e-06,
"loss": 0.2792,
"step": 923
},
{
"epoch": 3.7985611510791366,
"grad_norm": 0.1938822600108583,
"learning_rate": 6.597463829457014e-06,
"loss": 0.2799,
"step": 924
},
{
"epoch": 3.802672147995889,
"grad_norm": 0.2023587740694427,
"learning_rate": 6.554850695198427e-06,
"loss": 0.2695,
"step": 925
},
{
"epoch": 3.8067831449126412,
"grad_norm": 0.19570583847216003,
"learning_rate": 6.512348638095887e-06,
"loss": 0.2858,
"step": 926
},
{
"epoch": 3.810894141829394,
"grad_norm": 0.19738903231975544,
"learning_rate": 6.469958009280315e-06,
"loss": 0.2681,
"step": 927
},
{
"epoch": 3.815005138746146,
"grad_norm": 0.20083483818328293,
"learning_rate": 6.4276791589620595e-06,
"loss": 0.2852,
"step": 928
},
{
"epoch": 3.819116135662898,
"grad_norm": 0.19273874331489446,
"learning_rate": 6.385512436428021e-06,
"loss": 0.2864,
"step": 929
},
{
"epoch": 3.8232271325796505,
"grad_norm": 0.1869845010972472,
"learning_rate": 6.343458190038747e-06,
"loss": 0.2727,
"step": 930
},
{
"epoch": 3.827338129496403,
"grad_norm": 0.19346715289339741,
"learning_rate": 6.301516767225568e-06,
"loss": 0.2739,
"step": 931
},
{
"epoch": 3.831449126413155,
"grad_norm": 0.19227603993401987,
"learning_rate": 6.259688514487718e-06,
"loss": 0.2758,
"step": 932
},
{
"epoch": 3.8355601233299073,
"grad_norm": 0.20411187735886127,
"learning_rate": 6.217973777389483e-06,
"loss": 0.2761,
"step": 933
},
{
"epoch": 3.83967112024666,
"grad_norm": 0.18675711473098772,
"learning_rate": 6.1763729005573284e-06,
"loss": 0.2829,
"step": 934
},
{
"epoch": 3.8437821171634123,
"grad_norm": 0.2123802835671684,
"learning_rate": 6.134886227677073e-06,
"loss": 0.2922,
"step": 935
},
{
"epoch": 3.8478931140801644,
"grad_norm": 0.18956127541911397,
"learning_rate": 6.093514101491034e-06,
"loss": 0.2763,
"step": 936
},
{
"epoch": 3.8520041109969165,
"grad_norm": 0.18788309236848885,
"learning_rate": 6.052256863795198e-06,
"loss": 0.2711,
"step": 937
},
{
"epoch": 3.856115107913669,
"grad_norm": 0.19828249178491697,
"learning_rate": 6.0111148554364084e-06,
"loss": 0.2799,
"step": 938
},
{
"epoch": 3.8602261048304216,
"grad_norm": 0.18431610567167325,
"learning_rate": 5.970088416309532e-06,
"loss": 0.2689,
"step": 939
},
{
"epoch": 3.8643371017471737,
"grad_norm": 0.21004802063561837,
"learning_rate": 5.929177885354665e-06,
"loss": 0.279,
"step": 940
},
{
"epoch": 3.868448098663926,
"grad_norm": 0.18145712447424242,
"learning_rate": 5.888383600554326e-06,
"loss": 0.2769,
"step": 941
},
{
"epoch": 3.8725590955806783,
"grad_norm": 0.1998489072868665,
"learning_rate": 5.8477058989306605e-06,
"loss": 0.2902,
"step": 942
},
{
"epoch": 3.876670092497431,
"grad_norm": 0.19349791063075825,
"learning_rate": 5.807145116542678e-06,
"loss": 0.2772,
"step": 943
},
{
"epoch": 3.880781089414183,
"grad_norm": 0.20224120336775228,
"learning_rate": 5.766701588483443e-06,
"loss": 0.2766,
"step": 944
},
{
"epoch": 3.884892086330935,
"grad_norm": 0.20201558369754713,
"learning_rate": 5.726375648877329e-06,
"loss": 0.2711,
"step": 945
},
{
"epoch": 3.8890030832476876,
"grad_norm": 0.186362787594006,
"learning_rate": 5.68616763087725e-06,
"loss": 0.2637,
"step": 946
},
{
"epoch": 3.8931140801644397,
"grad_norm": 0.18827723220330278,
"learning_rate": 5.646077866661912e-06,
"loss": 0.2728,
"step": 947
},
{
"epoch": 3.8972250770811923,
"grad_norm": 0.20621122766057245,
"learning_rate": 5.606106687433066e-06,
"loss": 0.277,
"step": 948
},
{
"epoch": 3.9013360739979444,
"grad_norm": 0.1997165387359167,
"learning_rate": 5.5662544234127735e-06,
"loss": 0.2852,
"step": 949
},
{
"epoch": 3.905447070914697,
"grad_norm": 0.1986176597393475,
"learning_rate": 5.526521403840677e-06,
"loss": 0.2724,
"step": 950
},
{
"epoch": 3.909558067831449,
"grad_norm": 0.19315083170854766,
"learning_rate": 5.486907956971277e-06,
"loss": 0.2654,
"step": 951
},
{
"epoch": 3.9136690647482015,
"grad_norm": 0.19208269826966257,
"learning_rate": 5.447414410071232e-06,
"loss": 0.28,
"step": 952
},
{
"epoch": 3.9177800616649536,
"grad_norm": 0.1986061425594109,
"learning_rate": 5.40804108941664e-06,
"loss": 0.2809,
"step": 953
},
{
"epoch": 3.921891058581706,
"grad_norm": 0.18060496237659096,
"learning_rate": 5.36878832029035e-06,
"loss": 0.2753,
"step": 954
},
{
"epoch": 3.9260020554984583,
"grad_norm": 0.19007144815119342,
"learning_rate": 5.329656426979275e-06,
"loss": 0.2844,
"step": 955
},
{
"epoch": 3.930113052415211,
"grad_norm": 0.18228170358892676,
"learning_rate": 5.290645732771711e-06,
"loss": 0.2776,
"step": 956
},
{
"epoch": 3.934224049331963,
"grad_norm": 0.20611317253574513,
"learning_rate": 5.251756559954668e-06,
"loss": 0.2752,
"step": 957
},
{
"epoch": 3.9383350462487154,
"grad_norm": 0.19496510102086326,
"learning_rate": 5.212989229811209e-06,
"loss": 0.2703,
"step": 958
},
{
"epoch": 3.9424460431654675,
"grad_norm": 0.18813827312165923,
"learning_rate": 5.174344062617789e-06,
"loss": 0.2817,
"step": 959
},
{
"epoch": 3.94655704008222,
"grad_norm": 0.19091427031439173,
"learning_rate": 5.135821377641616e-06,
"loss": 0.2787,
"step": 960
},
{
"epoch": 3.950668036998972,
"grad_norm": 0.1901592123123516,
"learning_rate": 5.097421493138008e-06,
"loss": 0.2766,
"step": 961
},
{
"epoch": 3.9547790339157247,
"grad_norm": 0.1870098631363826,
"learning_rate": 5.059144726347765e-06,
"loss": 0.2728,
"step": 962
},
{
"epoch": 3.958890030832477,
"grad_norm": 0.17796954931972553,
"learning_rate": 5.020991393494558e-06,
"loss": 0.2867,
"step": 963
},
{
"epoch": 3.963001027749229,
"grad_norm": 0.19046713852280395,
"learning_rate": 4.9829618097823055e-06,
"loss": 0.2675,
"step": 964
},
{
"epoch": 3.9671120246659815,
"grad_norm": 0.19367792434634498,
"learning_rate": 4.945056289392565e-06,
"loss": 0.2765,
"step": 965
},
{
"epoch": 3.971223021582734,
"grad_norm": 0.18974765427392373,
"learning_rate": 4.907275145481947e-06,
"loss": 0.2731,
"step": 966
},
{
"epoch": 3.975334018499486,
"grad_norm": 0.18889755922787974,
"learning_rate": 4.8696186901795275e-06,
"loss": 0.2817,
"step": 967
},
{
"epoch": 3.979445015416238,
"grad_norm": 0.19028199023394596,
"learning_rate": 4.832087234584266e-06,
"loss": 0.2783,
"step": 968
},
{
"epoch": 3.9835560123329907,
"grad_norm": 0.1964825876876656,
"learning_rate": 4.794681088762438e-06,
"loss": 0.2744,
"step": 969
},
{
"epoch": 3.9876670092497433,
"grad_norm": 0.17957398832039587,
"learning_rate": 4.757400561745069e-06,
"loss": 0.2762,
"step": 970
},
{
"epoch": 3.9917780061664954,
"grad_norm": 0.20987505932024647,
"learning_rate": 4.720245961525387e-06,
"loss": 0.2949,
"step": 971
},
{
"epoch": 3.9958890030832475,
"grad_norm": 0.18879687589648914,
"learning_rate": 4.683217595056275e-06,
"loss": 0.2746,
"step": 972
},
{
"epoch": 4.0,
"grad_norm": 1.707156689602904,
"learning_rate": 4.646315768247731e-06,
"loss": 0.2868,
"step": 973
},
{
"epoch": 4.0041109969167525,
"grad_norm": 0.3673275720964706,
"learning_rate": 4.609540785964348e-06,
"loss": 0.2379,
"step": 974
},
{
"epoch": 4.008221993833504,
"grad_norm": 0.26013071708722996,
"learning_rate": 4.572892952022796e-06,
"loss": 0.2495,
"step": 975
},
{
"epoch": 4.012332990750257,
"grad_norm": 0.30039166221512403,
"learning_rate": 4.5363725691893045e-06,
"loss": 0.2434,
"step": 976
},
{
"epoch": 4.016443987667009,
"grad_norm": 0.40331206801802966,
"learning_rate": 4.499979939177164e-06,
"loss": 0.2413,
"step": 977
},
{
"epoch": 4.020554984583762,
"grad_norm": 0.2653915725640132,
"learning_rate": 4.463715362644239e-06,
"loss": 0.2415,
"step": 978
},
{
"epoch": 4.0246659815005135,
"grad_norm": 0.2706794398843468,
"learning_rate": 4.427579139190474e-06,
"loss": 0.2353,
"step": 979
},
{
"epoch": 4.028776978417266,
"grad_norm": 0.33800513453404296,
"learning_rate": 4.391571567355428e-06,
"loss": 0.244,
"step": 980
},
{
"epoch": 4.0328879753340185,
"grad_norm": 0.2848868937309266,
"learning_rate": 4.355692944615806e-06,
"loss": 0.2446,
"step": 981
},
{
"epoch": 4.036998972250771,
"grad_norm": 0.213052312700043,
"learning_rate": 4.319943567382991e-06,
"loss": 0.2446,
"step": 982
},
{
"epoch": 4.041109969167523,
"grad_norm": 0.24448300665475436,
"learning_rate": 4.28432373100061e-06,
"loss": 0.2383,
"step": 983
},
{
"epoch": 4.045220966084275,
"grad_norm": 0.28289541109409083,
"learning_rate": 4.248833729742095e-06,
"loss": 0.2335,
"step": 984
},
{
"epoch": 4.049331963001028,
"grad_norm": 0.27075279957678594,
"learning_rate": 4.2134738568082325e-06,
"loss": 0.2388,
"step": 985
},
{
"epoch": 4.05344295991778,
"grad_norm": 0.2271083193598205,
"learning_rate": 4.1782444043247565e-06,
"loss": 0.2386,
"step": 986
},
{
"epoch": 4.057553956834532,
"grad_norm": 0.22324730717439883,
"learning_rate": 4.143145663339932e-06,
"loss": 0.2447,
"step": 987
},
{
"epoch": 4.061664953751285,
"grad_norm": 0.26100760343340185,
"learning_rate": 4.108177923822154e-06,
"loss": 0.2426,
"step": 988
},
{
"epoch": 4.065775950668037,
"grad_norm": 0.23257567018511596,
"learning_rate": 4.073341474657544e-06,
"loss": 0.2482,
"step": 989
},
{
"epoch": 4.06988694758479,
"grad_norm": 0.1994071326027501,
"learning_rate": 4.03863660364757e-06,
"loss": 0.2389,
"step": 990
},
{
"epoch": 4.073997944501541,
"grad_norm": 0.21371643270197568,
"learning_rate": 4.004063597506664e-06,
"loss": 0.2337,
"step": 991
},
{
"epoch": 4.078108941418294,
"grad_norm": 0.24512669596399653,
"learning_rate": 3.969622741859862e-06,
"loss": 0.2477,
"step": 992
},
{
"epoch": 4.082219938335046,
"grad_norm": 0.21744045295237915,
"learning_rate": 3.935314321240433e-06,
"loss": 0.2405,
"step": 993
},
{
"epoch": 4.086330935251799,
"grad_norm": 0.20192278557379797,
"learning_rate": 3.90113861908753e-06,
"loss": 0.2394,
"step": 994
},
{
"epoch": 4.090441932168551,
"grad_norm": 0.2027471703666848,
"learning_rate": 3.867095917743862e-06,
"loss": 0.2326,
"step": 995
},
{
"epoch": 4.094552929085303,
"grad_norm": 0.20882580151186148,
"learning_rate": 3.8331864984533404e-06,
"loss": 0.2362,
"step": 996
},
{
"epoch": 4.098663926002056,
"grad_norm": 0.1930471416017011,
"learning_rate": 3.799410641358776e-06,
"loss": 0.2462,
"step": 997
},
{
"epoch": 4.102774922918808,
"grad_norm": 0.19859635746881463,
"learning_rate": 3.7657686254995483e-06,
"loss": 0.2404,
"step": 998
},
{
"epoch": 4.10688591983556,
"grad_norm": 0.1983957254871405,
"learning_rate": 3.7322607288093117e-06,
"loss": 0.2398,
"step": 999
},
{
"epoch": 4.110996916752312,
"grad_norm": 0.22293857279886048,
"learning_rate": 3.6988872281136855e-06,
"loss": 0.2363,
"step": 1000
},
{
"epoch": 4.115107913669065,
"grad_norm": 0.20443840443106004,
"learning_rate": 3.66564839912799e-06,
"loss": 0.2318,
"step": 1001
},
{
"epoch": 4.1192189105858175,
"grad_norm": 0.17966769630726293,
"learning_rate": 3.632544516454941e-06,
"loss": 0.2359,
"step": 1002
},
{
"epoch": 4.123329907502569,
"grad_norm": 0.19432549741475053,
"learning_rate": 3.5995758535823997e-06,
"loss": 0.2316,
"step": 1003
},
{
"epoch": 4.127440904419322,
"grad_norm": 0.18881014978005276,
"learning_rate": 3.566742682881119e-06,
"loss": 0.2608,
"step": 1004
},
{
"epoch": 4.131551901336074,
"grad_norm": 0.19088807670118796,
"learning_rate": 3.534045275602467e-06,
"loss": 0.242,
"step": 1005
},
{
"epoch": 4.135662898252827,
"grad_norm": 0.1816637262264018,
"learning_rate": 3.501483901876208e-06,
"loss": 0.244,
"step": 1006
},
{
"epoch": 4.139773895169578,
"grad_norm": 0.19010713069523394,
"learning_rate": 3.469058830708263e-06,
"loss": 0.2324,
"step": 1007
},
{
"epoch": 4.143884892086331,
"grad_norm": 0.19620537155899534,
"learning_rate": 3.436770329978494e-06,
"loss": 0.2481,
"step": 1008
},
{
"epoch": 4.1479958890030835,
"grad_norm": 0.18566900979279455,
"learning_rate": 3.4046186664384795e-06,
"loss": 0.2432,
"step": 1009
},
{
"epoch": 4.152106885919835,
"grad_norm": 0.1755700170371331,
"learning_rate": 3.3726041057093186e-06,
"loss": 0.2386,
"step": 1010
},
{
"epoch": 4.156217882836588,
"grad_norm": 0.18096902328410783,
"learning_rate": 3.3407269122794373e-06,
"loss": 0.2487,
"step": 1011
},
{
"epoch": 4.16032887975334,
"grad_norm": 0.192754387487128,
"learning_rate": 3.3089873495023995e-06,
"loss": 0.234,
"step": 1012
},
{
"epoch": 4.164439876670093,
"grad_norm": 0.19892387100550088,
"learning_rate": 3.2773856795947336e-06,
"loss": 0.2339,
"step": 1013
},
{
"epoch": 4.168550873586844,
"grad_norm": 0.18465157283491226,
"learning_rate": 3.2459221636337633e-06,
"loss": 0.2379,
"step": 1014
},
{
"epoch": 4.172661870503597,
"grad_norm": 0.1899662552430034,
"learning_rate": 3.214597061555458e-06,
"loss": 0.2292,
"step": 1015
},
{
"epoch": 4.1767728674203495,
"grad_norm": 0.18665807494909734,
"learning_rate": 3.1834106321522727e-06,
"loss": 0.2371,
"step": 1016
},
{
"epoch": 4.180883864337102,
"grad_norm": 0.1854509036542964,
"learning_rate": 3.152363133071024e-06,
"loss": 0.2433,
"step": 1017
},
{
"epoch": 4.184994861253854,
"grad_norm": 0.20338354606609246,
"learning_rate": 3.12145482081075e-06,
"loss": 0.2373,
"step": 1018
},
{
"epoch": 4.189105858170606,
"grad_norm": 0.1823424640205926,
"learning_rate": 3.0906859507206044e-06,
"loss": 0.2425,
"step": 1019
},
{
"epoch": 4.193216855087359,
"grad_norm": 0.18646817047228667,
"learning_rate": 3.0600567769977286e-06,
"loss": 0.2388,
"step": 1020
},
{
"epoch": 4.197327852004111,
"grad_norm": 0.19248044840190429,
"learning_rate": 3.0295675526851686e-06,
"loss": 0.2327,
"step": 1021
},
{
"epoch": 4.201438848920863,
"grad_norm": 0.1895682953006883,
"learning_rate": 2.9992185296697763e-06,
"loss": 0.2494,
"step": 1022
},
{
"epoch": 4.2055498458376155,
"grad_norm": 0.1775774161260345,
"learning_rate": 2.9690099586801223e-06,
"loss": 0.2431,
"step": 1023
},
{
"epoch": 4.209660842754368,
"grad_norm": 0.18688744320331976,
"learning_rate": 2.938942089284453e-06,
"loss": 0.2243,
"step": 1024
},
{
"epoch": 4.213771839671121,
"grad_norm": 0.18321913204838605,
"learning_rate": 2.909015169888587e-06,
"loss": 0.2361,
"step": 1025
},
{
"epoch": 4.217882836587872,
"grad_norm": 0.18558364928419416,
"learning_rate": 2.879229447733893e-06,
"loss": 0.2438,
"step": 1026
},
{
"epoch": 4.221993833504625,
"grad_norm": 0.18370056819501662,
"learning_rate": 2.849585168895237e-06,
"loss": 0.2372,
"step": 1027
},
{
"epoch": 4.226104830421377,
"grad_norm": 0.17922623411257754,
"learning_rate": 2.8200825782789466e-06,
"loss": 0.2389,
"step": 1028
},
{
"epoch": 4.23021582733813,
"grad_norm": 0.1814704060047799,
"learning_rate": 2.790721919620798e-06,
"loss": 0.2299,
"step": 1029
},
{
"epoch": 4.2343268242548815,
"grad_norm": 0.18999808738781843,
"learning_rate": 2.7615034354839942e-06,
"loss": 0.2346,
"step": 1030
},
{
"epoch": 4.238437821171634,
"grad_norm": 0.18017749013937312,
"learning_rate": 2.7324273672571577e-06,
"loss": 0.2337,
"step": 1031
},
{
"epoch": 4.242548818088387,
"grad_norm": 0.1799591693551389,
"learning_rate": 2.7034939551523476e-06,
"loss": 0.2439,
"step": 1032
},
{
"epoch": 4.246659815005139,
"grad_norm": 0.18327330613798448,
"learning_rate": 2.6747034382030655e-06,
"loss": 0.2445,
"step": 1033
},
{
"epoch": 4.250770811921891,
"grad_norm": 0.17889375387571904,
"learning_rate": 2.646056054262287e-06,
"loss": 0.2467,
"step": 1034
},
{
"epoch": 4.254881808838643,
"grad_norm": 0.17679323974968908,
"learning_rate": 2.6175520400004907e-06,
"loss": 0.2405,
"step": 1035
},
{
"epoch": 4.258992805755396,
"grad_norm": 0.17336200096095578,
"learning_rate": 2.5891916309037046e-06,
"loss": 0.2367,
"step": 1036
},
{
"epoch": 4.263103802672148,
"grad_norm": 0.1862342732350899,
"learning_rate": 2.560975061271569e-06,
"loss": 0.2294,
"step": 1037
},
{
"epoch": 4.2672147995889,
"grad_norm": 0.1761467998629582,
"learning_rate": 2.5329025642153873e-06,
"loss": 0.2448,
"step": 1038
},
{
"epoch": 4.271325796505653,
"grad_norm": 0.17762679763602063,
"learning_rate": 2.5049743716562104e-06,
"loss": 0.2459,
"step": 1039
},
{
"epoch": 4.275436793422405,
"grad_norm": 0.17679704716813474,
"learning_rate": 2.4771907143229124e-06,
"loss": 0.2366,
"step": 1040
},
{
"epoch": 4.279547790339157,
"grad_norm": 0.18512587191088023,
"learning_rate": 2.4495518217502936e-06,
"loss": 0.2334,
"step": 1041
},
{
"epoch": 4.283658787255909,
"grad_norm": 0.17098883894517244,
"learning_rate": 2.422057922277179e-06,
"loss": 0.2366,
"step": 1042
},
{
"epoch": 4.287769784172662,
"grad_norm": 0.19038355517722344,
"learning_rate": 2.3947092430445284e-06,
"loss": 0.2361,
"step": 1043
},
{
"epoch": 4.291880781089414,
"grad_norm": 0.1807683215327849,
"learning_rate": 2.367506009993572e-06,
"loss": 0.2314,
"step": 1044
},
{
"epoch": 4.295991778006167,
"grad_norm": 0.18180874713294695,
"learning_rate": 2.34044844786393e-06,
"loss": 0.2385,
"step": 1045
},
{
"epoch": 4.300102774922919,
"grad_norm": 0.18147750075878016,
"learning_rate": 2.313536780191763e-06,
"loss": 0.2336,
"step": 1046
},
{
"epoch": 4.304213771839671,
"grad_norm": 0.1782373533284217,
"learning_rate": 2.2867712293079223e-06,
"loss": 0.2356,
"step": 1047
},
{
"epoch": 4.308324768756424,
"grad_norm": 0.17802709783230702,
"learning_rate": 2.2601520163361166e-06,
"loss": 0.2445,
"step": 1048
},
{
"epoch": 4.312435765673175,
"grad_norm": 0.17602254086438468,
"learning_rate": 2.233679361191081e-06,
"loss": 0.2296,
"step": 1049
},
{
"epoch": 4.316546762589928,
"grad_norm": 0.17604437821882946,
"learning_rate": 2.2073534825767683e-06,
"loss": 0.2493,
"step": 1050
},
{
"epoch": 4.32065775950668,
"grad_norm": 0.18364670883928147,
"learning_rate": 2.18117459798453e-06,
"loss": 0.2332,
"step": 1051
},
{
"epoch": 4.324768756423433,
"grad_norm": 0.17647874008223446,
"learning_rate": 2.155142923691329e-06,
"loss": 0.2434,
"step": 1052
},
{
"epoch": 4.328879753340185,
"grad_norm": 0.1821284298329628,
"learning_rate": 2.129258674757948e-06,
"loss": 0.2405,
"step": 1053
},
{
"epoch": 4.332990750256937,
"grad_norm": 0.17780536510155415,
"learning_rate": 2.103522065027217e-06,
"loss": 0.2352,
"step": 1054
},
{
"epoch": 4.33710174717369,
"grad_norm": 0.17826171239681762,
"learning_rate": 2.07793330712224e-06,
"loss": 0.2389,
"step": 1055
},
{
"epoch": 4.341212744090442,
"grad_norm": 0.17939747251527152,
"learning_rate": 2.0524926124446497e-06,
"loss": 0.2419,
"step": 1056
},
{
"epoch": 4.345323741007194,
"grad_norm": 0.18203279406090278,
"learning_rate": 2.0272001911728466e-06,
"loss": 0.237,
"step": 1057
},
{
"epoch": 4.349434737923946,
"grad_norm": 0.1797008020492476,
"learning_rate": 2.0020562522602716e-06,
"loss": 0.2341,
"step": 1058
},
{
"epoch": 4.353545734840699,
"grad_norm": 0.1739746708895779,
"learning_rate": 1.9770610034336823e-06,
"loss": 0.2391,
"step": 1059
},
{
"epoch": 4.3576567317574515,
"grad_norm": 0.18058019470972567,
"learning_rate": 1.9522146511914265e-06,
"loss": 0.2322,
"step": 1060
},
{
"epoch": 4.361767728674203,
"grad_norm": 0.17826352479236454,
"learning_rate": 1.927517400801746e-06,
"loss": 0.2422,
"step": 1061
},
{
"epoch": 4.365878725590956,
"grad_norm": 0.16893750635969274,
"learning_rate": 1.902969456301076e-06,
"loss": 0.2332,
"step": 1062
},
{
"epoch": 4.369989722507708,
"grad_norm": 0.1782753702044134,
"learning_rate": 1.8785710204923612e-06,
"loss": 0.2385,
"step": 1063
},
{
"epoch": 4.374100719424461,
"grad_norm": 0.18141238302838078,
"learning_rate": 1.8543222949433736e-06,
"loss": 0.2463,
"step": 1064
},
{
"epoch": 4.378211716341212,
"grad_norm": 0.17681464500466545,
"learning_rate": 1.8302234799850671e-06,
"loss": 0.2441,
"step": 1065
},
{
"epoch": 4.382322713257965,
"grad_norm": 0.17491562407701997,
"learning_rate": 1.8062747747098974e-06,
"loss": 0.2359,
"step": 1066
},
{
"epoch": 4.3864337101747175,
"grad_norm": 0.17582489650428018,
"learning_rate": 1.782476376970188e-06,
"loss": 0.2518,
"step": 1067
},
{
"epoch": 4.39054470709147,
"grad_norm": 0.179309298657522,
"learning_rate": 1.7588284833765024e-06,
"loss": 0.2509,
"step": 1068
},
{
"epoch": 4.394655704008222,
"grad_norm": 0.17355362198390345,
"learning_rate": 1.7353312892960095e-06,
"loss": 0.2396,
"step": 1069
},
{
"epoch": 4.398766700924974,
"grad_norm": 0.20669657827730264,
"learning_rate": 1.7119849888508766e-06,
"loss": 0.2401,
"step": 1070
},
{
"epoch": 4.402877697841727,
"grad_norm": 0.1759615590766294,
"learning_rate": 1.6887897749166548e-06,
"loss": 0.239,
"step": 1071
},
{
"epoch": 4.406988694758479,
"grad_norm": 0.1867129692924266,
"learning_rate": 1.6657458391207049e-06,
"loss": 0.24,
"step": 1072
},
{
"epoch": 4.411099691675231,
"grad_norm": 0.17724290686658428,
"learning_rate": 1.6428533718405914e-06,
"loss": 0.2485,
"step": 1073
},
{
"epoch": 4.4152106885919835,
"grad_norm": 0.1769255669225784,
"learning_rate": 1.6201125622025315e-06,
"loss": 0.2343,
"step": 1074
},
{
"epoch": 4.419321685508736,
"grad_norm": 0.17424978161900648,
"learning_rate": 1.5975235980798153e-06,
"loss": 0.2299,
"step": 1075
},
{
"epoch": 4.423432682425489,
"grad_norm": 0.17407143372977305,
"learning_rate": 1.5750866660912634e-06,
"loss": 0.2294,
"step": 1076
},
{
"epoch": 4.42754367934224,
"grad_norm": 0.17791660590457703,
"learning_rate": 1.5528019515996783e-06,
"loss": 0.2425,
"step": 1077
},
{
"epoch": 4.431654676258993,
"grad_norm": 0.18301382705782807,
"learning_rate": 1.5306696387103227e-06,
"loss": 0.2343,
"step": 1078
},
{
"epoch": 4.435765673175745,
"grad_norm": 0.17589070591387826,
"learning_rate": 1.5086899102693875e-06,
"loss": 0.2469,
"step": 1079
},
{
"epoch": 4.439876670092497,
"grad_norm": 0.17198527500762634,
"learning_rate": 1.486862947862493e-06,
"loss": 0.2463,
"step": 1080
},
{
"epoch": 4.4439876670092495,
"grad_norm": 0.17792281862140422,
"learning_rate": 1.465188931813175e-06,
"loss": 0.2301,
"step": 1081
},
{
"epoch": 4.448098663926002,
"grad_norm": 0.17628369792032114,
"learning_rate": 1.4436680411814097e-06,
"loss": 0.2399,
"step": 1082
},
{
"epoch": 4.452209660842755,
"grad_norm": 0.17439136560526375,
"learning_rate": 1.42230045376212e-06,
"loss": 0.237,
"step": 1083
},
{
"epoch": 4.456320657759507,
"grad_norm": 0.17943067929919523,
"learning_rate": 1.4010863460837132e-06,
"loss": 0.2405,
"step": 1084
},
{
"epoch": 4.460431654676259,
"grad_norm": 0.17235533535415476,
"learning_rate": 1.380025893406638e-06,
"loss": 0.2397,
"step": 1085
},
{
"epoch": 4.464542651593011,
"grad_norm": 0.17886870223554543,
"learning_rate": 1.3591192697219003e-06,
"loss": 0.2409,
"step": 1086
},
{
"epoch": 4.468653648509764,
"grad_norm": 0.16738079998494204,
"learning_rate": 1.3383666477496627e-06,
"loss": 0.2387,
"step": 1087
},
{
"epoch": 4.4727646454265155,
"grad_norm": 0.16758787660813548,
"learning_rate": 1.3177681989377944e-06,
"loss": 0.2417,
"step": 1088
},
{
"epoch": 4.476875642343268,
"grad_norm": 0.1752202184059869,
"learning_rate": 1.2973240934604658e-06,
"loss": 0.2274,
"step": 1089
},
{
"epoch": 4.480986639260021,
"grad_norm": 0.17463210365794904,
"learning_rate": 1.277034500216736e-06,
"loss": 0.226,
"step": 1090
},
{
"epoch": 4.485097636176773,
"grad_norm": 0.17601812963083546,
"learning_rate": 1.2568995868291656e-06,
"loss": 0.2491,
"step": 1091
},
{
"epoch": 4.489208633093525,
"grad_norm": 0.1775954820916016,
"learning_rate": 1.236919519642421e-06,
"loss": 0.2432,
"step": 1092
},
{
"epoch": 4.493319630010277,
"grad_norm": 0.17531188232428954,
"learning_rate": 1.2170944637219106e-06,
"loss": 0.2417,
"step": 1093
},
{
"epoch": 4.49743062692703,
"grad_norm": 0.1752476486120662,
"learning_rate": 1.1974245828524156e-06,
"loss": 0.2274,
"step": 1094
},
{
"epoch": 4.501541623843782,
"grad_norm": 0.18283725978641932,
"learning_rate": 1.177910039536736e-06,
"loss": 0.2408,
"step": 1095
},
{
"epoch": 4.505652620760534,
"grad_norm": 0.17390337086901564,
"learning_rate": 1.1585509949943518e-06,
"loss": 0.2374,
"step": 1096
},
{
"epoch": 4.509763617677287,
"grad_norm": 0.1780578101655513,
"learning_rate": 1.1393476091600886e-06,
"loss": 0.2473,
"step": 1097
},
{
"epoch": 4.513874614594039,
"grad_norm": 0.17965538239208087,
"learning_rate": 1.120300040682798e-06,
"loss": 0.244,
"step": 1098
},
{
"epoch": 4.517985611510792,
"grad_norm": 0.17589641804084827,
"learning_rate": 1.1014084469240461e-06,
"loss": 0.2435,
"step": 1099
},
{
"epoch": 4.522096608427543,
"grad_norm": 0.17442573382633822,
"learning_rate": 1.0826729839568073e-06,
"loss": 0.2417,
"step": 1100
},
{
"epoch": 4.526207605344296,
"grad_norm": 0.1807965837015226,
"learning_rate": 1.0640938065641926e-06,
"loss": 0.2424,
"step": 1101
},
{
"epoch": 4.530318602261048,
"grad_norm": 0.17785902254626473,
"learning_rate": 1.0456710682381455e-06,
"loss": 0.2546,
"step": 1102
},
{
"epoch": 4.534429599177801,
"grad_norm": 0.17258906030475882,
"learning_rate": 1.0274049211781967e-06,
"loss": 0.2422,
"step": 1103
},
{
"epoch": 4.538540596094553,
"grad_norm": 1.2728190292585875,
"learning_rate": 1.009295516290194e-06,
"loss": 0.2608,
"step": 1104
},
{
"epoch": 4.542651593011305,
"grad_norm": 0.17125359621163755,
"learning_rate": 9.913430031850635e-07,
"loss": 0.2356,
"step": 1105
},
{
"epoch": 4.546762589928058,
"grad_norm": 0.17651600362640116,
"learning_rate": 9.735475301775632e-07,
"loss": 0.246,
"step": 1106
},
{
"epoch": 4.55087358684481,
"grad_norm": 0.16931432337364227,
"learning_rate": 9.559092442850671e-07,
"loss": 0.2289,
"step": 1107
},
{
"epoch": 4.554984583761562,
"grad_norm": 0.17727932117256406,
"learning_rate": 9.384282912263475e-07,
"loss": 0.2334,
"step": 1108
},
{
"epoch": 4.559095580678314,
"grad_norm": 0.17674518726323318,
"learning_rate": 9.211048154203661e-07,
"loss": 0.2512,
"step": 1109
},
{
"epoch": 4.563206577595067,
"grad_norm": 0.17332686615916262,
"learning_rate": 9.039389599850912e-07,
"loss": 0.2339,
"step": 1110
},
{
"epoch": 4.567317574511819,
"grad_norm": 0.1651967177583125,
"learning_rate": 8.869308667363063e-07,
"loss": 0.241,
"step": 1111
},
{
"epoch": 4.571428571428571,
"grad_norm": 0.17416950220257496,
"learning_rate": 8.700806761864466e-07,
"loss": 0.2329,
"step": 1112
},
{
"epoch": 4.575539568345324,
"grad_norm": 0.16574888125550336,
"learning_rate": 8.533885275434283e-07,
"loss": 0.2429,
"step": 1113
},
{
"epoch": 4.579650565262076,
"grad_norm": 0.1731537234968332,
"learning_rate": 8.368545587095056e-07,
"loss": 0.2414,
"step": 1114
},
{
"epoch": 4.583761562178829,
"grad_norm": 0.17116481210634382,
"learning_rate": 8.20478906280131e-07,
"loss": 0.2405,
"step": 1115
},
{
"epoch": 4.5878725590955804,
"grad_norm": 0.17008303524989962,
"learning_rate": 8.042617055428215e-07,
"loss": 0.2313,
"step": 1116
},
{
"epoch": 4.591983556012333,
"grad_norm": 0.1724744667796713,
"learning_rate": 7.882030904760518e-07,
"loss": 0.238,
"step": 1117
},
{
"epoch": 4.5960945529290855,
"grad_norm": 0.17588735295940247,
"learning_rate": 7.723031937481318e-07,
"loss": 0.2497,
"step": 1118
},
{
"epoch": 4.600205549845837,
"grad_norm": 0.17565531889855016,
"learning_rate": 7.565621467161244e-07,
"loss": 0.2563,
"step": 1119
},
{
"epoch": 4.60431654676259,
"grad_norm": 0.1730755101477369,
"learning_rate": 7.409800794247557e-07,
"loss": 0.2337,
"step": 1120
},
{
"epoch": 4.608427543679342,
"grad_norm": 0.16854464186311885,
"learning_rate": 7.25557120605338e-07,
"loss": 0.2445,
"step": 1121
},
{
"epoch": 4.612538540596095,
"grad_norm": 0.17181736526067778,
"learning_rate": 7.102933976747084e-07,
"loss": 0.2356,
"step": 1122
},
{
"epoch": 4.616649537512847,
"grad_norm": 0.17224841346483227,
"learning_rate": 6.951890367341763e-07,
"loss": 0.2404,
"step": 1123
},
{
"epoch": 4.620760534429599,
"grad_norm": 0.174217100719342,
"learning_rate": 6.802441625684774e-07,
"loss": 0.2505,
"step": 1124
},
{
"epoch": 4.6248715313463515,
"grad_norm": 0.172147168668161,
"learning_rate": 6.654588986447597e-07,
"loss": 0.2387,
"step": 1125
},
{
"epoch": 4.628982528263104,
"grad_norm": 0.17025566837994996,
"learning_rate": 6.508333671115341e-07,
"loss": 0.2445,
"step": 1126
},
{
"epoch": 4.633093525179856,
"grad_norm": 0.1791903634128583,
"learning_rate": 6.363676887976944e-07,
"loss": 0.2458,
"step": 1127
},
{
"epoch": 4.637204522096608,
"grad_norm": 0.17050764877819538,
"learning_rate": 6.220619832114971e-07,
"loss": 0.2504,
"step": 1128
},
{
"epoch": 4.641315519013361,
"grad_norm": 0.17379476155710213,
"learning_rate": 6.079163685395917e-07,
"loss": 0.2426,
"step": 1129
},
{
"epoch": 4.645426515930113,
"grad_norm": 0.17731694259336384,
"learning_rate": 5.939309616460276e-07,
"loss": 0.2356,
"step": 1130
},
{
"epoch": 4.649537512846865,
"grad_norm": 0.17342657076328494,
"learning_rate": 5.801058780713021e-07,
"loss": 0.2454,
"step": 1131
},
{
"epoch": 4.6536485097636175,
"grad_norm": 0.17506716139804732,
"learning_rate": 5.664412320314027e-07,
"loss": 0.2466,
"step": 1132
},
{
"epoch": 4.65775950668037,
"grad_norm": 0.1760318065249804,
"learning_rate": 5.529371364168535e-07,
"loss": 0.2298,
"step": 1133
},
{
"epoch": 4.661870503597123,
"grad_norm": 0.17418792165950514,
"learning_rate": 5.395937027918008e-07,
"loss": 0.2352,
"step": 1134
},
{
"epoch": 4.665981500513874,
"grad_norm": 0.1706417249640066,
"learning_rate": 5.264110413930735e-07,
"loss": 0.2398,
"step": 1135
},
{
"epoch": 4.670092497430627,
"grad_norm": 0.17721194204664026,
"learning_rate": 5.133892611292846e-07,
"loss": 0.2378,
"step": 1136
},
{
"epoch": 4.674203494347379,
"grad_norm": 0.18022478809124595,
"learning_rate": 5.005284695799217e-07,
"loss": 0.2491,
"step": 1137
},
{
"epoch": 4.678314491264132,
"grad_norm": 0.17519368559887136,
"learning_rate": 4.878287729944697e-07,
"loss": 0.2438,
"step": 1138
},
{
"epoch": 4.6824254881808836,
"grad_norm": 0.1777158927515907,
"learning_rate": 4.7529027629152234e-07,
"loss": 0.2364,
"step": 1139
},
{
"epoch": 4.686536485097636,
"grad_norm": 0.1707755427569748,
"learning_rate": 4.6291308305792315e-07,
"loss": 0.2453,
"step": 1140
},
{
"epoch": 4.690647482014389,
"grad_norm": 0.16668258759849372,
"learning_rate": 4.5069729554790386e-07,
"loss": 0.2402,
"step": 1141
},
{
"epoch": 4.694758478931141,
"grad_norm": 0.16459971581816116,
"learning_rate": 4.386430146822429e-07,
"loss": 0.2483,
"step": 1142
},
{
"epoch": 4.698869475847893,
"grad_norm": 0.17875849392738621,
"learning_rate": 4.2675034004743045e-07,
"loss": 0.241,
"step": 1143
},
{
"epoch": 4.702980472764645,
"grad_norm": 0.17296505254178202,
"learning_rate": 4.150193698948468e-07,
"loss": 0.2465,
"step": 1144
},
{
"epoch": 4.707091469681398,
"grad_norm": 0.17224757284909492,
"learning_rate": 4.034502011399499e-07,
"loss": 0.2385,
"step": 1145
},
{
"epoch": 4.7112024665981505,
"grad_norm": 0.17411551470001055,
"learning_rate": 3.92042929361478e-07,
"loss": 0.2362,
"step": 1146
},
{
"epoch": 4.715313463514902,
"grad_norm": 0.17065619759470402,
"learning_rate": 3.8079764880064817e-07,
"loss": 0.2367,
"step": 1147
},
{
"epoch": 4.719424460431655,
"grad_norm": 0.1692629166872625,
"learning_rate": 3.6971445236039685e-07,
"loss": 0.2441,
"step": 1148
},
{
"epoch": 4.723535457348407,
"grad_norm": 0.1727903716287266,
"learning_rate": 3.587934316045938e-07,
"loss": 0.2332,
"step": 1149
},
{
"epoch": 4.727646454265159,
"grad_norm": 0.16765805252498014,
"learning_rate": 3.4803467675729843e-07,
"loss": 0.2436,
"step": 1150
},
{
"epoch": 4.731757451181911,
"grad_norm": 0.1685165614846259,
"learning_rate": 3.374382767020068e-07,
"loss": 0.2462,
"step": 1151
},
{
"epoch": 4.735868448098664,
"grad_norm": 0.17176323835480908,
"learning_rate": 3.270043189809213e-07,
"loss": 0.2475,
"step": 1152
},
{
"epoch": 4.7399794450154165,
"grad_norm": 0.17054983226303438,
"learning_rate": 3.167328897942268e-07,
"loss": 0.2396,
"step": 1153
},
{
"epoch": 4.744090441932169,
"grad_norm": 0.17377255003391895,
"learning_rate": 3.0662407399937757e-07,
"loss": 0.2414,
"step": 1154
},
{
"epoch": 4.748201438848921,
"grad_norm": 0.1707561407227221,
"learning_rate": 2.96677955110396e-07,
"loss": 0.2374,
"step": 1155
},
{
"epoch": 4.752312435765673,
"grad_norm": 0.1720410967594104,
"learning_rate": 2.8689461529718634e-07,
"loss": 0.2439,
"step": 1156
},
{
"epoch": 4.756423432682426,
"grad_norm": 0.18296516134186272,
"learning_rate": 2.7727413538484625e-07,
"loss": 0.2361,
"step": 1157
},
{
"epoch": 4.760534429599177,
"grad_norm": 0.17896247286039071,
"learning_rate": 2.678165948530143e-07,
"loss": 0.2356,
"step": 1158
},
{
"epoch": 4.76464542651593,
"grad_norm": 0.17407845197351318,
"learning_rate": 2.5852207183519885e-07,
"loss": 0.2251,
"step": 1159
},
{
"epoch": 4.7687564234326825,
"grad_norm": 0.17062892002775074,
"learning_rate": 2.493906431181392e-07,
"loss": 0.2438,
"step": 1160
},
{
"epoch": 4.772867420349435,
"grad_norm": 0.1672404807221335,
"learning_rate": 2.4042238414117016e-07,
"loss": 0.2261,
"step": 1161
},
{
"epoch": 4.7769784172661875,
"grad_norm": 0.17275226054969903,
"learning_rate": 2.3161736899560249e-07,
"loss": 0.2394,
"step": 1162
},
{
"epoch": 4.781089414182939,
"grad_norm": 0.17356630567530312,
"learning_rate": 2.2297567042410372e-07,
"loss": 0.2345,
"step": 1163
},
{
"epoch": 4.785200411099692,
"grad_norm": 0.17592520932271608,
"learning_rate": 2.1449735982010278e-07,
"loss": 0.2431,
"step": 1164
},
{
"epoch": 4.789311408016444,
"grad_norm": 0.17039915938031028,
"learning_rate": 2.0618250722719501e-07,
"loss": 0.2431,
"step": 1165
},
{
"epoch": 4.793422404933196,
"grad_norm": 0.1730081549716055,
"learning_rate": 1.9803118133857157e-07,
"loss": 0.2486,
"step": 1166
},
{
"epoch": 4.7975334018499485,
"grad_norm": 0.17038132685762578,
"learning_rate": 1.9004344949644425e-07,
"loss": 0.2409,
"step": 1167
},
{
"epoch": 4.801644398766701,
"grad_norm": 0.1743699418678854,
"learning_rate": 1.8221937769149045e-07,
"loss": 0.2365,
"step": 1168
},
{
"epoch": 4.805755395683454,
"grad_norm": 0.16868706789230756,
"learning_rate": 1.745590305623157e-07,
"loss": 0.2415,
"step": 1169
},
{
"epoch": 4.809866392600205,
"grad_norm": 0.17209405727243507,
"learning_rate": 1.6706247139490318e-07,
"loss": 0.2434,
"step": 1170
},
{
"epoch": 4.813977389516958,
"grad_norm": 0.17768294930126322,
"learning_rate": 1.5972976212211388e-07,
"loss": 0.2333,
"step": 1171
},
{
"epoch": 4.81808838643371,
"grad_norm": 0.1692714430601308,
"learning_rate": 1.525609633231495e-07,
"loss": 0.247,
"step": 1172
},
{
"epoch": 4.822199383350463,
"grad_norm": 0.16825815560990823,
"learning_rate": 1.455561342230749e-07,
"loss": 0.249,
"step": 1173
},
{
"epoch": 4.8263103802672145,
"grad_norm": 0.17141290386405647,
"learning_rate": 1.3871533269231187e-07,
"loss": 0.2547,
"step": 1174
},
{
"epoch": 4.830421377183967,
"grad_norm": 0.17040562779973573,
"learning_rate": 1.3203861524617278e-07,
"loss": 0.2519,
"step": 1175
},
{
"epoch": 4.83453237410072,
"grad_norm": 0.1677164586091278,
"learning_rate": 1.2552603704438115e-07,
"loss": 0.2334,
"step": 1176
},
{
"epoch": 4.838643371017472,
"grad_norm": 0.1695089968957749,
"learning_rate": 1.1917765189063402e-07,
"loss": 0.243,
"step": 1177
},
{
"epoch": 4.842754367934224,
"grad_norm": 0.17083241390302353,
"learning_rate": 1.1299351223214017e-07,
"loss": 0.2349,
"step": 1178
},
{
"epoch": 4.846865364850976,
"grad_norm": 0.17067355512311883,
"learning_rate": 1.069736691591916e-07,
"loss": 0.2392,
"step": 1179
},
{
"epoch": 4.850976361767729,
"grad_norm": 0.16913426842634588,
"learning_rate": 1.0111817240475052e-07,
"loss": 0.23,
"step": 1180
},
{
"epoch": 4.8550873586844805,
"grad_norm": 0.17062118220717767,
"learning_rate": 9.542707034402299e-08,
"loss": 0.2358,
"step": 1181
},
{
"epoch": 4.859198355601233,
"grad_norm": 0.17029660865240914,
"learning_rate": 8.990040999407701e-08,
"loss": 0.2302,
"step": 1182
},
{
"epoch": 4.863309352517986,
"grad_norm": 0.16773211882510938,
"learning_rate": 8.453823701343622e-08,
"loss": 0.245,
"step": 1183
},
{
"epoch": 4.867420349434738,
"grad_norm": 0.16985639544523204,
"learning_rate": 7.93405957017157e-08,
"loss": 0.2345,
"step": 1184
},
{
"epoch": 4.871531346351491,
"grad_norm": 0.16908464736483494,
"learning_rate": 7.430752899924898e-08,
"loss": 0.2413,
"step": 1185
},
{
"epoch": 4.875642343268242,
"grad_norm": 0.1759346258041749,
"learning_rate": 6.943907848673937e-08,
"loss": 0.2427,
"step": 1186
},
{
"epoch": 4.879753340184995,
"grad_norm": 0.17174700886638744,
"learning_rate": 6.473528438490916e-08,
"loss": 0.2439,
"step": 1187
},
{
"epoch": 4.883864337101747,
"grad_norm": 0.19693454366502605,
"learning_rate": 6.019618555417328e-08,
"loss": 0.2377,
"step": 1188
},
{
"epoch": 4.887975334018499,
"grad_norm": 0.17754029524221127,
"learning_rate": 5.58218194943172e-08,
"loss": 0.2293,
"step": 1189
},
{
"epoch": 4.892086330935252,
"grad_norm": 0.1700686268287155,
"learning_rate": 5.161222234418173e-08,
"loss": 0.2416,
"step": 1190
},
{
"epoch": 4.896197327852004,
"grad_norm": 0.1778073111241822,
"learning_rate": 4.756742888136989e-08,
"loss": 0.245,
"step": 1191
},
{
"epoch": 4.900308324768757,
"grad_norm": 0.17085644813691567,
"learning_rate": 4.3687472521962704e-08,
"loss": 0.2386,
"step": 1192
},
{
"epoch": 4.904419321685509,
"grad_norm": 0.17614275321803505,
"learning_rate": 3.997238532023273e-08,
"loss": 0.2378,
"step": 1193
},
{
"epoch": 4.908530318602261,
"grad_norm": 0.16573489640277006,
"learning_rate": 3.642219796839097e-08,
"loss": 0.2386,
"step": 1194
},
{
"epoch": 4.912641315519013,
"grad_norm": 0.16678550464295586,
"learning_rate": 3.303693979632039e-08,
"loss": 0.2411,
"step": 1195
},
{
"epoch": 4.916752312435766,
"grad_norm": 0.1683860743917499,
"learning_rate": 2.981663877134944e-08,
"loss": 0.2443,
"step": 1196
},
{
"epoch": 4.920863309352518,
"grad_norm": 0.16597729886542864,
"learning_rate": 2.6761321498005587e-08,
"loss": 0.2408,
"step": 1197
},
{
"epoch": 4.92497430626927,
"grad_norm": 0.1718127781713061,
"learning_rate": 2.3871013217806605e-08,
"loss": 0.2412,
"step": 1198
},
{
"epoch": 4.929085303186023,
"grad_norm": 0.17097349823279034,
"learning_rate": 2.1145737809045162e-08,
"loss": 0.2421,
"step": 1199
},
{
"epoch": 4.933196300102775,
"grad_norm": 0.17427216115981084,
"learning_rate": 1.8585517786597894e-08,
"loss": 0.2381,
"step": 1200
},
{
"epoch": 4.937307297019527,
"grad_norm": 0.1744216439196038,
"learning_rate": 1.6190374301727762e-08,
"loss": 0.2282,
"step": 1201
},
{
"epoch": 4.941418293936279,
"grad_norm": 0.17445833543773084,
"learning_rate": 1.3960327141926411e-08,
"loss": 0.2299,
"step": 1202
},
{
"epoch": 4.945529290853032,
"grad_norm": 0.17503271001824838,
"learning_rate": 1.1895394730738751e-08,
"loss": 0.2333,
"step": 1203
},
{
"epoch": 4.9496402877697845,
"grad_norm": 0.1689882913201602,
"learning_rate": 9.995594127607534e-09,
"loss": 0.2426,
"step": 1204
},
{
"epoch": 4.953751284686536,
"grad_norm": 0.17067106138398636,
"learning_rate": 8.260941027746772e-09,
"loss": 0.2477,
"step": 1205
},
{
"epoch": 4.957862281603289,
"grad_norm": 0.17523534599452864,
"learning_rate": 6.6914497619996465e-09,
"loss": 0.2362,
"step": 1206
},
{
"epoch": 4.961973278520041,
"grad_norm": 0.16958624262500685,
"learning_rate": 5.287133296723035e-09,
"loss": 0.2416,
"step": 1207
},
{
"epoch": 4.966084275436794,
"grad_norm": 0.17141643376988966,
"learning_rate": 4.048003233687592e-09,
"loss": 0.2319,
"step": 1208
},
{
"epoch": 4.970195272353545,
"grad_norm": 0.17444671360259928,
"learning_rate": 2.974069809964508e-09,
"loss": 0.2442,
"step": 1209
},
{
"epoch": 4.974306269270298,
"grad_norm": 0.16938004185341077,
"learning_rate": 2.065341897865558e-09,
"loss": 0.2482,
"step": 1210
},
{
"epoch": 4.9784172661870505,
"grad_norm": 0.16777811457050917,
"learning_rate": 1.32182700484762e-09,
"loss": 0.2465,
"step": 1211
},
{
"epoch": 4.982528263103803,
"grad_norm": 0.17657705108779342,
"learning_rate": 7.435312734593858e-10,
"loss": 0.2395,
"step": 1212
},
{
"epoch": 4.986639260020555,
"grad_norm": 0.16972832488616352,
"learning_rate": 3.304594812991724e-10,
"loss": 0.2413,
"step": 1213
},
{
"epoch": 4.990750256937307,
"grad_norm": 0.16507566272358404,
"learning_rate": 8.261504095496976e-11,
"loss": 0.2336,
"step": 1214
},
{
"epoch": 4.99486125385406,
"grad_norm": 0.17100744078272323,
"learning_rate": 0.0,
"loss": 0.2357,
"step": 1215
},
{
"epoch": 4.99486125385406,
"step": 1215,
"total_flos": 4.757804886857613e+18,
"train_loss": 0.34405366748939326,
"train_runtime": 28473.715,
"train_samples_per_second": 5.463,
"train_steps_per_second": 0.043
}
],
"logging_steps": 1,
"max_steps": 1215,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 4.757804886857613e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}