{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 4.981412639405205,
"eval_steps": 500,
"global_step": 1005,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004956629491945477,
"grad_norm": 5.755599171540443,
"learning_rate": 7.920792079207921e-07,
"loss": 0.8036,
"step": 1
},
{
"epoch": 0.009913258983890954,
"grad_norm": 5.709874822891588,
"learning_rate": 1.5841584158415842e-06,
"loss": 0.7966,
"step": 2
},
{
"epoch": 0.01486988847583643,
"grad_norm": 5.669629961381398,
"learning_rate": 2.3762376237623762e-06,
"loss": 0.7948,
"step": 3
},
{
"epoch": 0.01982651796778191,
"grad_norm": 5.3157614476293835,
"learning_rate": 3.1683168316831685e-06,
"loss": 0.7762,
"step": 4
},
{
"epoch": 0.024783147459727387,
"grad_norm": 4.356092541330962,
"learning_rate": 3.960396039603961e-06,
"loss": 0.7499,
"step": 5
},
{
"epoch": 0.02973977695167286,
"grad_norm": 2.250263146370207,
"learning_rate": 4.7524752475247525e-06,
"loss": 0.6956,
"step": 6
},
{
"epoch": 0.03469640644361834,
"grad_norm": 1.914117915511351,
"learning_rate": 5.544554455445545e-06,
"loss": 0.6839,
"step": 7
},
{
"epoch": 0.03965303593556382,
"grad_norm": 3.1492377262690066,
"learning_rate": 6.336633663366337e-06,
"loss": 0.6668,
"step": 8
},
{
"epoch": 0.04460966542750929,
"grad_norm": 3.4712456437876367,
"learning_rate": 7.128712871287129e-06,
"loss": 0.677,
"step": 9
},
{
"epoch": 0.04956629491945477,
"grad_norm": 3.421087454928539,
"learning_rate": 7.920792079207921e-06,
"loss": 0.6624,
"step": 10
},
{
"epoch": 0.05452292441140025,
"grad_norm": 2.9620186432080806,
"learning_rate": 8.712871287128714e-06,
"loss": 0.6236,
"step": 11
},
{
"epoch": 0.05947955390334572,
"grad_norm": 2.8338118123601115,
"learning_rate": 9.504950495049505e-06,
"loss": 0.6228,
"step": 12
},
{
"epoch": 0.0644361833952912,
"grad_norm": 1.8504400593431658,
"learning_rate": 1.0297029702970298e-05,
"loss": 0.6038,
"step": 13
},
{
"epoch": 0.06939281288723669,
"grad_norm": 1.4609039363755218,
"learning_rate": 1.108910891089109e-05,
"loss": 0.5935,
"step": 14
},
{
"epoch": 0.07434944237918216,
"grad_norm": 1.97639910368095,
"learning_rate": 1.1881188118811881e-05,
"loss": 0.572,
"step": 15
},
{
"epoch": 0.07930607187112763,
"grad_norm": 1.7151285016710816,
"learning_rate": 1.2673267326732674e-05,
"loss": 0.5634,
"step": 16
},
{
"epoch": 0.08426270136307311,
"grad_norm": 1.1070303436473,
"learning_rate": 1.3465346534653467e-05,
"loss": 0.5572,
"step": 17
},
{
"epoch": 0.08921933085501858,
"grad_norm": 1.2639833149957966,
"learning_rate": 1.4257425742574257e-05,
"loss": 0.5474,
"step": 18
},
{
"epoch": 0.09417596034696406,
"grad_norm": 1.3091402588020495,
"learning_rate": 1.504950495049505e-05,
"loss": 0.5477,
"step": 19
},
{
"epoch": 0.09913258983890955,
"grad_norm": 0.8044594878857394,
"learning_rate": 1.5841584158415843e-05,
"loss": 0.5397,
"step": 20
},
{
"epoch": 0.10408921933085502,
"grad_norm": 1.0704477258758005,
"learning_rate": 1.6633663366336635e-05,
"loss": 0.5281,
"step": 21
},
{
"epoch": 0.1090458488228005,
"grad_norm": 0.9039341093983634,
"learning_rate": 1.7425742574257428e-05,
"loss": 0.5199,
"step": 22
},
{
"epoch": 0.11400247831474597,
"grad_norm": 0.8044388950158062,
"learning_rate": 1.821782178217822e-05,
"loss": 0.5139,
"step": 23
},
{
"epoch": 0.11895910780669144,
"grad_norm": 0.9382332156976467,
"learning_rate": 1.900990099009901e-05,
"loss": 0.5193,
"step": 24
},
{
"epoch": 0.12391573729863693,
"grad_norm": 0.6982723380926246,
"learning_rate": 1.9801980198019803e-05,
"loss": 0.5147,
"step": 25
},
{
"epoch": 0.1288723667905824,
"grad_norm": 0.783184579204947,
"learning_rate": 2.0594059405940595e-05,
"loss": 0.5062,
"step": 26
},
{
"epoch": 0.13382899628252787,
"grad_norm": 0.6904331997041252,
"learning_rate": 2.1386138613861388e-05,
"loss": 0.4938,
"step": 27
},
{
"epoch": 0.13878562577447337,
"grad_norm": 0.7021840211104649,
"learning_rate": 2.217821782178218e-05,
"loss": 0.4978,
"step": 28
},
{
"epoch": 0.14374225526641884,
"grad_norm": 0.683654391663701,
"learning_rate": 2.297029702970297e-05,
"loss": 0.4961,
"step": 29
},
{
"epoch": 0.14869888475836432,
"grad_norm": 0.5156280172267378,
"learning_rate": 2.3762376237623762e-05,
"loss": 0.4928,
"step": 30
},
{
"epoch": 0.1536555142503098,
"grad_norm": 0.6200967438468933,
"learning_rate": 2.4554455445544555e-05,
"loss": 0.4962,
"step": 31
},
{
"epoch": 0.15861214374225527,
"grad_norm": 0.44876109447418644,
"learning_rate": 2.5346534653465348e-05,
"loss": 0.4841,
"step": 32
},
{
"epoch": 0.16356877323420074,
"grad_norm": 0.6136329783927771,
"learning_rate": 2.613861386138614e-05,
"loss": 0.486,
"step": 33
},
{
"epoch": 0.16852540272614622,
"grad_norm": 0.5297287329726431,
"learning_rate": 2.6930693069306933e-05,
"loss": 0.4742,
"step": 34
},
{
"epoch": 0.1734820322180917,
"grad_norm": 0.5768447884060003,
"learning_rate": 2.7722772277227722e-05,
"loss": 0.4825,
"step": 35
},
{
"epoch": 0.17843866171003717,
"grad_norm": 0.5982890948235946,
"learning_rate": 2.8514851485148515e-05,
"loss": 0.4824,
"step": 36
},
{
"epoch": 0.18339529120198264,
"grad_norm": 0.5067059248621495,
"learning_rate": 2.9306930693069308e-05,
"loss": 0.4816,
"step": 37
},
{
"epoch": 0.18835192069392812,
"grad_norm": 0.6942780068708954,
"learning_rate": 3.00990099009901e-05,
"loss": 0.4775,
"step": 38
},
{
"epoch": 0.19330855018587362,
"grad_norm": 0.8143766320066766,
"learning_rate": 3.0891089108910896e-05,
"loss": 0.4804,
"step": 39
},
{
"epoch": 0.1982651796778191,
"grad_norm": 1.3888869925003242,
"learning_rate": 3.1683168316831686e-05,
"loss": 0.4754,
"step": 40
},
{
"epoch": 0.20322180916976457,
"grad_norm": 0.8396306197733768,
"learning_rate": 3.247524752475248e-05,
"loss": 0.4694,
"step": 41
},
{
"epoch": 0.20817843866171004,
"grad_norm": 0.6817330424057285,
"learning_rate": 3.326732673267327e-05,
"loss": 0.4584,
"step": 42
},
{
"epoch": 0.21313506815365552,
"grad_norm": 1.366427166379272,
"learning_rate": 3.405940594059406e-05,
"loss": 0.4645,
"step": 43
},
{
"epoch": 0.218091697645601,
"grad_norm": 0.8371176675680243,
"learning_rate": 3.4851485148514856e-05,
"loss": 0.4708,
"step": 44
},
{
"epoch": 0.22304832713754646,
"grad_norm": 1.0401044191645252,
"learning_rate": 3.5643564356435645e-05,
"loss": 0.4703,
"step": 45
},
{
"epoch": 0.22800495662949194,
"grad_norm": 1.3486576493438978,
"learning_rate": 3.643564356435644e-05,
"loss": 0.4661,
"step": 46
},
{
"epoch": 0.23296158612143741,
"grad_norm": 0.9562289057981055,
"learning_rate": 3.722772277227723e-05,
"loss": 0.4527,
"step": 47
},
{
"epoch": 0.2379182156133829,
"grad_norm": 0.8711388828137366,
"learning_rate": 3.801980198019802e-05,
"loss": 0.4541,
"step": 48
},
{
"epoch": 0.24287484510532836,
"grad_norm": 1.0960375147187478,
"learning_rate": 3.8811881188118816e-05,
"loss": 0.451,
"step": 49
},
{
"epoch": 0.24783147459727387,
"grad_norm": 1.7582704593641059,
"learning_rate": 3.9603960396039605e-05,
"loss": 0.4628,
"step": 50
},
{
"epoch": 0.2527881040892193,
"grad_norm": 0.7712567737809581,
"learning_rate": 4.03960396039604e-05,
"loss": 0.4572,
"step": 51
},
{
"epoch": 0.2577447335811648,
"grad_norm": 1.5724932491218706,
"learning_rate": 4.118811881188119e-05,
"loss": 0.4561,
"step": 52
},
{
"epoch": 0.26270136307311026,
"grad_norm": 1.113040327791958,
"learning_rate": 4.1980198019801987e-05,
"loss": 0.4577,
"step": 53
},
{
"epoch": 0.26765799256505574,
"grad_norm": 1.4339506148154477,
"learning_rate": 4.2772277227722776e-05,
"loss": 0.4518,
"step": 54
},
{
"epoch": 0.27261462205700127,
"grad_norm": 1.0054609210522674,
"learning_rate": 4.356435643564357e-05,
"loss": 0.4592,
"step": 55
},
{
"epoch": 0.27757125154894674,
"grad_norm": 1.677173196248357,
"learning_rate": 4.435643564356436e-05,
"loss": 0.4526,
"step": 56
},
{
"epoch": 0.2825278810408922,
"grad_norm": 1.6091305822608544,
"learning_rate": 4.514851485148515e-05,
"loss": 0.4543,
"step": 57
},
{
"epoch": 0.2874845105328377,
"grad_norm": 0.8750200586980326,
"learning_rate": 4.594059405940594e-05,
"loss": 0.4562,
"step": 58
},
{
"epoch": 0.29244114002478316,
"grad_norm": 1.6157973616198276,
"learning_rate": 4.6732673267326736e-05,
"loss": 0.4572,
"step": 59
},
{
"epoch": 0.29739776951672864,
"grad_norm": 1.1414962367611863,
"learning_rate": 4.7524752475247525e-05,
"loss": 0.4563,
"step": 60
},
{
"epoch": 0.3023543990086741,
"grad_norm": 1.2158957352523736,
"learning_rate": 4.831683168316832e-05,
"loss": 0.4535,
"step": 61
},
{
"epoch": 0.3073110285006196,
"grad_norm": 1.3830326369244816,
"learning_rate": 4.910891089108911e-05,
"loss": 0.4536,
"step": 62
},
{
"epoch": 0.31226765799256506,
"grad_norm": 0.9017071808108595,
"learning_rate": 4.9900990099009906e-05,
"loss": 0.4421,
"step": 63
},
{
"epoch": 0.31722428748451054,
"grad_norm": 1.0504602546546122,
"learning_rate": 5.0693069306930696e-05,
"loss": 0.45,
"step": 64
},
{
"epoch": 0.322180916976456,
"grad_norm": 1.4474448248935323,
"learning_rate": 5.148514851485149e-05,
"loss": 0.4474,
"step": 65
},
{
"epoch": 0.3271375464684015,
"grad_norm": 0.8895932355362427,
"learning_rate": 5.227722772277228e-05,
"loss": 0.445,
"step": 66
},
{
"epoch": 0.33209417596034696,
"grad_norm": 1.403939101234426,
"learning_rate": 5.306930693069308e-05,
"loss": 0.4485,
"step": 67
},
{
"epoch": 0.33705080545229243,
"grad_norm": 1.255761190764639,
"learning_rate": 5.3861386138613866e-05,
"loss": 0.4441,
"step": 68
},
{
"epoch": 0.3420074349442379,
"grad_norm": 1.4235767315147947,
"learning_rate": 5.465346534653466e-05,
"loss": 0.4352,
"step": 69
},
{
"epoch": 0.3469640644361834,
"grad_norm": 1.0711027609931336,
"learning_rate": 5.5445544554455445e-05,
"loss": 0.4422,
"step": 70
},
{
"epoch": 0.35192069392812886,
"grad_norm": 0.8536336221070487,
"learning_rate": 5.623762376237624e-05,
"loss": 0.4387,
"step": 71
},
{
"epoch": 0.35687732342007433,
"grad_norm": 1.0968094429380055,
"learning_rate": 5.702970297029703e-05,
"loss": 0.4395,
"step": 72
},
{
"epoch": 0.3618339529120198,
"grad_norm": 1.6921836428631551,
"learning_rate": 5.7821782178217826e-05,
"loss": 0.4412,
"step": 73
},
{
"epoch": 0.3667905824039653,
"grad_norm": 0.9943215031713406,
"learning_rate": 5.8613861386138615e-05,
"loss": 0.4471,
"step": 74
},
{
"epoch": 0.37174721189591076,
"grad_norm": 1.3369972747378565,
"learning_rate": 5.940594059405941e-05,
"loss": 0.4462,
"step": 75
},
{
"epoch": 0.37670384138785623,
"grad_norm": 1.1434125178085341,
"learning_rate": 6.01980198019802e-05,
"loss": 0.437,
"step": 76
},
{
"epoch": 0.38166047087980176,
"grad_norm": 1.9891182727972199,
"learning_rate": 6.0990099009900997e-05,
"loss": 0.4393,
"step": 77
},
{
"epoch": 0.38661710037174724,
"grad_norm": 1.3853668626293705,
"learning_rate": 6.178217821782179e-05,
"loss": 0.4443,
"step": 78
},
{
"epoch": 0.3915737298636927,
"grad_norm": 1.346996528108508,
"learning_rate": 6.257425742574258e-05,
"loss": 0.4362,
"step": 79
},
{
"epoch": 0.3965303593556382,
"grad_norm": 1.660430248946372,
"learning_rate": 6.336633663366337e-05,
"loss": 0.4418,
"step": 80
},
{
"epoch": 0.40148698884758366,
"grad_norm": 1.5145413014843,
"learning_rate": 6.415841584158417e-05,
"loss": 0.4432,
"step": 81
},
{
"epoch": 0.40644361833952913,
"grad_norm": 1.046097641466344,
"learning_rate": 6.495049504950496e-05,
"loss": 0.435,
"step": 82
},
{
"epoch": 0.4114002478314746,
"grad_norm": 2.753750924493306,
"learning_rate": 6.574257425742575e-05,
"loss": 0.4391,
"step": 83
},
{
"epoch": 0.4163568773234201,
"grad_norm": 2.2173389152299703,
"learning_rate": 6.653465346534654e-05,
"loss": 0.4348,
"step": 84
},
{
"epoch": 0.42131350681536556,
"grad_norm": 1.861248154528301,
"learning_rate": 6.732673267326732e-05,
"loss": 0.44,
"step": 85
},
{
"epoch": 0.42627013630731103,
"grad_norm": 1.3599555831815109,
"learning_rate": 6.811881188118812e-05,
"loss": 0.4412,
"step": 86
},
{
"epoch": 0.4312267657992565,
"grad_norm": 1.8825908999305676,
"learning_rate": 6.891089108910892e-05,
"loss": 0.4402,
"step": 87
},
{
"epoch": 0.436183395291202,
"grad_norm": 1.0404951505743878,
"learning_rate": 6.970297029702971e-05,
"loss": 0.4413,
"step": 88
},
{
"epoch": 0.44114002478314746,
"grad_norm": 1.9626701461839848,
"learning_rate": 7.04950495049505e-05,
"loss": 0.4387,
"step": 89
},
{
"epoch": 0.44609665427509293,
"grad_norm": 1.346795312977841,
"learning_rate": 7.128712871287129e-05,
"loss": 0.4459,
"step": 90
},
{
"epoch": 0.4510532837670384,
"grad_norm": 1.8165061269552687,
"learning_rate": 7.207920792079209e-05,
"loss": 0.4386,
"step": 91
},
{
"epoch": 0.4560099132589839,
"grad_norm": 1.7047163209623588,
"learning_rate": 7.287128712871288e-05,
"loss": 0.446,
"step": 92
},
{
"epoch": 0.46096654275092935,
"grad_norm": 1.1114346196800748,
"learning_rate": 7.366336633663368e-05,
"loss": 0.437,
"step": 93
},
{
"epoch": 0.46592317224287483,
"grad_norm": 1.278726477325811,
"learning_rate": 7.445544554455446e-05,
"loss": 0.4484,
"step": 94
},
{
"epoch": 0.4708798017348203,
"grad_norm": 1.1877378594704833,
"learning_rate": 7.524752475247524e-05,
"loss": 0.4413,
"step": 95
},
{
"epoch": 0.4758364312267658,
"grad_norm": 1.5235522525394471,
"learning_rate": 7.603960396039604e-05,
"loss": 0.4506,
"step": 96
},
{
"epoch": 0.48079306071871125,
"grad_norm": 1.347165643600965,
"learning_rate": 7.683168316831684e-05,
"loss": 0.4395,
"step": 97
},
{
"epoch": 0.4857496902106567,
"grad_norm": 1.059604823529573,
"learning_rate": 7.762376237623763e-05,
"loss": 0.4314,
"step": 98
},
{
"epoch": 0.49070631970260226,
"grad_norm": 1.2713286854324148,
"learning_rate": 7.841584158415841e-05,
"loss": 0.4445,
"step": 99
},
{
"epoch": 0.49566294919454773,
"grad_norm": 1.2438870796013015,
"learning_rate": 7.920792079207921e-05,
"loss": 0.4347,
"step": 100
},
{
"epoch": 0.5006195786864932,
"grad_norm": 1.1477026233352385,
"learning_rate": 8e-05,
"loss": 0.4342,
"step": 101
},
{
"epoch": 0.5055762081784386,
"grad_norm": 2.0339153985641882,
"learning_rate": 7.999975845811957e-05,
"loss": 0.4357,
"step": 102
},
{
"epoch": 0.5105328376703842,
"grad_norm": 1.2573420056205944,
"learning_rate": 7.999903383539539e-05,
"loss": 0.4338,
"step": 103
},
{
"epoch": 0.5154894671623296,
"grad_norm": 1.7533982812405757,
"learning_rate": 7.99978261405788e-05,
"loss": 0.444,
"step": 104
},
{
"epoch": 0.5204460966542751,
"grad_norm": 1.5042306616329586,
"learning_rate": 7.999613538825525e-05,
"loss": 0.4349,
"step": 105
},
{
"epoch": 0.5254027261462205,
"grad_norm": 1.4254423900122548,
"learning_rate": 7.999396159884411e-05,
"loss": 0.443,
"step": 106
},
{
"epoch": 0.530359355638166,
"grad_norm": 1.9091378325875628,
"learning_rate": 7.999130479859843e-05,
"loss": 0.4391,
"step": 107
},
{
"epoch": 0.5353159851301115,
"grad_norm": 1.0666407111374168,
"learning_rate": 7.998816501960465e-05,
"loss": 0.4336,
"step": 108
},
{
"epoch": 0.540272614622057,
"grad_norm": 2.296350041317558,
"learning_rate": 7.998454229978217e-05,
"loss": 0.4391,
"step": 109
},
{
"epoch": 0.5452292441140025,
"grad_norm": 1.9484546845581812,
"learning_rate": 7.998043668288292e-05,
"loss": 0.4455,
"step": 110
},
{
"epoch": 0.550185873605948,
"grad_norm": 1.4987176063935246,
"learning_rate": 7.997584821849081e-05,
"loss": 0.4323,
"step": 111
},
{
"epoch": 0.5551425030978935,
"grad_norm": 1.2690226187336895,
"learning_rate": 7.997077696202118e-05,
"loss": 0.4345,
"step": 112
},
{
"epoch": 0.5600991325898389,
"grad_norm": 1.3508924265435234,
"learning_rate": 7.996522297472005e-05,
"loss": 0.4357,
"step": 113
},
{
"epoch": 0.5650557620817844,
"grad_norm": 0.878205176991638,
"learning_rate": 7.995918632366346e-05,
"loss": 0.4244,
"step": 114
},
{
"epoch": 0.5700123915737298,
"grad_norm": 1.2129143217229221,
"learning_rate": 7.995266708175662e-05,
"loss": 0.4344,
"step": 115
},
{
"epoch": 0.5749690210656754,
"grad_norm": 1.1214123361921224,
"learning_rate": 7.994566532773299e-05,
"loss": 0.4254,
"step": 116
},
{
"epoch": 0.5799256505576208,
"grad_norm": 1.138267588266493,
"learning_rate": 7.993818114615345e-05,
"loss": 0.4207,
"step": 117
},
{
"epoch": 0.5848822800495663,
"grad_norm": 1.175279377193168,
"learning_rate": 7.993021462740514e-05,
"loss": 0.4283,
"step": 118
},
{
"epoch": 0.5898389095415117,
"grad_norm": 1.2923967393215616,
"learning_rate": 7.992176586770047e-05,
"loss": 0.4239,
"step": 119
},
{
"epoch": 0.5947955390334573,
"grad_norm": 1.1604367180664859,
"learning_rate": 7.991283496907591e-05,
"loss": 0.4247,
"step": 120
},
{
"epoch": 0.5997521685254027,
"grad_norm": 1.2250015260621407,
"learning_rate": 7.990342203939075e-05,
"loss": 0.4272,
"step": 121
},
{
"epoch": 0.6047087980173482,
"grad_norm": 1.0696805090524393,
"learning_rate": 7.989352719232583e-05,
"loss": 0.419,
"step": 122
},
{
"epoch": 0.6096654275092936,
"grad_norm": 0.7685739951418558,
"learning_rate": 7.988315054738214e-05,
"loss": 0.4245,
"step": 123
},
{
"epoch": 0.6146220570012392,
"grad_norm": 1.099330987636103,
"learning_rate": 7.987229222987942e-05,
"loss": 0.419,
"step": 124
},
{
"epoch": 0.6195786864931846,
"grad_norm": 1.1959932170089127,
"learning_rate": 7.986095237095457e-05,
"loss": 0.4241,
"step": 125
},
{
"epoch": 0.6245353159851301,
"grad_norm": 1.1283319629455746,
"learning_rate": 7.984913110756015e-05,
"loss": 0.4141,
"step": 126
},
{
"epoch": 0.6294919454770755,
"grad_norm": 1.391652204029837,
"learning_rate": 7.983682858246265e-05,
"loss": 0.4221,
"step": 127
},
{
"epoch": 0.6344485749690211,
"grad_norm": 1.072268257225815,
"learning_rate": 7.982404494424083e-05,
"loss": 0.4252,
"step": 128
},
{
"epoch": 0.6394052044609665,
"grad_norm": 1.082064345697297,
"learning_rate": 7.98107803472839e-05,
"loss": 0.4192,
"step": 129
},
{
"epoch": 0.644361833952912,
"grad_norm": 1.247560729399529,
"learning_rate": 7.979703495178964e-05,
"loss": 0.4325,
"step": 130
},
{
"epoch": 0.6493184634448576,
"grad_norm": 1.2191150794285281,
"learning_rate": 7.978280892376246e-05,
"loss": 0.43,
"step": 131
},
{
"epoch": 0.654275092936803,
"grad_norm": 1.482051924735927,
"learning_rate": 7.976810243501147e-05,
"loss": 0.4156,
"step": 132
},
{
"epoch": 0.6592317224287485,
"grad_norm": 1.0556101942015566,
"learning_rate": 7.975291566314832e-05,
"loss": 0.4204,
"step": 133
},
{
"epoch": 0.6641883519206939,
"grad_norm": 1.4485559402791497,
"learning_rate": 7.973724879158506e-05,
"loss": 0.4282,
"step": 134
},
{
"epoch": 0.6691449814126395,
"grad_norm": 1.0373100960236672,
"learning_rate": 7.972110200953197e-05,
"loss": 0.419,
"step": 135
},
{
"epoch": 0.6741016109045849,
"grad_norm": 1.7160742238845912,
"learning_rate": 7.970447551199527e-05,
"loss": 0.4262,
"step": 136
},
{
"epoch": 0.6790582403965304,
"grad_norm": 0.853051103915497,
"learning_rate": 7.968736949977473e-05,
"loss": 0.4183,
"step": 137
},
{
"epoch": 0.6840148698884758,
"grad_norm": 1.6581770847461408,
"learning_rate": 7.966978417946126e-05,
"loss": 0.4254,
"step": 138
},
{
"epoch": 0.6889714993804213,
"grad_norm": 1.0503931111123224,
"learning_rate": 7.965171976343443e-05,
"loss": 0.4234,
"step": 139
},
{
"epoch": 0.6939281288723668,
"grad_norm": 1.5689706067197353,
"learning_rate": 7.96331764698599e-05,
"loss": 0.4254,
"step": 140
},
{
"epoch": 0.6988847583643123,
"grad_norm": 1.2998733109100478,
"learning_rate": 7.961415452268675e-05,
"loss": 0.4348,
"step": 141
},
{
"epoch": 0.7038413878562577,
"grad_norm": 1.3801257793213206,
"learning_rate": 7.959465415164485e-05,
"loss": 0.4156,
"step": 142
},
{
"epoch": 0.7087980173482032,
"grad_norm": 1.0868824840439038,
"learning_rate": 7.957467559224202e-05,
"loss": 0.4208,
"step": 143
},
{
"epoch": 0.7137546468401487,
"grad_norm": 1.2221028482858696,
"learning_rate": 7.955421908576115e-05,
"loss": 0.4176,
"step": 144
},
{
"epoch": 0.7187112763320942,
"grad_norm": 0.9007360001653403,
"learning_rate": 7.953328487925744e-05,
"loss": 0.4201,
"step": 145
},
{
"epoch": 0.7236679058240396,
"grad_norm": 0.8717287063452277,
"learning_rate": 7.951187322555525e-05,
"loss": 0.4197,
"step": 146
},
{
"epoch": 0.7286245353159851,
"grad_norm": 0.792698187374001,
"learning_rate": 7.948998438324515e-05,
"loss": 0.4164,
"step": 147
},
{
"epoch": 0.7335811648079306,
"grad_norm": 0.8872820819558791,
"learning_rate": 7.946761861668072e-05,
"loss": 0.4186,
"step": 148
},
{
"epoch": 0.7385377942998761,
"grad_norm": 0.7604504848999115,
"learning_rate": 7.944477619597546e-05,
"loss": 0.4168,
"step": 149
},
{
"epoch": 0.7434944237918215,
"grad_norm": 0.645718499087449,
"learning_rate": 7.94214573969994e-05,
"loss": 0.4191,
"step": 150
},
{
"epoch": 0.748451053283767,
"grad_norm": 1.1186697000537262,
"learning_rate": 7.939766250137589e-05,
"loss": 0.421,
"step": 151
},
{
"epoch": 0.7534076827757125,
"grad_norm": 1.2841332603219227,
"learning_rate": 7.93733917964781e-05,
"loss": 0.4172,
"step": 152
},
{
"epoch": 0.758364312267658,
"grad_norm": 0.8213291321881714,
"learning_rate": 7.934864557542565e-05,
"loss": 0.4158,
"step": 153
},
{
"epoch": 0.7633209417596035,
"grad_norm": 0.8974802788633387,
"learning_rate": 7.932342413708094e-05,
"loss": 0.4096,
"step": 154
},
{
"epoch": 0.7682775712515489,
"grad_norm": 1.2384278066852972,
"learning_rate": 7.929772778604569e-05,
"loss": 0.4175,
"step": 155
},
{
"epoch": 0.7732342007434945,
"grad_norm": 1.0539193472753547,
"learning_rate": 7.927155683265711e-05,
"loss": 0.4075,
"step": 156
},
{
"epoch": 0.7781908302354399,
"grad_norm": 1.176982037154112,
"learning_rate": 7.924491159298429e-05,
"loss": 0.4162,
"step": 157
},
{
"epoch": 0.7831474597273854,
"grad_norm": 1.130671996299273,
"learning_rate": 7.921779238882428e-05,
"loss": 0.4178,
"step": 158
},
{
"epoch": 0.7881040892193308,
"grad_norm": 0.8096653915853804,
"learning_rate": 7.919019954769828e-05,
"loss": 0.4087,
"step": 159
},
{
"epoch": 0.7930607187112764,
"grad_norm": 0.9599899411922913,
"learning_rate": 7.916213340284759e-05,
"loss": 0.4162,
"step": 160
},
{
"epoch": 0.7980173482032218,
"grad_norm": 0.8001378357264783,
"learning_rate": 7.913359429322972e-05,
"loss": 0.4122,
"step": 161
},
{
"epoch": 0.8029739776951673,
"grad_norm": 0.6305011813200033,
"learning_rate": 7.910458256351416e-05,
"loss": 0.4053,
"step": 162
},
{
"epoch": 0.8079306071871127,
"grad_norm": 0.9703142661268936,
"learning_rate": 7.907509856407828e-05,
"loss": 0.4066,
"step": 163
},
{
"epoch": 0.8128872366790583,
"grad_norm": 1.1698912536676753,
"learning_rate": 7.904514265100315e-05,
"loss": 0.4131,
"step": 164
},
{
"epoch": 0.8178438661710037,
"grad_norm": 0.936435424097582,
"learning_rate": 7.901471518606913e-05,
"loss": 0.4125,
"step": 165
},
{
"epoch": 0.8228004956629492,
"grad_norm": 1.2575701101961008,
"learning_rate": 7.898381653675158e-05,
"loss": 0.4078,
"step": 166
},
{
"epoch": 0.8277571251548946,
"grad_norm": 0.9355459201867814,
"learning_rate": 7.895244707621638e-05,
"loss": 0.4096,
"step": 167
},
{
"epoch": 0.8327137546468402,
"grad_norm": 0.9651532815403727,
"learning_rate": 7.892060718331546e-05,
"loss": 0.4123,
"step": 168
},
{
"epoch": 0.8376703841387856,
"grad_norm": 0.9789444392587163,
"learning_rate": 7.888829724258221e-05,
"loss": 0.4103,
"step": 169
},
{
"epoch": 0.8426270136307311,
"grad_norm": 1.257987990016476,
"learning_rate": 7.885551764422682e-05,
"loss": 0.4133,
"step": 170
},
{
"epoch": 0.8475836431226765,
"grad_norm": 0.8590261676536683,
"learning_rate": 7.882226878413157e-05,
"loss": 0.4124,
"step": 171
},
{
"epoch": 0.8525402726146221,
"grad_norm": 0.7848985828871553,
"learning_rate": 7.878855106384608e-05,
"loss": 0.4091,
"step": 172
},
{
"epoch": 0.8574969021065675,
"grad_norm": 0.8217556335290177,
"learning_rate": 7.875436489058243e-05,
"loss": 0.4069,
"step": 173
},
{
"epoch": 0.862453531598513,
"grad_norm": 0.8421940106759833,
"learning_rate": 7.871971067721024e-05,
"loss": 0.4079,
"step": 174
},
{
"epoch": 0.8674101610904585,
"grad_norm": 0.732338688120224,
"learning_rate": 7.86845888422517e-05,
"loss": 0.4122,
"step": 175
},
{
"epoch": 0.872366790582404,
"grad_norm": 0.760792326142143,
"learning_rate": 7.864899980987654e-05,
"loss": 0.4027,
"step": 176
},
{
"epoch": 0.8773234200743495,
"grad_norm": 0.802406665807114,
"learning_rate": 7.861294400989681e-05,
"loss": 0.4012,
"step": 177
},
{
"epoch": 0.8822800495662949,
"grad_norm": 1.1214748433312076,
"learning_rate": 7.857642187776182e-05,
"loss": 0.4042,
"step": 178
},
{
"epoch": 0.8872366790582404,
"grad_norm": 1.264405893330972,
"learning_rate": 7.85394338545528e-05,
"loss": 0.4036,
"step": 179
},
{
"epoch": 0.8921933085501859,
"grad_norm": 0.9654797671767958,
"learning_rate": 7.850198038697756e-05,
"loss": 0.405,
"step": 180
},
{
"epoch": 0.8971499380421314,
"grad_norm": 0.7820135181375528,
"learning_rate": 7.846406192736517e-05,
"loss": 0.4022,
"step": 181
},
{
"epoch": 0.9021065675340768,
"grad_norm": 0.608588052923434,
"learning_rate": 7.842567893366043e-05,
"loss": 0.3952,
"step": 182
},
{
"epoch": 0.9070631970260223,
"grad_norm": 0.7415828590929223,
"learning_rate": 7.838683186941835e-05,
"loss": 0.4087,
"step": 183
},
{
"epoch": 0.9120198265179678,
"grad_norm": 0.8731014126615367,
"learning_rate": 7.834752120379857e-05,
"loss": 0.4035,
"step": 184
},
{
"epoch": 0.9169764560099133,
"grad_norm": 1.087019749607736,
"learning_rate": 7.830774741155975e-05,
"loss": 0.4125,
"step": 185
},
{
"epoch": 0.9219330855018587,
"grad_norm": 0.8865327294603108,
"learning_rate": 7.826751097305367e-05,
"loss": 0.4006,
"step": 186
},
{
"epoch": 0.9268897149938042,
"grad_norm": 0.6344821453740659,
"learning_rate": 7.822681237421956e-05,
"loss": 0.399,
"step": 187
},
{
"epoch": 0.9318463444857497,
"grad_norm": 0.7187394053419348,
"learning_rate": 7.818565210657827e-05,
"loss": 0.3977,
"step": 188
},
{
"epoch": 0.9368029739776952,
"grad_norm": 1.0404564829171956,
"learning_rate": 7.814403066722622e-05,
"loss": 0.4067,
"step": 189
},
{
"epoch": 0.9417596034696406,
"grad_norm": 1.0665407812935424,
"learning_rate": 7.810194855882943e-05,
"loss": 0.4086,
"step": 190
},
{
"epoch": 0.9467162329615861,
"grad_norm": 1.1577211948919968,
"learning_rate": 7.805940628961747e-05,
"loss": 0.4073,
"step": 191
},
{
"epoch": 0.9516728624535316,
"grad_norm": 0.998880858399891,
"learning_rate": 7.801640437337736e-05,
"loss": 0.4019,
"step": 192
},
{
"epoch": 0.9566294919454771,
"grad_norm": 0.9782567188381105,
"learning_rate": 7.797294332944725e-05,
"loss": 0.4045,
"step": 193
},
{
"epoch": 0.9615861214374225,
"grad_norm": 0.8665875399307627,
"learning_rate": 7.792902368271027e-05,
"loss": 0.3992,
"step": 194
},
{
"epoch": 0.966542750929368,
"grad_norm": 0.8017539697149025,
"learning_rate": 7.788464596358811e-05,
"loss": 0.4028,
"step": 195
},
{
"epoch": 0.9714993804213135,
"grad_norm": 1.021352081272535,
"learning_rate": 7.783981070803469e-05,
"loss": 0.4088,
"step": 196
},
{
"epoch": 0.976456009913259,
"grad_norm": 1.220616641821274,
"learning_rate": 7.779451845752957e-05,
"loss": 0.4121,
"step": 197
},
{
"epoch": 0.9814126394052045,
"grad_norm": 0.6851459878038231,
"learning_rate": 7.774876975907154e-05,
"loss": 0.3978,
"step": 198
},
{
"epoch": 0.9863692688971499,
"grad_norm": 0.8256154983716247,
"learning_rate": 7.770256516517191e-05,
"loss": 0.4006,
"step": 199
},
{
"epoch": 0.9913258983890955,
"grad_norm": 1.02233854118449,
"learning_rate": 7.765590523384794e-05,
"loss": 0.406,
"step": 200
},
{
"epoch": 0.9962825278810409,
"grad_norm": 1.2566526671400393,
"learning_rate": 7.760879052861596e-05,
"loss": 0.3971,
"step": 201
},
{
"epoch": 1.0012391573729864,
"grad_norm": 1.0197719096595084,
"learning_rate": 7.756122161848474e-05,
"loss": 0.4958,
"step": 202
},
{
"epoch": 1.0061957868649318,
"grad_norm": 1.2508655704855365,
"learning_rate": 7.751319907794846e-05,
"loss": 0.3909,
"step": 203
},
{
"epoch": 1.0111524163568772,
"grad_norm": 1.0070567867985618,
"learning_rate": 7.746472348697987e-05,
"loss": 0.3833,
"step": 204
},
{
"epoch": 1.016109045848823,
"grad_norm": 1.1597304198121792,
"learning_rate": 7.74157954310232e-05,
"loss": 0.3881,
"step": 205
},
{
"epoch": 1.0210656753407683,
"grad_norm": 0.9662929035122743,
"learning_rate": 7.736641550098724e-05,
"loss": 0.3848,
"step": 206
},
{
"epoch": 1.0260223048327137,
"grad_norm": 1.0635323036573268,
"learning_rate": 7.731658429323801e-05,
"loss": 0.3874,
"step": 207
},
{
"epoch": 1.0309789343246591,
"grad_norm": 0.8185655718509083,
"learning_rate": 7.72663024095917e-05,
"loss": 0.38,
"step": 208
},
{
"epoch": 1.0359355638166048,
"grad_norm": 1.147797704777796,
"learning_rate": 7.721557045730734e-05,
"loss": 0.3791,
"step": 209
},
{
"epoch": 1.0408921933085502,
"grad_norm": 0.6711907484138894,
"learning_rate": 7.71643890490795e-05,
"loss": 0.3759,
"step": 210
},
{
"epoch": 1.0458488228004956,
"grad_norm": 0.767288350635738,
"learning_rate": 7.711275880303087e-05,
"loss": 0.3857,
"step": 211
},
{
"epoch": 1.050805452292441,
"grad_norm": 0.7110584540901675,
"learning_rate": 7.706068034270474e-05,
"loss": 0.3805,
"step": 212
},
{
"epoch": 1.0557620817843867,
"grad_norm": 1.1141445015648397,
"learning_rate": 7.700815429705761e-05,
"loss": 0.3722,
"step": 213
},
{
"epoch": 1.060718711276332,
"grad_norm": 0.8830291052163628,
"learning_rate": 7.695518130045147e-05,
"loss": 0.3735,
"step": 214
},
{
"epoch": 1.0656753407682775,
"grad_norm": 0.6334177235159056,
"learning_rate": 7.690176199264617e-05,
"loss": 0.3814,
"step": 215
},
{
"epoch": 1.070631970260223,
"grad_norm": 0.6128048872826931,
"learning_rate": 7.684789701879173e-05,
"loss": 0.3823,
"step": 216
},
{
"epoch": 1.0755885997521686,
"grad_norm": 0.635130178101925,
"learning_rate": 7.679358702942047e-05,
"loss": 0.384,
"step": 217
},
{
"epoch": 1.080545229244114,
"grad_norm": 0.7130440719911276,
"learning_rate": 7.673883268043927e-05,
"loss": 0.3789,
"step": 218
},
{
"epoch": 1.0855018587360594,
"grad_norm": 0.8361125459029056,
"learning_rate": 7.668363463312155e-05,
"loss": 0.3756,
"step": 219
},
{
"epoch": 1.090458488228005,
"grad_norm": 1.0272028699052964,
"learning_rate": 7.662799355409929e-05,
"loss": 0.3794,
"step": 220
},
{
"epoch": 1.0954151177199505,
"grad_norm": 0.959950576279975,
"learning_rate": 7.657191011535505e-05,
"loss": 0.3725,
"step": 221
},
{
"epoch": 1.100371747211896,
"grad_norm": 0.7562311795443518,
"learning_rate": 7.65153849942138e-05,
"loss": 0.3777,
"step": 222
},
{
"epoch": 1.1053283767038413,
"grad_norm": 0.7638923008741431,
"learning_rate": 7.645841887333472e-05,
"loss": 0.377,
"step": 223
},
{
"epoch": 1.110285006195787,
"grad_norm": 0.754375385945502,
"learning_rate": 7.640101244070304e-05,
"loss": 0.3794,
"step": 224
},
{
"epoch": 1.1152416356877324,
"grad_norm": 0.7522586200297396,
"learning_rate": 7.634316638962161e-05,
"loss": 0.379,
"step": 225
},
{
"epoch": 1.1201982651796778,
"grad_norm": 0.700854901575676,
"learning_rate": 7.628488141870266e-05,
"loss": 0.3819,
"step": 226
},
{
"epoch": 1.1251548946716232,
"grad_norm": 0.825909383512544,
"learning_rate": 7.622615823185925e-05,
"loss": 0.379,
"step": 227
},
{
"epoch": 1.1301115241635689,
"grad_norm": 1.0754465578447554,
"learning_rate": 7.616699753829681e-05,
"loss": 0.3834,
"step": 228
},
{
"epoch": 1.1350681536555143,
"grad_norm": 1.1737988185703399,
"learning_rate": 7.610740005250464e-05,
"loss": 0.3826,
"step": 229
},
{
"epoch": 1.1400247831474597,
"grad_norm": 0.5785260728190653,
"learning_rate": 7.604736649424714e-05,
"loss": 0.3788,
"step": 230
},
{
"epoch": 1.1449814126394051,
"grad_norm": 0.6586379291379977,
"learning_rate": 7.598689758855525e-05,
"loss": 0.38,
"step": 231
},
{
"epoch": 1.1499380421313508,
"grad_norm": 0.7869399289494947,
"learning_rate": 7.592599406571763e-05,
"loss": 0.3802,
"step": 232
},
{
"epoch": 1.1548946716232962,
"grad_norm": 0.6729360063352638,
"learning_rate": 7.586465666127187e-05,
"loss": 0.3769,
"step": 233
},
{
"epoch": 1.1598513011152416,
"grad_norm": 0.736581035090937,
"learning_rate": 7.580288611599554e-05,
"loss": 0.3824,
"step": 234
},
{
"epoch": 1.164807930607187,
"grad_norm": 0.6709797895211724,
"learning_rate": 7.574068317589734e-05,
"loss": 0.3775,
"step": 235
},
{
"epoch": 1.1697645600991327,
"grad_norm": 0.6148419568958634,
"learning_rate": 7.567804859220802e-05,
"loss": 0.3776,
"step": 236
},
{
"epoch": 1.174721189591078,
"grad_norm": 0.6467262658722034,
"learning_rate": 7.561498312137135e-05,
"loss": 0.3755,
"step": 237
},
{
"epoch": 1.1796778190830235,
"grad_norm": 0.5461086867691934,
"learning_rate": 7.555148752503495e-05,
"loss": 0.3828,
"step": 238
},
{
"epoch": 1.1846344485749691,
"grad_norm": 0.4599155278425996,
"learning_rate": 7.548756257004108e-05,
"loss": 0.382,
"step": 239
},
{
"epoch": 1.1895910780669146,
"grad_norm": 0.46455442182053813,
"learning_rate": 7.542320902841746e-05,
"loss": 0.3768,
"step": 240
},
{
"epoch": 1.19454770755886,
"grad_norm": 0.34719408842324345,
"learning_rate": 7.535842767736784e-05,
"loss": 0.3768,
"step": 241
},
{
"epoch": 1.1995043370508054,
"grad_norm": 0.37653138917342843,
"learning_rate": 7.52932192992627e-05,
"loss": 0.376,
"step": 242
},
{
"epoch": 1.2044609665427508,
"grad_norm": 0.41966657762815895,
"learning_rate": 7.522758468162975e-05,
"loss": 0.3815,
"step": 243
},
{
"epoch": 1.2094175960346965,
"grad_norm": 0.3618542297929767,
"learning_rate": 7.516152461714445e-05,
"loss": 0.3765,
"step": 244
},
{
"epoch": 1.2143742255266419,
"grad_norm": 0.5657649815473641,
"learning_rate": 7.509503990362039e-05,
"loss": 0.378,
"step": 245
},
{
"epoch": 1.2193308550185873,
"grad_norm": 0.6892489845465257,
"learning_rate": 7.502813134399974e-05,
"loss": 0.3792,
"step": 246
},
{
"epoch": 1.224287484510533,
"grad_norm": 0.727166972122069,
"learning_rate": 7.496079974634342e-05,
"loss": 0.3745,
"step": 247
},
{
"epoch": 1.2292441140024783,
"grad_norm": 0.8477735977905793,
"learning_rate": 7.48930459238215e-05,
"loss": 0.3803,
"step": 248
},
{
"epoch": 1.2342007434944238,
"grad_norm": 1.1470825347340907,
"learning_rate": 7.482487069470325e-05,
"loss": 0.3783,
"step": 249
},
{
"epoch": 1.2391573729863692,
"grad_norm": 0.9004187144612258,
"learning_rate": 7.475627488234733e-05,
"loss": 0.3756,
"step": 250
},
{
"epoch": 1.2441140024783148,
"grad_norm": 0.7971542884599199,
"learning_rate": 7.46872593151918e-05,
"loss": 0.3765,
"step": 251
},
{
"epoch": 1.2490706319702602,
"grad_norm": 0.9413192899083336,
"learning_rate": 7.461782482674417e-05,
"loss": 0.3752,
"step": 252
},
{
"epoch": 1.2540272614622057,
"grad_norm": 1.0620179749799983,
"learning_rate": 7.454797225557129e-05,
"loss": 0.3829,
"step": 253
},
{
"epoch": 1.258983890954151,
"grad_norm": 0.9188902988306181,
"learning_rate": 7.44777024452892e-05,
"loss": 0.3811,
"step": 254
},
{
"epoch": 1.2639405204460967,
"grad_norm": 0.76176279520022,
"learning_rate": 7.440701624455303e-05,
"loss": 0.3781,
"step": 255
},
{
"epoch": 1.2688971499380421,
"grad_norm": 0.7176306699252524,
"learning_rate": 7.433591450704667e-05,
"loss": 0.3726,
"step": 256
},
{
"epoch": 1.2738537794299876,
"grad_norm": 0.8007478629584447,
"learning_rate": 7.426439809147247e-05,
"loss": 0.3776,
"step": 257
},
{
"epoch": 1.2788104089219332,
"grad_norm": 0.9843733510756626,
"learning_rate": 7.419246786154094e-05,
"loss": 0.3735,
"step": 258
},
{
"epoch": 1.2837670384138786,
"grad_norm": 0.9162149531984846,
"learning_rate": 7.412012468596022e-05,
"loss": 0.3806,
"step": 259
},
{
"epoch": 1.288723667905824,
"grad_norm": 0.6176604637207814,
"learning_rate": 7.404736943842562e-05,
"loss": 0.3769,
"step": 260
},
{
"epoch": 1.2936802973977695,
"grad_norm": 0.5205293597911049,
"learning_rate": 7.397420299760911e-05,
"loss": 0.3743,
"step": 261
},
{
"epoch": 1.2986369268897149,
"grad_norm": 0.49261144114942623,
"learning_rate": 7.39006262471487e-05,
"loss": 0.3758,
"step": 262
},
{
"epoch": 1.3035935563816605,
"grad_norm": 0.4679036077993508,
"learning_rate": 7.38266400756377e-05,
"loss": 0.379,
"step": 263
},
{
"epoch": 1.308550185873606,
"grad_norm": 0.5193217780337118,
"learning_rate": 7.375224537661407e-05,
"loss": 0.3731,
"step": 264
},
{
"epoch": 1.3135068153655514,
"grad_norm": 0.5536487287786134,
"learning_rate": 7.367744304854958e-05,
"loss": 0.3736,
"step": 265
},
{
"epoch": 1.318463444857497,
"grad_norm": 0.4743844504481324,
"learning_rate": 7.360223399483897e-05,
"loss": 0.3763,
"step": 266
},
{
"epoch": 1.3234200743494424,
"grad_norm": 0.45973522881542617,
"learning_rate": 7.352661912378909e-05,
"loss": 0.373,
"step": 267
},
{
"epoch": 1.3283767038413878,
"grad_norm": 0.48786805257589877,
"learning_rate": 7.34505993486078e-05,
"loss": 0.3779,
"step": 268
},
{
"epoch": 1.3333333333333333,
"grad_norm": 0.46082909055767357,
"learning_rate": 7.33741755873931e-05,
"loss": 0.3726,
"step": 269
},
{
"epoch": 1.3382899628252787,
"grad_norm": 0.40480055483099847,
"learning_rate": 7.329734876312192e-05,
"loss": 0.3683,
"step": 270
},
{
"epoch": 1.3432465923172243,
"grad_norm": 0.5260288211761569,
"learning_rate": 7.322011980363908e-05,
"loss": 0.375,
"step": 271
},
{
"epoch": 1.3482032218091697,
"grad_norm": 0.8000585333697147,
"learning_rate": 7.314248964164594e-05,
"loss": 0.3812,
"step": 272
},
{
"epoch": 1.3531598513011152,
"grad_norm": 1.1176749895674423,
"learning_rate": 7.306445921468928e-05,
"loss": 0.3784,
"step": 273
},
{
"epoch": 1.3581164807930608,
"grad_norm": 0.8643033686238092,
"learning_rate": 7.29860294651499e-05,
"loss": 0.3765,
"step": 274
},
{
"epoch": 1.3630731102850062,
"grad_norm": 0.6961833389784987,
"learning_rate": 7.290720134023128e-05,
"loss": 0.3793,
"step": 275
},
{
"epoch": 1.3680297397769516,
"grad_norm": 0.8897368708021905,
"learning_rate": 7.282797579194808e-05,
"loss": 0.3764,
"step": 276
},
{
"epoch": 1.3729863692688973,
"grad_norm": 1.157059505215818,
"learning_rate": 7.27483537771147e-05,
"loss": 0.3835,
"step": 277
},
{
"epoch": 1.3779429987608427,
"grad_norm": 0.9009906161404626,
"learning_rate": 7.26683362573337e-05,
"loss": 0.3766,
"step": 278
},
{
"epoch": 1.3828996282527881,
"grad_norm": 0.745181325850118,
"learning_rate": 7.258792419898416e-05,
"loss": 0.3759,
"step": 279
},
{
"epoch": 1.3878562577447335,
"grad_norm": 0.6749818636165081,
"learning_rate": 7.250711857321012e-05,
"loss": 0.3744,
"step": 280
},
{
"epoch": 1.392812887236679,
"grad_norm": 0.5923790849381074,
"learning_rate": 7.24259203559087e-05,
"loss": 0.3751,
"step": 281
},
{
"epoch": 1.3977695167286246,
"grad_norm": 0.8935001689053471,
"learning_rate": 7.234433052771838e-05,
"loss": 0.3773,
"step": 282
},
{
"epoch": 1.40272614622057,
"grad_norm": 1.0946320595396666,
"learning_rate": 7.226235007400722e-05,
"loss": 0.3729,
"step": 283
},
{
"epoch": 1.4076827757125154,
"grad_norm": 0.6024341847073685,
"learning_rate": 7.217997998486085e-05,
"loss": 0.3725,
"step": 284
},
{
"epoch": 1.412639405204461,
"grad_norm": 0.4608665149912086,
"learning_rate": 7.209722125507059e-05,
"loss": 0.3716,
"step": 285
},
{
"epoch": 1.4175960346964065,
"grad_norm": 0.7816845006015856,
"learning_rate": 7.20140748841214e-05,
"loss": 0.3753,
"step": 286
},
{
"epoch": 1.422552664188352,
"grad_norm": 0.8396407484972205,
"learning_rate": 7.19305418761798e-05,
"loss": 0.3754,
"step": 287
},
{
"epoch": 1.4275092936802973,
"grad_norm": 0.6099391657297455,
"learning_rate": 7.18466232400818e-05,
"loss": 0.3693,
"step": 288
},
{
"epoch": 1.4324659231722427,
"grad_norm": 0.40684831008388006,
"learning_rate": 7.176231998932067e-05,
"loss": 0.3725,
"step": 289
},
{
"epoch": 1.4374225526641884,
"grad_norm": 0.42586071135075304,
"learning_rate": 7.167763314203465e-05,
"loss": 0.3744,
"step": 290
},
{
"epoch": 1.4423791821561338,
"grad_norm": 0.43053523382268793,
"learning_rate": 7.159256372099481e-05,
"loss": 0.3708,
"step": 291
},
{
"epoch": 1.4473358116480792,
"grad_norm": 0.4113030106637991,
"learning_rate": 7.150711275359252e-05,
"loss": 0.3766,
"step": 292
},
{
"epoch": 1.4522924411400249,
"grad_norm": 0.4387752897731675,
"learning_rate": 7.142128127182714e-05,
"loss": 0.3696,
"step": 293
},
{
"epoch": 1.4572490706319703,
"grad_norm": 0.4738786254884929,
"learning_rate": 7.133507031229358e-05,
"loss": 0.3701,
"step": 294
},
{
"epoch": 1.4622057001239157,
"grad_norm": 0.5336855316791109,
"learning_rate": 7.124848091616966e-05,
"loss": 0.3766,
"step": 295
},
{
"epoch": 1.4671623296158613,
"grad_norm": 0.525640287895575,
"learning_rate": 7.116151412920369e-05,
"loss": 0.3691,
"step": 296
},
{
"epoch": 1.4721189591078068,
"grad_norm": 0.6208954999535536,
"learning_rate": 7.107417100170171e-05,
"loss": 0.3735,
"step": 297
},
{
"epoch": 1.4770755885997522,
"grad_norm": 0.6750123425630558,
"learning_rate": 7.098645258851491e-05,
"loss": 0.3732,
"step": 298
},
{
"epoch": 1.4820322180916976,
"grad_norm": 0.6877081965958101,
"learning_rate": 7.089835994902679e-05,
"loss": 0.3698,
"step": 299
},
{
"epoch": 1.486988847583643,
"grad_norm": 0.7156301301268085,
"learning_rate": 7.080989414714045e-05,
"loss": 0.3741,
"step": 300
},
{
"epoch": 1.4919454770755887,
"grad_norm": 0.7147095744659843,
"learning_rate": 7.072105625126568e-05,
"loss": 0.3678,
"step": 301
},
{
"epoch": 1.496902106567534,
"grad_norm": 0.7610853993131405,
"learning_rate": 7.063184733430615e-05,
"loss": 0.3697,
"step": 302
},
{
"epoch": 1.5018587360594795,
"grad_norm": 0.9061730193624111,
"learning_rate": 7.054226847364627e-05,
"loss": 0.3756,
"step": 303
},
{
"epoch": 1.5068153655514251,
"grad_norm": 1.0523239987620783,
"learning_rate": 7.045232075113841e-05,
"loss": 0.3746,
"step": 304
},
{
"epoch": 1.5117719950433703,
"grad_norm": 0.9900385124301968,
"learning_rate": 7.036200525308966e-05,
"loss": 0.3686,
"step": 305
},
{
"epoch": 1.516728624535316,
"grad_norm": 0.9815673674275408,
"learning_rate": 7.02713230702488e-05,
"loss": 0.3746,
"step": 306
},
{
"epoch": 1.5216852540272616,
"grad_norm": 0.84313540646779,
"learning_rate": 7.018027529779302e-05,
"loss": 0.3728,
"step": 307
},
{
"epoch": 1.5266418835192068,
"grad_norm": 0.4618784111325805,
"learning_rate": 7.00888630353149e-05,
"loss": 0.3798,
"step": 308
},
{
"epoch": 1.5315985130111525,
"grad_norm": 0.41703424984720067,
"learning_rate": 6.999708738680886e-05,
"loss": 0.3689,
"step": 309
},
{
"epoch": 1.5365551425030979,
"grad_norm": 0.5913215667695161,
"learning_rate": 6.99049494606581e-05,
"loss": 0.372,
"step": 310
},
{
"epoch": 1.5415117719950433,
"grad_norm": 0.697528011758827,
"learning_rate": 6.981245036962098e-05,
"loss": 0.3741,
"step": 311
},
{
"epoch": 1.546468401486989,
"grad_norm": 0.5699378604300869,
"learning_rate": 6.971959123081771e-05,
"loss": 0.3723,
"step": 312
},
{
"epoch": 1.5514250309789344,
"grad_norm": 0.4182368238977319,
"learning_rate": 6.962637316571687e-05,
"loss": 0.3715,
"step": 313
},
{
"epoch": 1.5563816604708798,
"grad_norm": 0.37941077478326063,
"learning_rate": 6.95327973001218e-05,
"loss": 0.3676,
"step": 314
},
{
"epoch": 1.5613382899628254,
"grad_norm": 0.46011385805195815,
"learning_rate": 6.943886476415699e-05,
"loss": 0.3706,
"step": 315
},
{
"epoch": 1.5662949194547706,
"grad_norm": 0.5885696797756296,
"learning_rate": 6.934457669225454e-05,
"loss": 0.375,
"step": 316
},
{
"epoch": 1.5712515489467163,
"grad_norm": 0.6601349168571249,
"learning_rate": 6.924993422314035e-05,
"loss": 0.3754,
"step": 317
},
{
"epoch": 1.5762081784386617,
"grad_norm": 0.746944923527877,
"learning_rate": 6.915493849982041e-05,
"loss": 0.3695,
"step": 318
},
{
"epoch": 1.581164807930607,
"grad_norm": 0.7143244562614945,
"learning_rate": 6.905959066956703e-05,
"loss": 0.3723,
"step": 319
},
{
"epoch": 1.5861214374225527,
"grad_norm": 0.4858977001025183,
"learning_rate": 6.896389188390488e-05,
"loss": 0.3661,
"step": 320
},
{
"epoch": 1.5910780669144982,
"grad_norm": 0.3903518503904867,
"learning_rate": 6.886784329859722e-05,
"loss": 0.3659,
"step": 321
},
{
"epoch": 1.5960346964064436,
"grad_norm": 0.5780590658779307,
"learning_rate": 6.877144607363183e-05,
"loss": 0.368,
"step": 322
},
{
"epoch": 1.6009913258983892,
"grad_norm": 0.6094396527936917,
"learning_rate": 6.867470137320707e-05,
"loss": 0.3751,
"step": 323
},
{
"epoch": 1.6059479553903344,
"grad_norm": 0.5299948076513158,
"learning_rate": 6.857761036571778e-05,
"loss": 0.3701,
"step": 324
},
{
"epoch": 1.61090458488228,
"grad_norm": 0.32554048958176607,
"learning_rate": 6.84801742237412e-05,
"loss": 0.3658,
"step": 325
},
{
"epoch": 1.6158612143742255,
"grad_norm": 0.3917198928259944,
"learning_rate": 6.838239412402275e-05,
"loss": 0.3663,
"step": 326
},
{
"epoch": 1.620817843866171,
"grad_norm": 0.580429864868858,
"learning_rate": 6.828427124746191e-05,
"loss": 0.3711,
"step": 327
},
{
"epoch": 1.6257744733581165,
"grad_norm": 0.4122977155429013,
"learning_rate": 6.818580677909787e-05,
"loss": 0.3667,
"step": 328
},
{
"epoch": 1.630731102850062,
"grad_norm": 0.3311640599067616,
"learning_rate": 6.808700190809529e-05,
"loss": 0.3653,
"step": 329
},
{
"epoch": 1.6356877323420074,
"grad_norm": 0.4555478935608727,
"learning_rate": 6.798785782772987e-05,
"loss": 0.3695,
"step": 330
},
{
"epoch": 1.640644361833953,
"grad_norm": 0.5386186943934842,
"learning_rate": 6.7888375735374e-05,
"loss": 0.3715,
"step": 331
},
{
"epoch": 1.6456009913258984,
"grad_norm": 0.5119260829822706,
"learning_rate": 6.778855683248226e-05,
"loss": 0.3678,
"step": 332
},
{
"epoch": 1.6505576208178439,
"grad_norm": 0.4174256955489929,
"learning_rate": 6.768840232457691e-05,
"loss": 0.3701,
"step": 333
},
{
"epoch": 1.6555142503097895,
"grad_norm": 0.4330935030350318,
"learning_rate": 6.75879134212334e-05,
"loss": 0.3647,
"step": 334
},
{
"epoch": 1.6604708798017347,
"grad_norm": 0.4977143818569726,
"learning_rate": 6.748709133606562e-05,
"loss": 0.3645,
"step": 335
},
{
"epoch": 1.6654275092936803,
"grad_norm": 0.4632344110293173,
"learning_rate": 6.738593728671141e-05,
"loss": 0.3688,
"step": 336
},
{
"epoch": 1.6703841387856257,
"grad_norm": 0.4343232693352364,
"learning_rate": 6.72844524948177e-05,
"loss": 0.3716,
"step": 337
},
{
"epoch": 1.6753407682775712,
"grad_norm": 0.5497224652685649,
"learning_rate": 6.718263818602587e-05,
"loss": 0.3687,
"step": 338
},
{
"epoch": 1.6802973977695168,
"grad_norm": 0.6580306405235301,
"learning_rate": 6.708049558995692e-05,
"loss": 0.3668,
"step": 339
},
{
"epoch": 1.6852540272614622,
"grad_norm": 0.6272479044564085,
"learning_rate": 6.697802594019659e-05,
"loss": 0.3736,
"step": 340
},
{
"epoch": 1.6902106567534076,
"grad_norm": 0.49972287263472315,
"learning_rate": 6.687523047428043e-05,
"loss": 0.3721,
"step": 341
},
{
"epoch": 1.6951672862453533,
"grad_norm": 0.4860929606946379,
"learning_rate": 6.677211043367898e-05,
"loss": 0.3691,
"step": 342
},
{
"epoch": 1.7001239157372985,
"grad_norm": 0.5528988110266627,
"learning_rate": 6.666866706378268e-05,
"loss": 0.3663,
"step": 343
},
{
"epoch": 1.7050805452292441,
"grad_norm": 0.5931778952264959,
"learning_rate": 6.65649016138868e-05,
"loss": 0.367,
"step": 344
},
{
"epoch": 1.7100371747211895,
"grad_norm": 0.6219580459440871,
"learning_rate": 6.646081533717647e-05,
"loss": 0.3709,
"step": 345
},
{
"epoch": 1.714993804213135,
"grad_norm": 0.62122822614519,
"learning_rate": 6.635640949071141e-05,
"loss": 0.3682,
"step": 346
},
{
"epoch": 1.7199504337050806,
"grad_norm": 0.6383264360282018,
"learning_rate": 6.625168533541086e-05,
"loss": 0.3759,
"step": 347
},
{
"epoch": 1.724907063197026,
"grad_norm": 0.7086745256179291,
"learning_rate": 6.614664413603826e-05,
"loss": 0.3743,
"step": 348
},
{
"epoch": 1.7298636926889714,
"grad_norm": 0.7914076334715233,
"learning_rate": 6.60412871611861e-05,
"loss": 0.366,
"step": 349
},
{
"epoch": 1.734820322180917,
"grad_norm": 0.6800636113803615,
"learning_rate": 6.593561568326045e-05,
"loss": 0.3653,
"step": 350
},
{
"epoch": 1.7397769516728625,
"grad_norm": 0.45351689701921466,
"learning_rate": 6.582963097846567e-05,
"loss": 0.3671,
"step": 351
},
{
"epoch": 1.744733581164808,
"grad_norm": 0.46229129788058876,
"learning_rate": 6.572333432678901e-05,
"loss": 0.3675,
"step": 352
},
{
"epoch": 1.7496902106567536,
"grad_norm": 0.5621458743860188,
"learning_rate": 6.561672701198515e-05,
"loss": 0.3709,
"step": 353
},
{
"epoch": 1.7546468401486988,
"grad_norm": 0.6591816312736469,
"learning_rate": 6.550981032156064e-05,
"loss": 0.3678,
"step": 354
},
{
"epoch": 1.7596034696406444,
"grad_norm": 0.858815006302805,
"learning_rate": 6.540258554675837e-05,
"loss": 0.3687,
"step": 355
},
{
"epoch": 1.7645600991325898,
"grad_norm": 1.0641612393607265,
"learning_rate": 6.529505398254209e-05,
"loss": 0.3682,
"step": 356
},
{
"epoch": 1.7695167286245352,
"grad_norm": 0.8476056220579151,
"learning_rate": 6.518721692758056e-05,
"loss": 0.3692,
"step": 357
},
{
"epoch": 1.7744733581164809,
"grad_norm": 0.5833411072292431,
"learning_rate": 6.507907568423205e-05,
"loss": 0.3727,
"step": 358
},
{
"epoch": 1.7794299876084263,
"grad_norm": 0.39389932942655925,
"learning_rate": 6.497063155852853e-05,
"loss": 0.373,
"step": 359
},
{
"epoch": 1.7843866171003717,
"grad_norm": 0.4232182241553237,
"learning_rate": 6.48618858601599e-05,
"loss": 0.3761,
"step": 360
},
{
"epoch": 1.7893432465923174,
"grad_norm": 0.6103087106854266,
"learning_rate": 6.475283990245816e-05,
"loss": 0.3695,
"step": 361
},
{
"epoch": 1.7942998760842626,
"grad_norm": 0.6002125693210419,
"learning_rate": 6.464349500238162e-05,
"loss": 0.3682,
"step": 362
},
{
"epoch": 1.7992565055762082,
"grad_norm": 0.4564158999485684,
"learning_rate": 6.453385248049893e-05,
"loss": 0.3689,
"step": 363
},
{
"epoch": 1.8042131350681536,
"grad_norm": 0.3973850442566215,
"learning_rate": 6.44239136609731e-05,
"loss": 0.3684,
"step": 364
},
{
"epoch": 1.809169764560099,
"grad_norm": 0.3722004844867567,
"learning_rate": 6.431367987154561e-05,
"loss": 0.3597,
"step": 365
},
{
"epoch": 1.8141263940520447,
"grad_norm": 0.36135251217138775,
"learning_rate": 6.42031524435203e-05,
"loss": 0.3666,
"step": 366
},
{
"epoch": 1.81908302354399,
"grad_norm": 0.42456457513430096,
"learning_rate": 6.40923327117473e-05,
"loss": 0.3692,
"step": 367
},
{
"epoch": 1.8240396530359355,
"grad_norm": 0.34018392891314136,
"learning_rate": 6.398122201460694e-05,
"loss": 0.3582,
"step": 368
},
{
"epoch": 1.8289962825278812,
"grad_norm": 0.34702535248917055,
"learning_rate": 6.386982169399355e-05,
"loss": 0.362,
"step": 369
},
{
"epoch": 1.8339529120198264,
"grad_norm": 0.37222768584462623,
"learning_rate": 6.375813309529929e-05,
"loss": 0.3693,
"step": 370
},
{
"epoch": 1.838909541511772,
"grad_norm": 0.37704314430735963,
"learning_rate": 6.364615756739784e-05,
"loss": 0.3644,
"step": 371
},
{
"epoch": 1.8438661710037176,
"grad_norm": 0.3614680716279667,
"learning_rate": 6.353389646262823e-05,
"loss": 0.3638,
"step": 372
},
{
"epoch": 1.8488228004956628,
"grad_norm": 0.3341638078127682,
"learning_rate": 6.34213511367783e-05,
"loss": 0.3635,
"step": 373
},
{
"epoch": 1.8537794299876085,
"grad_norm": 0.37396523466792747,
"learning_rate": 6.330852294906861e-05,
"loss": 0.3631,
"step": 374
},
{
"epoch": 1.858736059479554,
"grad_norm": 0.4237992579587528,
"learning_rate": 6.319541326213573e-05,
"loss": 0.3628,
"step": 375
},
{
"epoch": 1.8636926889714993,
"grad_norm": 0.40425255099410395,
"learning_rate": 6.308202344201602e-05,
"loss": 0.3653,
"step": 376
},
{
"epoch": 1.868649318463445,
"grad_norm": 0.3209955179999364,
"learning_rate": 6.296835485812899e-05,
"loss": 0.3692,
"step": 377
},
{
"epoch": 1.8736059479553904,
"grad_norm": 0.3253660638473066,
"learning_rate": 6.285440888326082e-05,
"loss": 0.3612,
"step": 378
},
{
"epoch": 1.8785625774473358,
"grad_norm": 0.3271459226873318,
"learning_rate": 6.274018689354776e-05,
"loss": 0.3706,
"step": 379
},
{
"epoch": 1.8835192069392814,
"grad_norm": 0.3149746962127161,
"learning_rate": 6.262569026845949e-05,
"loss": 0.3696,
"step": 380
},
{
"epoch": 1.8884758364312266,
"grad_norm": 0.30573145047857164,
"learning_rate": 6.251092039078256e-05,
"loss": 0.3675,
"step": 381
},
{
"epoch": 1.8934324659231723,
"grad_norm": 0.3047375886031863,
"learning_rate": 6.239587864660354e-05,
"loss": 0.3637,
"step": 382
},
{
"epoch": 1.8983890954151177,
"grad_norm": 0.3941393217498381,
"learning_rate": 6.228056642529242e-05,
"loss": 0.369,
"step": 383
},
{
"epoch": 1.903345724907063,
"grad_norm": 0.5259788343113935,
"learning_rate": 6.216498511948572e-05,
"loss": 0.3715,
"step": 384
},
{
"epoch": 1.9083023543990087,
"grad_norm": 0.6494740207626633,
"learning_rate": 6.204913612506975e-05,
"loss": 0.3606,
"step": 385
},
{
"epoch": 1.9132589838909542,
"grad_norm": 0.7041520852135289,
"learning_rate": 6.193302084116368e-05,
"loss": 0.3641,
"step": 386
},
{
"epoch": 1.9182156133828996,
"grad_norm": 0.7602483900224043,
"learning_rate": 6.181664067010275e-05,
"loss": 0.3621,
"step": 387
},
{
"epoch": 1.9231722428748452,
"grad_norm": 0.7899041502702561,
"learning_rate": 6.169999701742118e-05,
"loss": 0.3671,
"step": 388
},
{
"epoch": 1.9281288723667904,
"grad_norm": 0.7030444055558277,
"learning_rate": 6.158309129183538e-05,
"loss": 0.3656,
"step": 389
},
{
"epoch": 1.933085501858736,
"grad_norm": 0.6071958638842148,
"learning_rate": 6.146592490522677e-05,
"loss": 0.3659,
"step": 390
},
{
"epoch": 1.9380421313506815,
"grad_norm": 0.4728524200414151,
"learning_rate": 6.134849927262481e-05,
"loss": 0.3672,
"step": 391
},
{
"epoch": 1.942998760842627,
"grad_norm": 0.3446842094670103,
"learning_rate": 6.123081581218992e-05,
"loss": 0.3649,
"step": 392
},
{
"epoch": 1.9479553903345725,
"grad_norm": 0.4022174305496056,
"learning_rate": 6.11128759451963e-05,
"loss": 0.3673,
"step": 393
},
{
"epoch": 1.952912019826518,
"grad_norm": 0.5636096699586152,
"learning_rate": 6.099468109601483e-05,
"loss": 0.362,
"step": 394
},
{
"epoch": 1.9578686493184634,
"grad_norm": 0.6819652998130323,
"learning_rate": 6.0876232692095794e-05,
"loss": 0.3661,
"step": 395
},
{
"epoch": 1.962825278810409,
"grad_norm": 0.6365953298610997,
"learning_rate": 6.075753216395172e-05,
"loss": 0.3632,
"step": 396
},
{
"epoch": 1.9677819083023544,
"grad_norm": 0.43428749476968237,
"learning_rate": 6.063858094514004e-05,
"loss": 0.3636,
"step": 397
},
{
"epoch": 1.9727385377942999,
"grad_norm": 0.33548805713556684,
"learning_rate": 6.051938047224582e-05,
"loss": 0.3647,
"step": 398
},
{
"epoch": 1.9776951672862455,
"grad_norm": 0.41462999397741335,
"learning_rate": 6.0399932184864356e-05,
"loss": 0.3651,
"step": 399
},
{
"epoch": 1.9826517967781907,
"grad_norm": 0.48226204851258686,
"learning_rate": 6.028023752558387e-05,
"loss": 0.3614,
"step": 400
},
{
"epoch": 1.9876084262701363,
"grad_norm": 0.5751629936810563,
"learning_rate": 6.0160297939968e-05,
"loss": 0.3665,
"step": 401
},
{
"epoch": 1.9925650557620818,
"grad_norm": 0.6779156841941448,
"learning_rate": 6.00401148765384e-05,
"loss": 0.3645,
"step": 402
},
{
"epoch": 1.9975216852540272,
"grad_norm": 0.6950369897000113,
"learning_rate": 5.9919689786757234e-05,
"loss": 0.3954,
"step": 403
},
{
"epoch": 2.002478314745973,
"grad_norm": 0.9229607445545323,
"learning_rate": 5.979902412500963e-05,
"loss": 0.4126,
"step": 404
},
{
"epoch": 2.007434944237918,
"grad_norm": 1.2431445665943202,
"learning_rate": 5.9678119348586115e-05,
"loss": 0.3456,
"step": 405
},
{
"epoch": 2.0123915737298637,
"grad_norm": 0.648853154194552,
"learning_rate": 5.955697691766507e-05,
"loss": 0.3418,
"step": 406
},
{
"epoch": 2.0173482032218093,
"grad_norm": 0.579480891477055,
"learning_rate": 5.943559829529501e-05,
"loss": 0.334,
"step": 407
},
{
"epoch": 2.0223048327137545,
"grad_norm": 0.8551507180871017,
"learning_rate": 5.931398494737696e-05,
"loss": 0.3376,
"step": 408
},
{
"epoch": 2.0272614622057,
"grad_norm": 0.7020237179868006,
"learning_rate": 5.9192138342646785e-05,
"loss": 0.34,
"step": 409
},
{
"epoch": 2.032218091697646,
"grad_norm": 0.6195345824183058,
"learning_rate": 5.907005995265735e-05,
"loss": 0.3401,
"step": 410
},
{
"epoch": 2.037174721189591,
"grad_norm": 0.5772527139168068,
"learning_rate": 5.894775125176087e-05,
"loss": 0.3292,
"step": 411
},
{
"epoch": 2.0421313506815366,
"grad_norm": 0.5021265872507295,
"learning_rate": 5.882521371709102e-05,
"loss": 0.3381,
"step": 412
},
{
"epoch": 2.047087980173482,
"grad_norm": 0.5894812915546477,
"learning_rate": 5.870244882854513e-05,
"loss": 0.3374,
"step": 413
},
{
"epoch": 2.0520446096654275,
"grad_norm": 0.5506092063644734,
"learning_rate": 5.857945806876631e-05,
"loss": 0.3362,
"step": 414
},
{
"epoch": 2.057001239157373,
"grad_norm": 0.3460835982963106,
"learning_rate": 5.845624292312551e-05,
"loss": 0.3314,
"step": 415
},
{
"epoch": 2.0619578686493183,
"grad_norm": 0.49475849318415527,
"learning_rate": 5.8332804879703634e-05,
"loss": 0.3312,
"step": 416
},
{
"epoch": 2.066914498141264,
"grad_norm": 0.40797927085998376,
"learning_rate": 5.820914542927356e-05,
"loss": 0.3375,
"step": 417
},
{
"epoch": 2.0718711276332096,
"grad_norm": 0.46256125838881207,
"learning_rate": 5.808526606528207e-05,
"loss": 0.3385,
"step": 418
},
{
"epoch": 2.0768277571251548,
"grad_norm": 0.43520996500621334,
"learning_rate": 5.7961168283831914e-05,
"loss": 0.3361,
"step": 419
},
{
"epoch": 2.0817843866171004,
"grad_norm": 0.3275995772725094,
"learning_rate": 5.783685358366363e-05,
"loss": 0.3329,
"step": 420
},
{
"epoch": 2.086741016109046,
"grad_norm": 0.3508599652724197,
"learning_rate": 5.771232346613759e-05,
"loss": 0.3387,
"step": 421
},
{
"epoch": 2.0916976456009913,
"grad_norm": 0.33993733778247687,
"learning_rate": 5.758757943521568e-05,
"loss": 0.3381,
"step": 422
},
{
"epoch": 2.096654275092937,
"grad_norm": 0.3779989543335282,
"learning_rate": 5.746262299744333e-05,
"loss": 0.3368,
"step": 423
},
{
"epoch": 2.101610904584882,
"grad_norm": 0.4056520143137526,
"learning_rate": 5.7337455661931184e-05,
"loss": 0.3296,
"step": 424
},
{
"epoch": 2.1065675340768277,
"grad_norm": 0.2829188187459132,
"learning_rate": 5.72120789403369e-05,
"loss": 0.3346,
"step": 425
},
{
"epoch": 2.1115241635687734,
"grad_norm": 0.28438834763470383,
"learning_rate": 5.7086494346846954e-05,
"loss": 0.333,
"step": 426
},
{
"epoch": 2.1164807930607186,
"grad_norm": 0.2811433639980777,
"learning_rate": 5.6960703398158265e-05,
"loss": 0.3344,
"step": 427
},
{
"epoch": 2.121437422552664,
"grad_norm": 0.264872240951993,
"learning_rate": 5.683470761345997e-05,
"loss": 0.336,
"step": 428
},
{
"epoch": 2.12639405204461,
"grad_norm": 0.32094685957368546,
"learning_rate": 5.670850851441499e-05,
"loss": 0.335,
"step": 429
},
{
"epoch": 2.131350681536555,
"grad_norm": 0.29696749695480423,
"learning_rate": 5.6582107625141724e-05,
"loss": 0.3344,
"step": 430
},
{
"epoch": 2.1363073110285007,
"grad_norm": 0.31226224047135687,
"learning_rate": 5.6455506472195584e-05,
"loss": 0.3403,
"step": 431
},
{
"epoch": 2.141263940520446,
"grad_norm": 0.3100881935692483,
"learning_rate": 5.6328706584550615e-05,
"loss": 0.3329,
"step": 432
},
{
"epoch": 2.1462205700123915,
"grad_norm": 0.2909272332350804,
"learning_rate": 5.6201709493580964e-05,
"loss": 0.3329,
"step": 433
},
{
"epoch": 2.151177199504337,
"grad_norm": 0.3860693343638538,
"learning_rate": 5.607451673304245e-05,
"loss": 0.3386,
"step": 434
},
{
"epoch": 2.1561338289962824,
"grad_norm": 0.3471498999682872,
"learning_rate": 5.5947129839053996e-05,
"loss": 0.3316,
"step": 435
},
{
"epoch": 2.161090458488228,
"grad_norm": 0.30300923166845395,
"learning_rate": 5.5819550350079096e-05,
"loss": 0.3335,
"step": 436
},
{
"epoch": 2.1660470879801736,
"grad_norm": 0.29394820981141223,
"learning_rate": 5.5691779806907245e-05,
"loss": 0.338,
"step": 437
},
{
"epoch": 2.171003717472119,
"grad_norm": 0.34673047871654705,
"learning_rate": 5.556381975263529e-05,
"loss": 0.3347,
"step": 438
},
{
"epoch": 2.1759603469640645,
"grad_norm": 0.3160177032904872,
"learning_rate": 5.543567173264885e-05,
"loss": 0.3355,
"step": 439
},
{
"epoch": 2.18091697645601,
"grad_norm": 0.22496198223456335,
"learning_rate": 5.5307337294603595e-05,
"loss": 0.3338,
"step": 440
},
{
"epoch": 2.1858736059479553,
"grad_norm": 0.2614030147556954,
"learning_rate": 5.517881798840662e-05,
"loss": 0.333,
"step": 441
},
{
"epoch": 2.190830235439901,
"grad_norm": 0.2707422405401144,
"learning_rate": 5.505011536619766e-05,
"loss": 0.335,
"step": 442
},
{
"epoch": 2.195786864931846,
"grad_norm": 0.24192751386308842,
"learning_rate": 5.4921230982330374e-05,
"loss": 0.3341,
"step": 443
},
{
"epoch": 2.200743494423792,
"grad_norm": 0.24976556710707468,
"learning_rate": 5.479216639335361e-05,
"loss": 0.3321,
"step": 444
},
{
"epoch": 2.2057001239157374,
"grad_norm": 0.24224759625348416,
"learning_rate": 5.466292315799252e-05,
"loss": 0.3328,
"step": 445
},
{
"epoch": 2.2106567534076826,
"grad_norm": 0.26176016864712703,
"learning_rate": 5.453350283712982e-05,
"loss": 0.3398,
"step": 446
},
{
"epoch": 2.2156133828996283,
"grad_norm": 0.2637805491990165,
"learning_rate": 5.4403906993786886e-05,
"loss": 0.3395,
"step": 447
},
{
"epoch": 2.220570012391574,
"grad_norm": 0.24268721259176745,
"learning_rate": 5.4274137193104915e-05,
"loss": 0.3349,
"step": 448
},
{
"epoch": 2.225526641883519,
"grad_norm": 0.24270028247035141,
"learning_rate": 5.4144195002325975e-05,
"loss": 0.335,
"step": 449
},
{
"epoch": 2.2304832713754648,
"grad_norm": 0.3093752787269832,
"learning_rate": 5.401408199077413e-05,
"loss": 0.3386,
"step": 450
},
{
"epoch": 2.23543990086741,
"grad_norm": 0.3330988364901497,
"learning_rate": 5.3883799729836464e-05,
"loss": 0.3377,
"step": 451
},
{
"epoch": 2.2403965303593556,
"grad_norm": 0.3445158375784873,
"learning_rate": 5.3753349792944066e-05,
"loss": 0.3332,
"step": 452
},
{
"epoch": 2.2453531598513012,
"grad_norm": 0.42889232065694805,
"learning_rate": 5.3622733755553126e-05,
"loss": 0.3386,
"step": 453
},
{
"epoch": 2.2503097893432464,
"grad_norm": 0.36669493271134557,
"learning_rate": 5.349195319512577e-05,
"loss": 0.34,
"step": 454
},
{
"epoch": 2.255266418835192,
"grad_norm": 0.2651829328711018,
"learning_rate": 5.3361009691111154e-05,
"loss": 0.3326,
"step": 455
},
{
"epoch": 2.2602230483271377,
"grad_norm": 0.21243536113292363,
"learning_rate": 5.322990482492625e-05,
"loss": 0.3289,
"step": 456
},
{
"epoch": 2.265179677819083,
"grad_norm": 0.23488873689796186,
"learning_rate": 5.30986401799369e-05,
"loss": 0.3298,
"step": 457
},
{
"epoch": 2.2701363073110286,
"grad_norm": 0.28278539825982185,
"learning_rate": 5.296721734143854e-05,
"loss": 0.335,
"step": 458
},
{
"epoch": 2.2750929368029738,
"grad_norm": 0.28374872230764553,
"learning_rate": 5.2835637896637115e-05,
"loss": 0.3338,
"step": 459
},
{
"epoch": 2.2800495662949194,
"grad_norm": 0.28778231594332443,
"learning_rate": 5.2703903434630024e-05,
"loss": 0.3358,
"step": 460
},
{
"epoch": 2.285006195786865,
"grad_norm": 0.2821746117801878,
"learning_rate": 5.257201554638668e-05,
"loss": 0.335,
"step": 461
},
{
"epoch": 2.2899628252788102,
"grad_norm": 0.22260019555120894,
"learning_rate": 5.243997582472956e-05,
"loss": 0.3321,
"step": 462
},
{
"epoch": 2.294919454770756,
"grad_norm": 0.23359117637012264,
"learning_rate": 5.2307785864314764e-05,
"loss": 0.3341,
"step": 463
},
{
"epoch": 2.2998760842627015,
"grad_norm": 0.2655316000215375,
"learning_rate": 5.21754472616129e-05,
"loss": 0.3409,
"step": 464
},
{
"epoch": 2.3048327137546467,
"grad_norm": 0.2670026997209511,
"learning_rate": 5.204296161488968e-05,
"loss": 0.3377,
"step": 465
},
{
"epoch": 2.3097893432465924,
"grad_norm": 0.1999091301337533,
"learning_rate": 5.1910330524186745e-05,
"loss": 0.3384,
"step": 466
},
{
"epoch": 2.314745972738538,
"grad_norm": 0.2517364041892872,
"learning_rate": 5.1777555591302236e-05,
"loss": 0.34,
"step": 467
},
{
"epoch": 2.319702602230483,
"grad_norm": 0.24227717495907994,
"learning_rate": 5.164463841977151e-05,
"loss": 0.3392,
"step": 468
},
{
"epoch": 2.324659231722429,
"grad_norm": 0.24877649271423527,
"learning_rate": 5.151158061484774e-05,
"loss": 0.3377,
"step": 469
},
{
"epoch": 2.329615861214374,
"grad_norm": 0.18945359917741403,
"learning_rate": 5.137838378348255e-05,
"loss": 0.3436,
"step": 470
},
{
"epoch": 2.3345724907063197,
"grad_norm": 0.19893332403593147,
"learning_rate": 5.124504953430658e-05,
"loss": 0.3376,
"step": 471
},
{
"epoch": 2.3395291201982653,
"grad_norm": 0.2137047502635654,
"learning_rate": 5.111157947761012e-05,
"loss": 0.3289,
"step": 472
},
{
"epoch": 2.3444857496902105,
"grad_norm": 0.25186226963289404,
"learning_rate": 5.097797522532356e-05,
"loss": 0.3315,
"step": 473
},
{
"epoch": 2.349442379182156,
"grad_norm": 0.255051673508901,
"learning_rate": 5.084423839099805e-05,
"loss": 0.339,
"step": 474
},
{
"epoch": 2.354399008674102,
"grad_norm": 0.20323499501401712,
"learning_rate": 5.071037058978589e-05,
"loss": 0.3353,
"step": 475
},
{
"epoch": 2.359355638166047,
"grad_norm": 0.20801901587197655,
"learning_rate": 5.0576373438421115e-05,
"loss": 0.3387,
"step": 476
},
{
"epoch": 2.3643122676579926,
"grad_norm": 0.22381819606938655,
"learning_rate": 5.044224855519991e-05,
"loss": 0.3319,
"step": 477
},
{
"epoch": 2.3692688971499383,
"grad_norm": 0.2196511953550138,
"learning_rate": 5.030799755996111e-05,
"loss": 0.3307,
"step": 478
},
{
"epoch": 2.3742255266418835,
"grad_norm": 0.22301886381698216,
"learning_rate": 5.0173622074066604e-05,
"loss": 0.3364,
"step": 479
},
{
"epoch": 2.379182156133829,
"grad_norm": 0.2051336333121646,
"learning_rate": 5.0039123720381765e-05,
"loss": 0.337,
"step": 480
},
{
"epoch": 2.3841387856257743,
"grad_norm": 0.20300166533913025,
"learning_rate": 4.990450412325586e-05,
"loss": 0.3306,
"step": 481
},
{
"epoch": 2.38909541511772,
"grad_norm": 0.2374307593924014,
"learning_rate": 4.9769764908502413e-05,
"loss": 0.3428,
"step": 482
},
{
"epoch": 2.3940520446096656,
"grad_norm": 0.21175057446827547,
"learning_rate": 4.96349077033796e-05,
"loss": 0.3373,
"step": 483
},
{
"epoch": 2.399008674101611,
"grad_norm": 0.20563440127736773,
"learning_rate": 4.949993413657057e-05,
"loss": 0.3319,
"step": 484
},
{
"epoch": 2.4039653035935564,
"grad_norm": 0.22371247404771757,
"learning_rate": 4.936484583816376e-05,
"loss": 0.3383,
"step": 485
},
{
"epoch": 2.4089219330855016,
"grad_norm": 0.20615797475462871,
"learning_rate": 4.922964443963326e-05,
"loss": 0.3376,
"step": 486
},
{
"epoch": 2.4138785625774473,
"grad_norm": 0.22893533763150797,
"learning_rate": 4.9094331573819096e-05,
"loss": 0.337,
"step": 487
},
{
"epoch": 2.418835192069393,
"grad_norm": 0.23517939853671133,
"learning_rate": 4.895890887490743e-05,
"loss": 0.3356,
"step": 488
},
{
"epoch": 2.423791821561338,
"grad_norm": 0.20057429424724502,
"learning_rate": 4.8823377978410964e-05,
"loss": 0.3376,
"step": 489
},
{
"epoch": 2.4287484510532837,
"grad_norm": 0.21919829961802664,
"learning_rate": 4.8687740521149056e-05,
"loss": 0.3337,
"step": 490
},
{
"epoch": 2.4337050805452294,
"grad_norm": 0.26014242642752583,
"learning_rate": 4.855199814122804e-05,
"loss": 0.3352,
"step": 491
},
{
"epoch": 2.4386617100371746,
"grad_norm": 0.2584940458127453,
"learning_rate": 4.84161524780214e-05,
"loss": 0.3288,
"step": 492
},
{
"epoch": 2.44361833952912,
"grad_norm": 0.24674245573299244,
"learning_rate": 4.828020517214997e-05,
"loss": 0.3303,
"step": 493
},
{
"epoch": 2.448574969021066,
"grad_norm": 0.3158488718882322,
"learning_rate": 4.8144157865462176e-05,
"loss": 0.3404,
"step": 494
},
{
"epoch": 2.453531598513011,
"grad_norm": 0.3413638205153234,
"learning_rate": 4.8008012201014096e-05,
"loss": 0.3333,
"step": 495
},
{
"epoch": 2.4584882280049567,
"grad_norm": 0.37582492333886947,
"learning_rate": 4.787176982304975e-05,
"loss": 0.3282,
"step": 496
},
{
"epoch": 2.463444857496902,
"grad_norm": 0.3793384775099229,
"learning_rate": 4.7735432376981125e-05,
"loss": 0.3381,
"step": 497
},
{
"epoch": 2.4684014869888475,
"grad_norm": 0.2816384401529513,
"learning_rate": 4.759900150936839e-05,
"loss": 0.3322,
"step": 498
},
{
"epoch": 2.473358116480793,
"grad_norm": 0.24741234458553119,
"learning_rate": 4.7462478867899944e-05,
"loss": 0.3338,
"step": 499
},
{
"epoch": 2.4783147459727384,
"grad_norm": 0.22710242981496517,
"learning_rate": 4.7325866101372585e-05,
"loss": 0.3351,
"step": 500
},
{
"epoch": 2.483271375464684,
"grad_norm": 0.23327941149354073,
"learning_rate": 4.7189164859671526e-05,
"loss": 0.3355,
"step": 501
},
{
"epoch": 2.4882280049566297,
"grad_norm": 0.25610086026152373,
"learning_rate": 4.705237679375052e-05,
"loss": 0.333,
"step": 502
},
{
"epoch": 2.493184634448575,
"grad_norm": 0.29590363645911516,
"learning_rate": 4.69155035556119e-05,
"loss": 0.3352,
"step": 503
},
{
"epoch": 2.4981412639405205,
"grad_norm": 0.3296002633563399,
"learning_rate": 4.6778546798286633e-05,
"loss": 0.3362,
"step": 504
},
{
"epoch": 2.503097893432466,
"grad_norm": 0.2633441308891435,
"learning_rate": 4.664150817581435e-05,
"loss": 0.3295,
"step": 505
},
{
"epoch": 2.5080545229244113,
"grad_norm": 0.2003652549018511,
"learning_rate": 4.650438934322337e-05,
"loss": 0.3344,
"step": 506
},
{
"epoch": 2.513011152416357,
"grad_norm": 0.2298644200478977,
"learning_rate": 4.6367191956510764e-05,
"loss": 0.332,
"step": 507
},
{
"epoch": 2.517967781908302,
"grad_norm": 0.23379538562070729,
"learning_rate": 4.622991767262222e-05,
"loss": 0.3366,
"step": 508
},
{
"epoch": 2.522924411400248,
"grad_norm": 0.3007388205665313,
"learning_rate": 4.60925681494322e-05,
"loss": 0.336,
"step": 509
},
{
"epoch": 2.5278810408921935,
"grad_norm": 0.27981497022974244,
"learning_rate": 4.595514504572382e-05,
"loss": 0.3348,
"step": 510
},
{
"epoch": 2.5328376703841387,
"grad_norm": 0.20408295200827142,
"learning_rate": 4.58176500211688e-05,
"loss": 0.3374,
"step": 511
},
{
"epoch": 2.5377942998760843,
"grad_norm": 0.19246516876902225,
"learning_rate": 4.568008473630749e-05,
"loss": 0.3342,
"step": 512
},
{
"epoch": 2.5427509293680295,
"grad_norm": 0.23233073687830116,
"learning_rate": 4.5542450852528764e-05,
"loss": 0.3359,
"step": 513
},
{
"epoch": 2.547707558859975,
"grad_norm": 0.22140903549459723,
"learning_rate": 4.5404750032049984e-05,
"loss": 0.3381,
"step": 514
},
{
"epoch": 2.5526641883519208,
"grad_norm": 0.22974445394924758,
"learning_rate": 4.526698393789691e-05,
"loss": 0.3353,
"step": 515
},
{
"epoch": 2.5576208178438664,
"grad_norm": 0.30986295920934853,
"learning_rate": 4.51291542338836e-05,
"loss": 0.3405,
"step": 516
},
{
"epoch": 2.5625774473358116,
"grad_norm": 0.2719615696911325,
"learning_rate": 4.499126258459235e-05,
"loss": 0.3355,
"step": 517
},
{
"epoch": 2.5675340768277573,
"grad_norm": 0.16191374687101495,
"learning_rate": 4.4853310655353586e-05,
"loss": 0.3352,
"step": 518
},
{
"epoch": 2.5724907063197024,
"grad_norm": 0.2532907979488151,
"learning_rate": 4.471530011222572e-05,
"loss": 0.3337,
"step": 519
},
{
"epoch": 2.577447335811648,
"grad_norm": 0.23967557439949316,
"learning_rate": 4.457723262197506e-05,
"loss": 0.3377,
"step": 520
},
{
"epoch": 2.5824039653035937,
"grad_norm": 0.1912898551193947,
"learning_rate": 4.443910985205566e-05,
"loss": 0.3278,
"step": 521
},
{
"epoch": 2.587360594795539,
"grad_norm": 0.19961800458113116,
"learning_rate": 4.430093347058921e-05,
"loss": 0.3364,
"step": 522
},
{
"epoch": 2.5923172242874846,
"grad_norm": 0.26900359284803255,
"learning_rate": 4.416270514634485e-05,
"loss": 0.3351,
"step": 523
},
{
"epoch": 2.5972738537794298,
"grad_norm": 0.21472946546241412,
"learning_rate": 4.402442654871905e-05,
"loss": 0.3314,
"step": 524
},
{
"epoch": 2.6022304832713754,
"grad_norm": 0.1872384290524764,
"learning_rate": 4.388609934771544e-05,
"loss": 0.3385,
"step": 525
},
{
"epoch": 2.607187112763321,
"grad_norm": 0.24976194756574016,
"learning_rate": 4.374772521392463e-05,
"loss": 0.3368,
"step": 526
},
{
"epoch": 2.6121437422552667,
"grad_norm": 0.2249642971113071,
"learning_rate": 4.360930581850405e-05,
"loss": 0.3392,
"step": 527
},
{
"epoch": 2.617100371747212,
"grad_norm": 0.2054171586674057,
"learning_rate": 4.347084283315773e-05,
"loss": 0.3374,
"step": 528
},
{
"epoch": 2.6220570012391575,
"grad_norm": 0.1827910858801494,
"learning_rate": 4.333233793011619e-05,
"loss": 0.3322,
"step": 529
},
{
"epoch": 2.6270136307311027,
"grad_norm": 0.1846393876427912,
"learning_rate": 4.3193792782116164e-05,
"loss": 0.3348,
"step": 530
},
{
"epoch": 2.6319702602230484,
"grad_norm": 0.2422856985597145,
"learning_rate": 4.305520906238041e-05,
"loss": 0.3327,
"step": 531
},
{
"epoch": 2.636926889714994,
"grad_norm": 0.23896322328508743,
"learning_rate": 4.291658844459757e-05,
"loss": 0.3338,
"step": 532
},
{
"epoch": 2.641883519206939,
"grad_norm": 0.18112180383848728,
"learning_rate": 4.277793260290183e-05,
"loss": 0.3364,
"step": 533
},
{
"epoch": 2.646840148698885,
"grad_norm": 0.23428378261560656,
"learning_rate": 4.2639243211852895e-05,
"loss": 0.3333,
"step": 534
},
{
"epoch": 2.65179677819083,
"grad_norm": 0.19183119613946728,
"learning_rate": 4.2500521946415514e-05,
"loss": 0.3365,
"step": 535
},
{
"epoch": 2.6567534076827757,
"grad_norm": 0.20154211295704755,
"learning_rate": 4.23617704819395e-05,
"loss": 0.3336,
"step": 536
},
{
"epoch": 2.6617100371747213,
"grad_norm": 0.2427391083128824,
"learning_rate": 4.222299049413932e-05,
"loss": 0.3324,
"step": 537
},
{
"epoch": 2.6666666666666665,
"grad_norm": 0.18062474951109844,
"learning_rate": 4.208418365907393e-05,
"loss": 0.3347,
"step": 538
},
{
"epoch": 2.671623296158612,
"grad_norm": 0.2305349411435518,
"learning_rate": 4.194535165312653e-05,
"loss": 0.333,
"step": 539
},
{
"epoch": 2.6765799256505574,
"grad_norm": 0.21312698762796153,
"learning_rate": 4.1806496152984304e-05,
"loss": 0.3305,
"step": 540
},
{
"epoch": 2.681536555142503,
"grad_norm": 0.1818189463567409,
"learning_rate": 4.16676188356182e-05,
"loss": 0.3336,
"step": 541
},
{
"epoch": 2.6864931846344486,
"grad_norm": 0.19922786477591123,
"learning_rate": 4.1528721378262605e-05,
"loss": 0.3282,
"step": 542
},
{
"epoch": 2.6914498141263943,
"grad_norm": 0.18034983700167911,
"learning_rate": 4.138980545839522e-05,
"loss": 0.3345,
"step": 543
},
{
"epoch": 2.6964064436183395,
"grad_norm": 0.19797325344913994,
"learning_rate": 4.125087275371661e-05,
"loss": 0.3354,
"step": 544
},
{
"epoch": 2.701363073110285,
"grad_norm": 0.20423478730918823,
"learning_rate": 4.1111924942130164e-05,
"loss": 0.3339,
"step": 545
},
{
"epoch": 2.7063197026022303,
"grad_norm": 0.19187739598428746,
"learning_rate": 4.097296370172164e-05,
"loss": 0.3317,
"step": 546
},
{
"epoch": 2.711276332094176,
"grad_norm": 0.18918569677735037,
"learning_rate": 4.083399071073902e-05,
"loss": 0.3327,
"step": 547
},
{
"epoch": 2.7162329615861216,
"grad_norm": 0.2165189596882268,
"learning_rate": 4.069500764757217e-05,
"loss": 0.3311,
"step": 548
},
{
"epoch": 2.721189591078067,
"grad_norm": 0.1957402774919667,
"learning_rate": 4.055601619073261e-05,
"loss": 0.3354,
"step": 549
},
{
"epoch": 2.7261462205700124,
"grad_norm": 0.17969490125915344,
"learning_rate": 4.041701801883324e-05,
"loss": 0.3314,
"step": 550
},
{
"epoch": 2.7311028500619576,
"grad_norm": 0.22689675751479285,
"learning_rate": 4.0278014810568045e-05,
"loss": 0.3332,
"step": 551
},
{
"epoch": 2.7360594795539033,
"grad_norm": 0.1755924105874963,
"learning_rate": 4.0139008244691845e-05,
"loss": 0.3322,
"step": 552
},
{
"epoch": 2.741016109045849,
"grad_norm": 0.19129031130715107,
"learning_rate": 4e-05,
"loss": 0.3418,
"step": 553
},
{
"epoch": 2.7459727385377946,
"grad_norm": 0.17689484335846134,
"learning_rate": 3.986099175530817e-05,
"loss": 0.3309,
"step": 554
},
{
"epoch": 2.7509293680297398,
"grad_norm": 0.15442290795456226,
"learning_rate": 3.9721985189431975e-05,
"loss": 0.33,
"step": 555
},
{
"epoch": 2.7558859975216854,
"grad_norm": 0.21388356391908783,
"learning_rate": 3.958298198116677e-05,
"loss": 0.3385,
"step": 556
},
{
"epoch": 2.7608426270136306,
"grad_norm": 0.1684374241371316,
"learning_rate": 3.9443983809267405e-05,
"loss": 0.339,
"step": 557
},
{
"epoch": 2.7657992565055762,
"grad_norm": 0.20515582530430282,
"learning_rate": 3.930499235242783e-05,
"loss": 0.3327,
"step": 558
},
{
"epoch": 2.770755885997522,
"grad_norm": 0.22058261160844722,
"learning_rate": 3.9166009289260995e-05,
"loss": 0.3369,
"step": 559
},
{
"epoch": 2.775712515489467,
"grad_norm": 0.18346028180253854,
"learning_rate": 3.9027036298278366e-05,
"loss": 0.3344,
"step": 560
},
{
"epoch": 2.7806691449814127,
"grad_norm": 0.16762816915906323,
"learning_rate": 3.888807505786984e-05,
"loss": 0.3323,
"step": 561
},
{
"epoch": 2.785625774473358,
"grad_norm": 0.1990414852782902,
"learning_rate": 3.8749127246283386e-05,
"loss": 0.3334,
"step": 562
},
{
"epoch": 2.7905824039653035,
"grad_norm": 0.18489638959504387,
"learning_rate": 3.86101945416048e-05,
"loss": 0.3284,
"step": 563
},
{
"epoch": 2.795539033457249,
"grad_norm": 0.1652010562534876,
"learning_rate": 3.84712786217374e-05,
"loss": 0.3326,
"step": 564
},
{
"epoch": 2.8004956629491944,
"grad_norm": 0.17456119169719603,
"learning_rate": 3.8332381164381814e-05,
"loss": 0.335,
"step": 565
},
{
"epoch": 2.80545229244114,
"grad_norm": 0.16210051076095758,
"learning_rate": 3.81935038470157e-05,
"loss": 0.3346,
"step": 566
},
{
"epoch": 2.8104089219330852,
"grad_norm": 0.16777608423300605,
"learning_rate": 3.805464834687349e-05,
"loss": 0.3337,
"step": 567
},
{
"epoch": 2.815365551425031,
"grad_norm": 0.15906324157550705,
"learning_rate": 3.791581634092609e-05,
"loss": 0.3306,
"step": 568
},
{
"epoch": 2.8203221809169765,
"grad_norm": 0.14249599390439402,
"learning_rate": 3.7777009505860686e-05,
"loss": 0.3318,
"step": 569
},
{
"epoch": 2.825278810408922,
"grad_norm": 0.18459264664094613,
"learning_rate": 3.763822951806051e-05,
"loss": 0.3386,
"step": 570
},
{
"epoch": 2.8302354399008673,
"grad_norm": 0.20022055680618658,
"learning_rate": 3.749947805358449e-05,
"loss": 0.3303,
"step": 571
},
{
"epoch": 2.835192069392813,
"grad_norm": 0.20222925053513227,
"learning_rate": 3.736075678814712e-05,
"loss": 0.3356,
"step": 572
},
{
"epoch": 2.840148698884758,
"grad_norm": 0.20684131712971296,
"learning_rate": 3.722206739709817e-05,
"loss": 0.3303,
"step": 573
},
{
"epoch": 2.845105328376704,
"grad_norm": 0.1779337567262826,
"learning_rate": 3.708341155540246e-05,
"loss": 0.3335,
"step": 574
},
{
"epoch": 2.8500619578686495,
"grad_norm": 0.2413271248873493,
"learning_rate": 3.69447909376196e-05,
"loss": 0.3393,
"step": 575
},
{
"epoch": 2.8550185873605947,
"grad_norm": 0.23918769230290013,
"learning_rate": 3.680620721788385e-05,
"loss": 0.3294,
"step": 576
},
{
"epoch": 2.8599752168525403,
"grad_norm": 0.1621753755153279,
"learning_rate": 3.6667662069883814e-05,
"loss": 0.3264,
"step": 577
},
{
"epoch": 2.8649318463444855,
"grad_norm": 0.19949158413337165,
"learning_rate": 3.652915716684228e-05,
"loss": 0.3333,
"step": 578
},
{
"epoch": 2.869888475836431,
"grad_norm": 0.2061475811060586,
"learning_rate": 3.639069418149596e-05,
"loss": 0.33,
"step": 579
},
{
"epoch": 2.874845105328377,
"grad_norm": 0.17077547287771824,
"learning_rate": 3.6252274786075375e-05,
"loss": 0.3329,
"step": 580
},
{
"epoch": 2.8798017348203224,
"grad_norm": 0.19248463763061568,
"learning_rate": 3.611390065228457e-05,
"loss": 0.3321,
"step": 581
},
{
"epoch": 2.8847583643122676,
"grad_norm": 0.1862619363005395,
"learning_rate": 3.597557345128097e-05,
"loss": 0.3326,
"step": 582
},
{
"epoch": 2.8897149938042133,
"grad_norm": 0.18090445666144891,
"learning_rate": 3.5837294853655166e-05,
"loss": 0.3303,
"step": 583
},
{
"epoch": 2.8946716232961585,
"grad_norm": 0.1440347767860962,
"learning_rate": 3.5699066529410804e-05,
"loss": 0.3364,
"step": 584
},
{
"epoch": 2.899628252788104,
"grad_norm": 0.184137285441072,
"learning_rate": 3.556089014794434e-05,
"loss": 0.3332,
"step": 585
},
{
"epoch": 2.9045848822800497,
"grad_norm": 0.18962839342191237,
"learning_rate": 3.542276737802495e-05,
"loss": 0.334,
"step": 586
},
{
"epoch": 2.909541511771995,
"grad_norm": 0.16779757161733827,
"learning_rate": 3.528469988777429e-05,
"loss": 0.326,
"step": 587
},
{
"epoch": 2.9144981412639406,
"grad_norm": 0.1690959070694777,
"learning_rate": 3.514668934464642e-05,
"loss": 0.3318,
"step": 588
},
{
"epoch": 2.9194547707558858,
"grad_norm": 0.19976961678804345,
"learning_rate": 3.500873741540767e-05,
"loss": 0.3307,
"step": 589
},
{
"epoch": 2.9244114002478314,
"grad_norm": 0.16964715443901282,
"learning_rate": 3.487084576611642e-05,
"loss": 0.3316,
"step": 590
},
{
"epoch": 2.929368029739777,
"grad_norm": 0.1610263789509033,
"learning_rate": 3.4733016062103105e-05,
"loss": 0.3291,
"step": 591
},
{
"epoch": 2.9343246592317227,
"grad_norm": 0.19562682805644308,
"learning_rate": 3.4595249967950015e-05,
"loss": 0.3314,
"step": 592
},
{
"epoch": 2.939281288723668,
"grad_norm": 0.18783423711181588,
"learning_rate": 3.4457549147471256e-05,
"loss": 0.3337,
"step": 593
},
{
"epoch": 2.9442379182156135,
"grad_norm": 0.17249410205027924,
"learning_rate": 3.431991526369253e-05,
"loss": 0.336,
"step": 594
},
{
"epoch": 2.9491945477075587,
"grad_norm": 0.20170942304434009,
"learning_rate": 3.418234997883121e-05,
"loss": 0.3275,
"step": 595
},
{
"epoch": 2.9541511771995044,
"grad_norm": 0.18222174501295177,
"learning_rate": 3.4044854954276186e-05,
"loss": 0.3365,
"step": 596
},
{
"epoch": 2.95910780669145,
"grad_norm": 0.1843856632215925,
"learning_rate": 3.3907431850567804e-05,
"loss": 0.3345,
"step": 597
},
{
"epoch": 2.964064436183395,
"grad_norm": 0.17773265749067887,
"learning_rate": 3.377008232737779e-05,
"loss": 0.3318,
"step": 598
},
{
"epoch": 2.969021065675341,
"grad_norm": 0.17045064724964207,
"learning_rate": 3.363280804348925e-05,
"loss": 0.3369,
"step": 599
},
{
"epoch": 2.973977695167286,
"grad_norm": 0.18104363543938948,
"learning_rate": 3.349561065677663e-05,
"loss": 0.3321,
"step": 600
},
{
"epoch": 2.9789343246592317,
"grad_norm": 0.16258663213066146,
"learning_rate": 3.335849182418567e-05,
"loss": 0.3345,
"step": 601
},
{
"epoch": 2.9838909541511773,
"grad_norm": 0.16984768351493296,
"learning_rate": 3.322145320171339e-05,
"loss": 0.334,
"step": 602
},
{
"epoch": 2.9888475836431225,
"grad_norm": 0.18346639298353426,
"learning_rate": 3.3084496444388105e-05,
"loss": 0.3304,
"step": 603
},
{
"epoch": 2.993804213135068,
"grad_norm": 0.14403509012455282,
"learning_rate": 3.294762320624949e-05,
"loss": 0.3287,
"step": 604
},
{
"epoch": 2.9987608426270134,
"grad_norm": 0.20073794639228396,
"learning_rate": 3.281083514032849e-05,
"loss": 0.3859,
"step": 605
},
{
"epoch": 3.003717472118959,
"grad_norm": 0.2089444698911026,
"learning_rate": 3.267413389862742e-05,
"loss": 0.3361,
"step": 606
},
{
"epoch": 3.0086741016109046,
"grad_norm": 0.1762655173237084,
"learning_rate": 3.2537521132100056e-05,
"loss": 0.2999,
"step": 607
},
{
"epoch": 3.01363073110285,
"grad_norm": 0.19507744188878484,
"learning_rate": 3.240099849063163e-05,
"loss": 0.3009,
"step": 608
},
{
"epoch": 3.0185873605947955,
"grad_norm": 0.21454164658321215,
"learning_rate": 3.2264567623018895e-05,
"loss": 0.3077,
"step": 609
},
{
"epoch": 3.023543990086741,
"grad_norm": 0.21359327347841275,
"learning_rate": 3.212823017695026e-05,
"loss": 0.3023,
"step": 610
},
{
"epoch": 3.0285006195786863,
"grad_norm": 0.20315773200419085,
"learning_rate": 3.199198779898591e-05,
"loss": 0.3026,
"step": 611
},
{
"epoch": 3.033457249070632,
"grad_norm": 0.18593215246007733,
"learning_rate": 3.1855842134537844e-05,
"loss": 0.3035,
"step": 612
},
{
"epoch": 3.0384138785625776,
"grad_norm": 0.1884189275848539,
"learning_rate": 3.1719794827850034e-05,
"loss": 0.3038,
"step": 613
},
{
"epoch": 3.043370508054523,
"grad_norm": 0.1795860923978728,
"learning_rate": 3.158384752197861e-05,
"loss": 0.3039,
"step": 614
},
{
"epoch": 3.0483271375464684,
"grad_norm": 0.18315632009685323,
"learning_rate": 3.144800185877197e-05,
"loss": 0.3064,
"step": 615
},
{
"epoch": 3.053283767038414,
"grad_norm": 0.19153130258541615,
"learning_rate": 3.131225947885096e-05,
"loss": 0.3048,
"step": 616
},
{
"epoch": 3.0582403965303593,
"grad_norm": 0.18262171453236378,
"learning_rate": 3.117662202158904e-05,
"loss": 0.3045,
"step": 617
},
{
"epoch": 3.063197026022305,
"grad_norm": 0.1937465027808409,
"learning_rate": 3.104109112509257e-05,
"loss": 0.3024,
"step": 618
},
{
"epoch": 3.06815365551425,
"grad_norm": 0.18294252068153583,
"learning_rate": 3.0905668426180925e-05,
"loss": 0.3034,
"step": 619
},
{
"epoch": 3.0731102850061958,
"grad_norm": 0.186995165113216,
"learning_rate": 3.0770355560366745e-05,
"loss": 0.3069,
"step": 620
},
{
"epoch": 3.0780669144981414,
"grad_norm": 0.20411338272963947,
"learning_rate": 3.063515416183625e-05,
"loss": 0.3046,
"step": 621
},
{
"epoch": 3.0830235439900866,
"grad_norm": 0.17006596521617198,
"learning_rate": 3.0500065863429446e-05,
"loss": 0.303,
"step": 622
},
{
"epoch": 3.0879801734820322,
"grad_norm": 0.19496621027527322,
"learning_rate": 3.0365092296620418e-05,
"loss": 0.2992,
"step": 623
},
{
"epoch": 3.092936802973978,
"grad_norm": 0.18067090099414398,
"learning_rate": 3.0230235091497593e-05,
"loss": 0.3046,
"step": 624
},
{
"epoch": 3.097893432465923,
"grad_norm": 0.16987068098319333,
"learning_rate": 3.0095495876744156e-05,
"loss": 0.3051,
"step": 625
},
{
"epoch": 3.1028500619578687,
"grad_norm": 0.1704176025292816,
"learning_rate": 2.9960876279618238e-05,
"loss": 0.304,
"step": 626
},
{
"epoch": 3.107806691449814,
"grad_norm": 0.15464066621895128,
"learning_rate": 2.982637792593342e-05,
"loss": 0.3063,
"step": 627
},
{
"epoch": 3.1127633209417596,
"grad_norm": 0.18094562465557273,
"learning_rate": 2.96920024400389e-05,
"loss": 0.2998,
"step": 628
},
{
"epoch": 3.117719950433705,
"grad_norm": 0.16554943056629043,
"learning_rate": 2.9557751444800097e-05,
"loss": 0.3015,
"step": 629
},
{
"epoch": 3.1226765799256504,
"grad_norm": 0.17971307653565008,
"learning_rate": 2.9423626561578885e-05,
"loss": 0.3096,
"step": 630
},
{
"epoch": 3.127633209417596,
"grad_norm": 0.1806551510167954,
"learning_rate": 2.9289629410214117e-05,
"loss": 0.3033,
"step": 631
},
{
"epoch": 3.1325898389095417,
"grad_norm": 0.15240283472952065,
"learning_rate": 2.9155761609001964e-05,
"loss": 0.3032,
"step": 632
},
{
"epoch": 3.137546468401487,
"grad_norm": 0.1577195426163746,
"learning_rate": 2.9022024774676442e-05,
"loss": 0.3021,
"step": 633
},
{
"epoch": 3.1425030978934325,
"grad_norm": 0.16849560034494637,
"learning_rate": 2.8888420522389905e-05,
"loss": 0.3088,
"step": 634
},
{
"epoch": 3.147459727385378,
"grad_norm": 0.14345317953623954,
"learning_rate": 2.8754950465693427e-05,
"loss": 0.3056,
"step": 635
},
{
"epoch": 3.1524163568773234,
"grad_norm": 0.1523622436509427,
"learning_rate": 2.8621616216517462e-05,
"loss": 0.3025,
"step": 636
},
{
"epoch": 3.157372986369269,
"grad_norm": 0.15673429079604273,
"learning_rate": 2.848841938515226e-05,
"loss": 0.3024,
"step": 637
},
{
"epoch": 3.162329615861214,
"grad_norm": 0.13466456756806872,
"learning_rate": 2.8355361580228495e-05,
"loss": 0.3032,
"step": 638
},
{
"epoch": 3.16728624535316,
"grad_norm": 0.1560250590207022,
"learning_rate": 2.8222444408697767e-05,
"loss": 0.3044,
"step": 639
},
{
"epoch": 3.1722428748451055,
"grad_norm": 0.15822779235945905,
"learning_rate": 2.808966947581327e-05,
"loss": 0.3004,
"step": 640
},
{
"epoch": 3.1771995043370507,
"grad_norm": 0.15868479894261964,
"learning_rate": 2.795703838511033e-05,
"loss": 0.3012,
"step": 641
},
{
"epoch": 3.1821561338289963,
"grad_norm": 0.17605282990168575,
"learning_rate": 2.7824552738387124e-05,
"loss": 0.304,
"step": 642
},
{
"epoch": 3.187112763320942,
"grad_norm": 0.12790318324939634,
"learning_rate": 2.769221413568525e-05,
"loss": 0.3056,
"step": 643
},
{
"epoch": 3.192069392812887,
"grad_norm": 0.141503542353362,
"learning_rate": 2.756002417527045e-05,
"loss": 0.304,
"step": 644
},
{
"epoch": 3.197026022304833,
"grad_norm": 0.17436702006560106,
"learning_rate": 2.742798445361332e-05,
"loss": 0.3017,
"step": 645
},
{
"epoch": 3.201982651796778,
"grad_norm": 0.14699228929283586,
"learning_rate": 2.729609656537e-05,
"loss": 0.3056,
"step": 646
},
{
"epoch": 3.2069392812887236,
"grad_norm": 0.14936078562070917,
"learning_rate": 2.7164362103362888e-05,
"loss": 0.3032,
"step": 647
},
{
"epoch": 3.2118959107806693,
"grad_norm": 0.1542891045557535,
"learning_rate": 2.703278265856148e-05,
"loss": 0.3056,
"step": 648
},
{
"epoch": 3.2168525402726145,
"grad_norm": 0.15226766205832612,
"learning_rate": 2.6901359820063107e-05,
"loss": 0.3016,
"step": 649
},
{
"epoch": 3.22180916976456,
"grad_norm": 0.16501837622862042,
"learning_rate": 2.6770095175073758e-05,
"loss": 0.3052,
"step": 650
},
{
"epoch": 3.2267657992565058,
"grad_norm": 0.1454205047479316,
"learning_rate": 2.663899030888886e-05,
"loss": 0.3039,
"step": 651
},
{
"epoch": 3.231722428748451,
"grad_norm": 0.16727660054359092,
"learning_rate": 2.650804680487424e-05,
"loss": 0.3055,
"step": 652
},
{
"epoch": 3.2366790582403966,
"grad_norm": 0.14477947917200043,
"learning_rate": 2.6377266244446898e-05,
"loss": 0.3028,
"step": 653
},
{
"epoch": 3.241635687732342,
"grad_norm": 0.1555736064441561,
"learning_rate": 2.624665020705594e-05,
"loss": 0.3063,
"step": 654
},
{
"epoch": 3.2465923172242874,
"grad_norm": 0.1586046258309188,
"learning_rate": 2.611620027016355e-05,
"loss": 0.307,
"step": 655
},
{
"epoch": 3.251548946716233,
"grad_norm": 0.15599462369905948,
"learning_rate": 2.598591800922588e-05,
"loss": 0.3017,
"step": 656
},
{
"epoch": 3.2565055762081783,
"grad_norm": 0.1552548399346659,
"learning_rate": 2.5855804997674042e-05,
"loss": 0.3036,
"step": 657
},
{
"epoch": 3.261462205700124,
"grad_norm": 0.14508182578537362,
"learning_rate": 2.5725862806895098e-05,
"loss": 0.3015,
"step": 658
},
{
"epoch": 3.2664188351920695,
"grad_norm": 0.14299675482805052,
"learning_rate": 2.559609300621312e-05,
"loss": 0.3048,
"step": 659
},
{
"epoch": 3.2713754646840147,
"grad_norm": 0.1360295624700782,
"learning_rate": 2.546649716287019e-05,
"loss": 0.2999,
"step": 660
},
{
"epoch": 3.2763320941759604,
"grad_norm": 0.1413517812236507,
"learning_rate": 2.5337076842007504e-05,
"loss": 0.2998,
"step": 661
},
{
"epoch": 3.281288723667906,
"grad_norm": 0.13127029981708502,
"learning_rate": 2.5207833606646403e-05,
"loss": 0.2996,
"step": 662
},
{
"epoch": 3.2862453531598512,
"grad_norm": 0.13139183414183372,
"learning_rate": 2.5078769017669632e-05,
"loss": 0.3063,
"step": 663
},
{
"epoch": 3.291201982651797,
"grad_norm": 0.13006606842033047,
"learning_rate": 2.4949884633802343e-05,
"loss": 0.3026,
"step": 664
},
{
"epoch": 3.296158612143742,
"grad_norm": 0.136510548326439,
"learning_rate": 2.482118201159339e-05,
"loss": 0.3058,
"step": 665
},
{
"epoch": 3.3011152416356877,
"grad_norm": 0.15418988542864148,
"learning_rate": 2.4692662705396412e-05,
"loss": 0.3009,
"step": 666
},
{
"epoch": 3.3060718711276333,
"grad_norm": 0.13624965023397706,
"learning_rate": 2.4564328267351165e-05,
"loss": 0.3058,
"step": 667
},
{
"epoch": 3.3110285006195785,
"grad_norm": 0.13980897795605768,
"learning_rate": 2.4436180247364734e-05,
"loss": 0.3071,
"step": 668
},
{
"epoch": 3.315985130111524,
"grad_norm": 0.1204698411572796,
"learning_rate": 2.430822019309277e-05,
"loss": 0.3007,
"step": 669
},
{
"epoch": 3.32094175960347,
"grad_norm": 0.13636864767510704,
"learning_rate": 2.418044964992091e-05,
"loss": 0.3077,
"step": 670
},
{
"epoch": 3.325898389095415,
"grad_norm": 0.14454738758907348,
"learning_rate": 2.405287016094601e-05,
"loss": 0.3071,
"step": 671
},
{
"epoch": 3.3308550185873607,
"grad_norm": 0.1282434836197698,
"learning_rate": 2.3925483266957558e-05,
"loss": 0.3032,
"step": 672
},
{
"epoch": 3.3358116480793063,
"grad_norm": 0.14263625355479145,
"learning_rate": 2.3798290506419042e-05,
"loss": 0.3037,
"step": 673
},
{
"epoch": 3.3407682775712515,
"grad_norm": 0.11974835937679618,
"learning_rate": 2.3671293415449395e-05,
"loss": 0.3002,
"step": 674
},
{
"epoch": 3.345724907063197,
"grad_norm": 0.13263778471023774,
"learning_rate": 2.3544493527804412e-05,
"loss": 0.3046,
"step": 675
},
{
"epoch": 3.3506815365551423,
"grad_norm": 0.13481169999279188,
"learning_rate": 2.341789237485829e-05,
"loss": 0.3025,
"step": 676
},
{
"epoch": 3.355638166047088,
"grad_norm": 0.1317185419349756,
"learning_rate": 2.329149148558502e-05,
"loss": 0.3011,
"step": 677
},
{
"epoch": 3.3605947955390336,
"grad_norm": 0.13819168121551242,
"learning_rate": 2.3165292386540048e-05,
"loss": 0.3071,
"step": 678
},
{
"epoch": 3.365551425030979,
"grad_norm": 0.14611123791855496,
"learning_rate": 2.3039296601841745e-05,
"loss": 0.3,
"step": 679
},
{
"epoch": 3.3705080545229245,
"grad_norm": 0.13143204123843844,
"learning_rate": 2.291350565315307e-05,
"loss": 0.305,
"step": 680
},
{
"epoch": 3.3754646840148697,
"grad_norm": 0.15517786038481768,
"learning_rate": 2.2787921059663107e-05,
"loss": 0.3044,
"step": 681
},
{
"epoch": 3.3804213135068153,
"grad_norm": 0.1281201579748097,
"learning_rate": 2.266254433806883e-05,
"loss": 0.2988,
"step": 682
},
{
"epoch": 3.385377942998761,
"grad_norm": 0.1354716231594334,
"learning_rate": 2.253737700255668e-05,
"loss": 0.3014,
"step": 683
},
{
"epoch": 3.390334572490706,
"grad_norm": 0.13847005305529575,
"learning_rate": 2.2412420564784324e-05,
"loss": 0.3049,
"step": 684
},
{
"epoch": 3.3952912019826518,
"grad_norm": 0.1360457576632222,
"learning_rate": 2.228767653386242e-05,
"loss": 0.3054,
"step": 685
},
{
"epoch": 3.4002478314745974,
"grad_norm": 0.13897673922449205,
"learning_rate": 2.2163146416336362e-05,
"loss": 0.3052,
"step": 686
},
{
"epoch": 3.4052044609665426,
"grad_norm": 0.12908787668600238,
"learning_rate": 2.203883171616812e-05,
"loss": 0.3073,
"step": 687
},
{
"epoch": 3.4101610904584883,
"grad_norm": 0.13724987290006566,
"learning_rate": 2.1914733934717943e-05,
"loss": 0.3029,
"step": 688
},
{
"epoch": 3.415117719950434,
"grad_norm": 0.13922005425903064,
"learning_rate": 2.179085457072645e-05,
"loss": 0.3057,
"step": 689
},
{
"epoch": 3.420074349442379,
"grad_norm": 0.1354335750505619,
"learning_rate": 2.1667195120296362e-05,
"loss": 0.3049,
"step": 690
},
{
"epoch": 3.4250309789343247,
"grad_norm": 0.14882023636556463,
"learning_rate": 2.1543757076874502e-05,
"loss": 0.3048,
"step": 691
},
{
"epoch": 3.42998760842627,
"grad_norm": 0.13836163300637985,
"learning_rate": 2.1420541931233712e-05,
"loss": 0.3051,
"step": 692
},
{
"epoch": 3.4349442379182156,
"grad_norm": 0.14256891798066113,
"learning_rate": 2.1297551171454875e-05,
"loss": 0.3055,
"step": 693
},
{
"epoch": 3.439900867410161,
"grad_norm": 0.1384981981066132,
"learning_rate": 2.1174786282908978e-05,
"loss": 0.3092,
"step": 694
},
{
"epoch": 3.4448574969021064,
"grad_norm": 0.12340421646609305,
"learning_rate": 2.105224874823914e-05,
"loss": 0.3051,
"step": 695
},
{
"epoch": 3.449814126394052,
"grad_norm": 0.13563333777142683,
"learning_rate": 2.092994004734267e-05,
"loss": 0.3056,
"step": 696
},
{
"epoch": 3.4547707558859977,
"grad_norm": 0.14050309446491482,
"learning_rate": 2.0807861657353232e-05,
"loss": 0.3029,
"step": 697
},
{
"epoch": 3.459727385377943,
"grad_norm": 0.11583428009228229,
"learning_rate": 2.0686015052623036e-05,
"loss": 0.3122,
"step": 698
},
{
"epoch": 3.4646840148698885,
"grad_norm": 0.13336246662013498,
"learning_rate": 2.0564401704705e-05,
"loss": 0.3045,
"step": 699
},
{
"epoch": 3.469640644361834,
"grad_norm": 0.12782118270953827,
"learning_rate": 2.0443023082334947e-05,
"loss": 0.3055,
"step": 700
},
{
"epoch": 3.4745972738537794,
"grad_norm": 0.12141941353759776,
"learning_rate": 2.032188065141389e-05,
"loss": 0.3059,
"step": 701
},
{
"epoch": 3.479553903345725,
"grad_norm": 0.1277767185346589,
"learning_rate": 2.0200975874990395e-05,
"loss": 0.3045,
"step": 702
},
{
"epoch": 3.48451053283767,
"grad_norm": 0.12523003640838745,
"learning_rate": 2.0080310213242776e-05,
"loss": 0.3017,
"step": 703
},
{
"epoch": 3.489467162329616,
"grad_norm": 0.11933778268709544,
"learning_rate": 1.9959885123461605e-05,
"loss": 0.3051,
"step": 704
},
{
"epoch": 3.4944237918215615,
"grad_norm": 0.13986871746721385,
"learning_rate": 1.983970206003201e-05,
"loss": 0.3083,
"step": 705
},
{
"epoch": 3.4993804213135067,
"grad_norm": 0.11878623960823902,
"learning_rate": 1.971976247441615e-05,
"loss": 0.307,
"step": 706
},
{
"epoch": 3.5043370508054523,
"grad_norm": 0.12807313446541005,
"learning_rate": 1.960006781513565e-05,
"loss": 0.3083,
"step": 707
},
{
"epoch": 3.5092936802973975,
"grad_norm": 0.13808763890459724,
"learning_rate": 1.9480619527754184e-05,
"loss": 0.3091,
"step": 708
},
{
"epoch": 3.514250309789343,
"grad_norm": 0.13222322590668356,
"learning_rate": 1.9361419054859965e-05,
"loss": 0.3017,
"step": 709
},
{
"epoch": 3.519206939281289,
"grad_norm": 0.12925333708719713,
"learning_rate": 1.9242467836048296e-05,
"loss": 0.3016,
"step": 710
},
{
"epoch": 3.5241635687732344,
"grad_norm": 0.131207679724427,
"learning_rate": 1.9123767307904216e-05,
"loss": 0.3048,
"step": 711
},
{
"epoch": 3.5291201982651796,
"grad_norm": 0.13331684079879477,
"learning_rate": 1.900531890398518e-05,
"loss": 0.3065,
"step": 712
},
{
"epoch": 3.5340768277571253,
"grad_norm": 0.11121332621566554,
"learning_rate": 1.8887124054803712e-05,
"loss": 0.301,
"step": 713
},
{
"epoch": 3.5390334572490705,
"grad_norm": 0.16037753358502257,
"learning_rate": 1.8769184187810097e-05,
"loss": 0.3082,
"step": 714
},
{
"epoch": 3.543990086741016,
"grad_norm": 0.11888454595139157,
"learning_rate": 1.8651500727375197e-05,
"loss": 0.3049,
"step": 715
},
{
"epoch": 3.5489467162329618,
"grad_norm": 0.12979326089504686,
"learning_rate": 1.853407509477323e-05,
"loss": 0.3069,
"step": 716
},
{
"epoch": 3.553903345724907,
"grad_norm": 0.11330744270390525,
"learning_rate": 1.8416908708164625e-05,
"loss": 0.3003,
"step": 717
},
{
"epoch": 3.5588599752168526,
"grad_norm": 0.1263040498972687,
"learning_rate": 1.830000298257881e-05,
"loss": 0.3038,
"step": 718
},
{
"epoch": 3.563816604708798,
"grad_norm": 0.1080320775091172,
"learning_rate": 1.8183359329897273e-05,
"loss": 0.3054,
"step": 719
},
{
"epoch": 3.5687732342007434,
"grad_norm": 0.12148959658498093,
"learning_rate": 1.8066979158836324e-05,
"loss": 0.3015,
"step": 720
},
{
"epoch": 3.573729863692689,
"grad_norm": 0.1306325937471217,
"learning_rate": 1.7950863874930272e-05,
"loss": 0.3091,
"step": 721
},
{
"epoch": 3.5786864931846347,
"grad_norm": 0.11558208336385786,
"learning_rate": 1.7835014880514285e-05,
"loss": 0.3054,
"step": 722
},
{
"epoch": 3.58364312267658,
"grad_norm": 0.11233874217296087,
"learning_rate": 1.771943357470759e-05,
"loss": 0.3099,
"step": 723
},
{
"epoch": 3.5885997521685256,
"grad_norm": 0.10715556044653257,
"learning_rate": 1.760412135339646e-05,
"loss": 0.3096,
"step": 724
},
{
"epoch": 3.5935563816604708,
"grad_norm": 0.12755358454831467,
"learning_rate": 1.7489079609217454e-05,
"loss": 0.3067,
"step": 725
},
{
"epoch": 3.5985130111524164,
"grad_norm": 0.11556106841681339,
"learning_rate": 1.7374309731540512e-05,
"loss": 0.302,
"step": 726
},
{
"epoch": 3.603469640644362,
"grad_norm": 0.13828149686762833,
"learning_rate": 1.7259813106452264e-05,
"loss": 0.3042,
"step": 727
},
{
"epoch": 3.6084262701363072,
"grad_norm": 0.11303890136663619,
"learning_rate": 1.7145591116739188e-05,
"loss": 0.3059,
"step": 728
},
{
"epoch": 3.613382899628253,
"grad_norm": 0.15458800679371804,
"learning_rate": 1.7031645141871017e-05,
"loss": 0.3078,
"step": 729
},
{
"epoch": 3.618339529120198,
"grad_norm": 0.10950048353822843,
"learning_rate": 1.691797655798398e-05,
"loss": 0.3012,
"step": 730
},
{
"epoch": 3.6232961586121437,
"grad_norm": 0.14585691976372792,
"learning_rate": 1.680458673786426e-05,
"loss": 0.3056,
"step": 731
},
{
"epoch": 3.6282527881040894,
"grad_norm": 0.10063716590394328,
"learning_rate": 1.66914770509314e-05,
"loss": 0.3059,
"step": 732
},
{
"epoch": 3.6332094175960346,
"grad_norm": 0.16856798984329935,
"learning_rate": 1.6578648863221704e-05,
"loss": 0.2976,
"step": 733
},
{
"epoch": 3.63816604708798,
"grad_norm": 0.09860058370125988,
"learning_rate": 1.6466103537371786e-05,
"loss": 0.3033,
"step": 734
},
{
"epoch": 3.6431226765799254,
"grad_norm": 0.13460381635384605,
"learning_rate": 1.635384243260215e-05,
"loss": 0.302,
"step": 735
},
{
"epoch": 3.648079306071871,
"grad_norm": 0.11916515297319498,
"learning_rate": 1.6241866904700717e-05,
"loss": 0.2986,
"step": 736
},
{
"epoch": 3.6530359355638167,
"grad_norm": 0.12728574264346731,
"learning_rate": 1.6130178306006458e-05,
"loss": 0.3049,
"step": 737
},
{
"epoch": 3.6579925650557623,
"grad_norm": 0.1267215019920343,
"learning_rate": 1.601877798539307e-05,
"loss": 0.3069,
"step": 738
},
{
"epoch": 3.6629491945477075,
"grad_norm": 0.12422633657735385,
"learning_rate": 1.5907667288252698e-05,
"loss": 0.3,
"step": 739
},
{
"epoch": 3.667905824039653,
"grad_norm": 0.10548593638677428,
"learning_rate": 1.5796847556479714e-05,
"loss": 0.3028,
"step": 740
},
{
"epoch": 3.6728624535315983,
"grad_norm": 0.10690753261122879,
"learning_rate": 1.5686320128454407e-05,
"loss": 0.3062,
"step": 741
},
{
"epoch": 3.677819083023544,
"grad_norm": 0.10095888952717658,
"learning_rate": 1.557608633902691e-05,
"loss": 0.3058,
"step": 742
},
{
"epoch": 3.6827757125154896,
"grad_norm": 0.11008343099787869,
"learning_rate": 1.5466147519501074e-05,
"loss": 0.3045,
"step": 743
},
{
"epoch": 3.687732342007435,
"grad_norm": 0.09550712617687758,
"learning_rate": 1.5356504997618382e-05,
"loss": 0.3,
"step": 744
},
{
"epoch": 3.6926889714993805,
"grad_norm": 0.11495321087022631,
"learning_rate": 1.524716009754184e-05,
"loss": 0.2979,
"step": 745
},
{
"epoch": 3.6976456009913257,
"grad_norm": 0.11635709075735622,
"learning_rate": 1.5138114139840117e-05,
"loss": 0.3023,
"step": 746
},
{
"epoch": 3.7026022304832713,
"grad_norm": 0.11348474927513516,
"learning_rate": 1.5029368441471483e-05,
"loss": 0.3015,
"step": 747
},
{
"epoch": 3.707558859975217,
"grad_norm": 0.12353200503351507,
"learning_rate": 1.4920924315767952e-05,
"loss": 0.3018,
"step": 748
},
{
"epoch": 3.7125154894671626,
"grad_norm": 0.11128731875814545,
"learning_rate": 1.4812783072419442e-05,
"loss": 0.3024,
"step": 749
},
{
"epoch": 3.717472118959108,
"grad_norm": 0.12479505016869401,
"learning_rate": 1.4704946017457925e-05,
"loss": 0.3029,
"step": 750
},
{
"epoch": 3.7224287484510534,
"grad_norm": 0.10039835754063967,
"learning_rate": 1.4597414453241636e-05,
"loss": 0.3062,
"step": 751
},
{
"epoch": 3.7273853779429986,
"grad_norm": 0.13104594311695542,
"learning_rate": 1.4490189678439376e-05,
"loss": 0.3044,
"step": 752
},
{
"epoch": 3.7323420074349443,
"grad_norm": 0.11702158446742426,
"learning_rate": 1.4383272988014851e-05,
"loss": 0.3064,
"step": 753
},
{
"epoch": 3.73729863692689,
"grad_norm": 0.11975778168587813,
"learning_rate": 1.427666567321099e-05,
"loss": 0.3067,
"step": 754
},
{
"epoch": 3.742255266418835,
"grad_norm": 0.1181913843002168,
"learning_rate": 1.4170369021534347e-05,
"loss": 0.3015,
"step": 755
},
{
"epoch": 3.7472118959107807,
"grad_norm": 0.1045706641304008,
"learning_rate": 1.4064384316739563e-05,
"loss": 0.3036,
"step": 756
},
{
"epoch": 3.752168525402726,
"grad_norm": 0.11402120830752768,
"learning_rate": 1.3958712838813902e-05,
"loss": 0.3039,
"step": 757
},
{
"epoch": 3.7571251548946716,
"grad_norm": 0.12157415963281785,
"learning_rate": 1.3853355863961731e-05,
"loss": 0.303,
"step": 758
},
{
"epoch": 3.7620817843866172,
"grad_norm": 0.10041584835817995,
"learning_rate": 1.3748314664589169e-05,
"loss": 0.3044,
"step": 759
},
{
"epoch": 3.7670384138785624,
"grad_norm": 0.11527234898628047,
"learning_rate": 1.3643590509288607e-05,
"loss": 0.3022,
"step": 760
},
{
"epoch": 3.771995043370508,
"grad_norm": 0.10863127064484607,
"learning_rate": 1.353918466282354e-05,
"loss": 0.3038,
"step": 761
},
{
"epoch": 3.7769516728624533,
"grad_norm": 0.1101423637817911,
"learning_rate": 1.3435098386113192e-05,
"loss": 0.2996,
"step": 762
},
{
"epoch": 3.781908302354399,
"grad_norm": 0.11761126602863732,
"learning_rate": 1.3331332936217326e-05,
"loss": 0.3039,
"step": 763
},
{
"epoch": 3.7868649318463445,
"grad_norm": 0.10558893915498015,
"learning_rate": 1.3227889566321022e-05,
"loss": 0.3039,
"step": 764
},
{
"epoch": 3.79182156133829,
"grad_norm": 0.10727419606238538,
"learning_rate": 1.3124769525719576e-05,
"loss": 0.3055,
"step": 765
},
{
"epoch": 3.7967781908302354,
"grad_norm": 0.110726016670653,
"learning_rate": 1.3021974059803432e-05,
"loss": 0.3026,
"step": 766
},
{
"epoch": 3.801734820322181,
"grad_norm": 0.09144950044698749,
"learning_rate": 1.2919504410043083e-05,
"loss": 0.2944,
"step": 767
},
{
"epoch": 3.806691449814126,
"grad_norm": 0.1001368569514324,
"learning_rate": 1.2817361813974136e-05,
"loss": 0.3017,
"step": 768
},
{
"epoch": 3.811648079306072,
"grad_norm": 0.09886758402680869,
"learning_rate": 1.2715547505182312e-05,
"loss": 0.3011,
"step": 769
},
{
"epoch": 3.8166047087980175,
"grad_norm": 0.10959965871086583,
"learning_rate": 1.2614062713288608e-05,
"loss": 0.3053,
"step": 770
},
{
"epoch": 3.8215613382899627,
"grad_norm": 0.09813115118616604,
"learning_rate": 1.251290866393438e-05,
"loss": 0.3046,
"step": 771
},
{
"epoch": 3.8265179677819083,
"grad_norm": 0.10075106248289464,
"learning_rate": 1.2412086578766602e-05,
"loss": 0.3068,
"step": 772
},
{
"epoch": 3.8314745972738535,
"grad_norm": 0.11066290728626664,
"learning_rate": 1.2311597675423089e-05,
"loss": 0.3003,
"step": 773
},
{
"epoch": 3.836431226765799,
"grad_norm": 0.09795728766221641,
"learning_rate": 1.2211443167517757e-05,
"loss": 0.301,
"step": 774
},
{
"epoch": 3.841387856257745,
"grad_norm": 0.10258653518600384,
"learning_rate": 1.2111624264626012e-05,
"loss": 0.3048,
"step": 775
},
{
"epoch": 3.8463444857496905,
"grad_norm": 0.10558082756801986,
"learning_rate": 1.2012142172270136e-05,
"loss": 0.3057,
"step": 776
},
{
"epoch": 3.8513011152416357,
"grad_norm": 0.10445656619048024,
"learning_rate": 1.1912998091904724e-05,
"loss": 0.3113,
"step": 777
},
{
"epoch": 3.8562577447335813,
"grad_norm": 0.10212700920367412,
"learning_rate": 1.1814193220902146e-05,
"loss": 0.3031,
"step": 778
},
{
"epoch": 3.8612143742255265,
"grad_norm": 0.10074868067573164,
"learning_rate": 1.1715728752538103e-05,
"loss": 0.308,
"step": 779
},
{
"epoch": 3.866171003717472,
"grad_norm": 0.10571053475203836,
"learning_rate": 1.1617605875977253e-05,
"loss": 0.3045,
"step": 780
},
{
"epoch": 3.8711276332094178,
"grad_norm": 0.10411602926538378,
"learning_rate": 1.1519825776258812e-05,
"loss": 0.3043,
"step": 781
},
{
"epoch": 3.876084262701363,
"grad_norm": 0.10184699452843421,
"learning_rate": 1.142238963428223e-05,
"loss": 0.3025,
"step": 782
},
{
"epoch": 3.8810408921933086,
"grad_norm": 0.09478239661042007,
"learning_rate": 1.1325298626792937e-05,
"loss": 0.3049,
"step": 783
},
{
"epoch": 3.885997521685254,
"grad_norm": 0.09053688283495022,
"learning_rate": 1.1228553926368173e-05,
"loss": 0.3056,
"step": 784
},
{
"epoch": 3.8909541511771994,
"grad_norm": 0.09331448652847525,
"learning_rate": 1.1132156701402796e-05,
"loss": 0.3022,
"step": 785
},
{
"epoch": 3.895910780669145,
"grad_norm": 0.09489554080409102,
"learning_rate": 1.1036108116095142e-05,
"loss": 0.3073,
"step": 786
},
{
"epoch": 3.9008674101610907,
"grad_norm": 0.09039951396430053,
"learning_rate": 1.0940409330432988e-05,
"loss": 0.3038,
"step": 787
},
{
"epoch": 3.905824039653036,
"grad_norm": 0.09184380274512176,
"learning_rate": 1.0845061500179588e-05,
"loss": 0.3075,
"step": 788
},
{
"epoch": 3.9107806691449816,
"grad_norm": 0.09822283803476375,
"learning_rate": 1.0750065776859659e-05,
"loss": 0.2995,
"step": 789
},
{
"epoch": 3.9157372986369268,
"grad_norm": 0.09884065337414615,
"learning_rate": 1.0655423307745463e-05,
"loss": 0.3025,
"step": 790
},
{
"epoch": 3.9206939281288724,
"grad_norm": 0.1023994062957483,
"learning_rate": 1.0561135235843016e-05,
"loss": 0.3062,
"step": 791
},
{
"epoch": 3.925650557620818,
"grad_norm": 0.09323621076945705,
"learning_rate": 1.0467202699878212e-05,
"loss": 0.3072,
"step": 792
},
{
"epoch": 3.9306071871127632,
"grad_norm": 0.0996518708664855,
"learning_rate": 1.0373626834283134e-05,
"loss": 0.2942,
"step": 793
},
{
"epoch": 3.935563816604709,
"grad_norm": 0.09801724942181,
"learning_rate": 1.028040876918229e-05,
"loss": 0.3068,
"step": 794
},
{
"epoch": 3.940520446096654,
"grad_norm": 0.10159923408173142,
"learning_rate": 1.018754963037904e-05,
"loss": 0.3018,
"step": 795
},
{
"epoch": 3.9454770755885997,
"grad_norm": 0.09250643265924624,
"learning_rate": 1.0095050539341926e-05,
"loss": 0.3006,
"step": 796
},
{
"epoch": 3.9504337050805454,
"grad_norm": 0.09946375207125929,
"learning_rate": 1.0002912613191152e-05,
"loss": 0.3053,
"step": 797
},
{
"epoch": 3.9553903345724906,
"grad_norm": 0.10001637507100242,
"learning_rate": 9.911136964685121e-06,
"loss": 0.2997,
"step": 798
},
{
"epoch": 3.960346964064436,
"grad_norm": 0.08720799711260668,
"learning_rate": 9.819724702206984e-06,
"loss": 0.3009,
"step": 799
},
{
"epoch": 3.9653035935563814,
"grad_norm": 0.09577575913167323,
"learning_rate": 9.728676929751235e-06,
"loss": 0.3055,
"step": 800
},
{
"epoch": 3.970260223048327,
"grad_norm": 0.10355284470935523,
"learning_rate": 9.637994746910348e-06,
"loss": 0.308,
"step": 801
},
{
"epoch": 3.9752168525402727,
"grad_norm": 0.09524290206581833,
"learning_rate": 9.547679248861593e-06,
"loss": 0.307,
"step": 802
},
{
"epoch": 3.9801734820322183,
"grad_norm": 0.10448142281939526,
"learning_rate": 9.457731526353725e-06,
"loss": 0.3049,
"step": 803
},
{
"epoch": 3.9851301115241635,
"grad_norm": 0.09263016898548693,
"learning_rate": 9.368152665693864e-06,
"loss": 0.3008,
"step": 804
},
{
"epoch": 3.990086741016109,
"grad_norm": 0.10213714785836968,
"learning_rate": 9.278943748734321e-06,
"loss": 0.3046,
"step": 805
},
{
"epoch": 3.9950433705080544,
"grad_norm": 0.09415919510644356,
"learning_rate": 9.190105852859559e-06,
"loss": 0.3001,
"step": 806
},
{
"epoch": 4.0,
"grad_norm": 0.11841781438474604,
"learning_rate": 9.101640050973213e-06,
"loss": 0.3822,
"step": 807
},
{
"epoch": 4.004956629491946,
"grad_norm": 0.14722791895577392,
"learning_rate": 9.013547411485102e-06,
"loss": 0.2865,
"step": 808
},
{
"epoch": 4.009913258983891,
"grad_norm": 0.12108449172793088,
"learning_rate": 8.925828998298298e-06,
"loss": 0.2798,
"step": 809
},
{
"epoch": 4.014869888475836,
"grad_norm": 0.1016479933236839,
"learning_rate": 8.83848587079632e-06,
"loss": 0.2802,
"step": 810
},
{
"epoch": 4.019826517967782,
"grad_norm": 0.11127976713554319,
"learning_rate": 8.75151908383034e-06,
"loss": 0.2792,
"step": 811
},
{
"epoch": 4.024783147459727,
"grad_norm": 0.13578069554004418,
"learning_rate": 8.664929687706434e-06,
"loss": 0.281,
"step": 812
},
{
"epoch": 4.029739776951673,
"grad_norm": 0.1360686294776198,
"learning_rate": 8.578718728172868e-06,
"loss": 0.281,
"step": 813
},
{
"epoch": 4.034696406443619,
"grad_norm": 0.11601909584488747,
"learning_rate": 8.492887246407489e-06,
"loss": 0.284,
"step": 814
},
{
"epoch": 4.039653035935564,
"grad_norm": 0.1154783460259016,
"learning_rate": 8.407436279005203e-06,
"loss": 0.2833,
"step": 815
},
{
"epoch": 4.044609665427509,
"grad_norm": 0.11725746533152305,
"learning_rate": 8.322366857965356e-06,
"loss": 0.2813,
"step": 816
},
{
"epoch": 4.049566294919455,
"grad_norm": 0.1181058547228756,
"learning_rate": 8.237680010679345e-06,
"loss": 0.2811,
"step": 817
},
{
"epoch": 4.0545229244114,
"grad_norm": 0.11517240470012005,
"learning_rate": 8.153376759918207e-06,
"loss": 0.2822,
"step": 818
},
{
"epoch": 4.059479553903346,
"grad_norm": 0.11125181982315334,
"learning_rate": 8.06945812382022e-06,
"loss": 0.2857,
"step": 819
},
{
"epoch": 4.064436183395292,
"grad_norm": 0.11133148377505786,
"learning_rate": 7.985925115878621e-06,
"loss": 0.2795,
"step": 820
},
{
"epoch": 4.069392812887236,
"grad_norm": 0.10387667204411895,
"learning_rate": 7.902778744929414e-06,
"loss": 0.2852,
"step": 821
},
{
"epoch": 4.074349442379182,
"grad_norm": 0.10525576161206568,
"learning_rate": 7.820020015139156e-06,
"loss": 0.2836,
"step": 822
},
{
"epoch": 4.079306071871128,
"grad_norm": 0.108828574574679,
"learning_rate": 7.737649925992792e-06,
"loss": 0.277,
"step": 823
},
{
"epoch": 4.084262701363073,
"grad_norm": 0.10127418772217861,
"learning_rate": 7.655669472281625e-06,
"loss": 0.2839,
"step": 824
},
{
"epoch": 4.089219330855019,
"grad_norm": 0.10115795681430574,
"learning_rate": 7.57407964409131e-06,
"loss": 0.283,
"step": 825
},
{
"epoch": 4.094175960346964,
"grad_norm": 0.09518739210847543,
"learning_rate": 7.492881426789882e-06,
"loss": 0.2814,
"step": 826
},
{
"epoch": 4.099132589838909,
"grad_norm": 0.0959614198507469,
"learning_rate": 7.412075801015843e-06,
"loss": 0.285,
"step": 827
},
{
"epoch": 4.104089219330855,
"grad_norm": 0.10585234312512412,
"learning_rate": 7.331663742666317e-06,
"loss": 0.2763,
"step": 828
},
{
"epoch": 4.1090458488228006,
"grad_norm": 0.1004428715176425,
"learning_rate": 7.251646222885305e-06,
"loss": 0.2821,
"step": 829
},
{
"epoch": 4.114002478314746,
"grad_norm": 0.09091700534277954,
"learning_rate": 7.172024208051925e-06,
"loss": 0.2823,
"step": 830
},
{
"epoch": 4.118959107806692,
"grad_norm": 0.09687764910942183,
"learning_rate": 7.09279865976872e-06,
"loss": 0.2822,
"step": 831
},
{
"epoch": 4.123915737298637,
"grad_norm": 0.0958781996446087,
"learning_rate": 7.013970534850103e-06,
"loss": 0.2806,
"step": 832
},
{
"epoch": 4.128872366790582,
"grad_norm": 0.09496634844167438,
"learning_rate": 6.935540785310731e-06,
"loss": 0.2782,
"step": 833
},
{
"epoch": 4.133828996282528,
"grad_norm": 0.09824356232858408,
"learning_rate": 6.857510358354078e-06,
"loss": 0.2868,
"step": 834
},
{
"epoch": 4.1387856257744735,
"grad_norm": 0.09351975487729051,
"learning_rate": 6.7798801963609375e-06,
"loss": 0.2798,
"step": 835
},
{
"epoch": 4.143742255266419,
"grad_norm": 0.09995853450151393,
"learning_rate": 6.702651236878086e-06,
"loss": 0.2845,
"step": 836
},
{
"epoch": 4.148698884758364,
"grad_norm": 0.09083827830168457,
"learning_rate": 6.625824412606911e-06,
"loss": 0.279,
"step": 837
},
{
"epoch": 4.1536555142503095,
"grad_norm": 0.09393734030049775,
"learning_rate": 6.549400651392215e-06,
"loss": 0.2818,
"step": 838
},
{
"epoch": 4.158612143742255,
"grad_norm": 0.08659148767062241,
"learning_rate": 6.473380876210927e-06,
"loss": 0.2851,
"step": 839
},
{
"epoch": 4.163568773234201,
"grad_norm": 0.08429036468561842,
"learning_rate": 6.397766005161035e-06,
"loss": 0.2828,
"step": 840
},
{
"epoch": 4.1685254027261465,
"grad_norm": 0.09080594271610455,
"learning_rate": 6.322556951450431e-06,
"loss": 0.2828,
"step": 841
},
{
"epoch": 4.173482032218092,
"grad_norm": 0.09298606471973449,
"learning_rate": 6.247754623385946e-06,
"loss": 0.2817,
"step": 842
},
{
"epoch": 4.178438661710037,
"grad_norm": 0.09224315456638131,
"learning_rate": 6.173359924362312e-06,
"loss": 0.2818,
"step": 843
},
{
"epoch": 4.1833952912019825,
"grad_norm": 0.08293341261202053,
"learning_rate": 6.0993737528513055e-06,
"loss": 0.2823,
"step": 844
},
{
"epoch": 4.188351920693928,
"grad_norm": 0.08879511214700042,
"learning_rate": 6.025797002390894e-06,
"loss": 0.2845,
"step": 845
},
{
"epoch": 4.193308550185874,
"grad_norm": 0.08314309847087989,
"learning_rate": 5.952630561574402e-06,
"loss": 0.2884,
"step": 846
},
{
"epoch": 4.198265179677819,
"grad_norm": 0.09240969634004986,
"learning_rate": 5.8798753140397956e-06,
"loss": 0.2813,
"step": 847
},
{
"epoch": 4.203221809169764,
"grad_norm": 0.08832272007337454,
"learning_rate": 5.807532138459056e-06,
"loss": 0.2796,
"step": 848
},
{
"epoch": 4.20817843866171,
"grad_norm": 0.08855693742599202,
"learning_rate": 5.735601908527528e-06,
"loss": 0.2852,
"step": 849
},
{
"epoch": 4.2131350681536555,
"grad_norm": 0.09166662154860299,
"learning_rate": 5.664085492953347e-06,
"loss": 0.2808,
"step": 850
},
{
"epoch": 4.218091697645601,
"grad_norm": 0.09526345498603268,
"learning_rate": 5.592983755446981e-06,
"loss": 0.2853,
"step": 851
},
{
"epoch": 4.223048327137547,
"grad_norm": 0.08830250030108663,
"learning_rate": 5.52229755471081e-06,
"loss": 0.2796,
"step": 852
},
{
"epoch": 4.228004956629492,
"grad_norm": 0.0860713986805829,
"learning_rate": 5.452027744428732e-06,
"loss": 0.2803,
"step": 853
},
{
"epoch": 4.232961586121437,
"grad_norm": 0.09234109531994655,
"learning_rate": 5.382175173255846e-06,
"loss": 0.2839,
"step": 854
},
{
"epoch": 4.237918215613383,
"grad_norm": 0.08834860437724747,
"learning_rate": 5.312740684808209e-06,
"loss": 0.2838,
"step": 855
},
{
"epoch": 4.242874845105328,
"grad_norm": 0.08133438171291063,
"learning_rate": 5.24372511765268e-06,
"loss": 0.2818,
"step": 856
},
{
"epoch": 4.247831474597274,
"grad_norm": 0.08160907852625027,
"learning_rate": 5.17512930529676e-06,
"loss": 0.2823,
"step": 857
},
{
"epoch": 4.25278810408922,
"grad_norm": 0.09136437292265856,
"learning_rate": 5.106954076178503e-06,
"loss": 0.2828,
"step": 858
},
{
"epoch": 4.2577447335811645,
"grad_norm": 0.09152299706832191,
"learning_rate": 5.039200253656584e-06,
"loss": 0.2832,
"step": 859
},
{
"epoch": 4.26270136307311,
"grad_norm": 0.08998360407936888,
"learning_rate": 4.971868656000278e-06,
"loss": 0.2804,
"step": 860
},
{
"epoch": 4.267657992565056,
"grad_norm": 0.08355009921069978,
"learning_rate": 4.904960096379609e-06,
"loss": 0.278,
"step": 861
},
{
"epoch": 4.272614622057001,
"grad_norm": 0.09162982326472421,
"learning_rate": 4.838475382855556e-06,
"loss": 0.2817,
"step": 862
},
{
"epoch": 4.277571251548947,
"grad_norm": 0.08769303434470369,
"learning_rate": 4.7724153183702586e-06,
"loss": 0.285,
"step": 863
},
{
"epoch": 4.282527881040892,
"grad_norm": 0.07983368734380246,
"learning_rate": 4.706780700737317e-06,
"loss": 0.2814,
"step": 864
},
{
"epoch": 4.287484510532837,
"grad_norm": 0.08034220721366041,
"learning_rate": 4.641572322632177e-06,
"loss": 0.2791,
"step": 865
},
{
"epoch": 4.292441140024783,
"grad_norm": 0.08230090157319601,
"learning_rate": 4.576790971582559e-06,
"loss": 0.2789,
"step": 866
},
{
"epoch": 4.297397769516729,
"grad_norm": 0.08740869773472577,
"learning_rate": 4.512437429958936e-06,
"loss": 0.2802,
"step": 867
},
{
"epoch": 4.302354399008674,
"grad_norm": 0.08406629521865011,
"learning_rate": 4.448512474965072e-06,
"loss": 0.2841,
"step": 868
},
{
"epoch": 4.30731102850062,
"grad_norm": 0.0806706965026141,
"learning_rate": 4.385016878628654e-06,
"loss": 0.2852,
"step": 869
},
{
"epoch": 4.312267657992565,
"grad_norm": 0.08521916884499932,
"learning_rate": 4.321951407791977e-06,
"loss": 0.2804,
"step": 870
},
{
"epoch": 4.31722428748451,
"grad_norm": 0.07929363763871186,
"learning_rate": 4.25931682410266e-06,
"loss": 0.2853,
"step": 871
},
{
"epoch": 4.322180916976456,
"grad_norm": 0.08265437801922285,
"learning_rate": 4.197113884004473e-06,
"loss": 0.2864,
"step": 872
},
{
"epoch": 4.327137546468402,
"grad_norm": 0.08998988120810332,
"learning_rate": 4.135343338728142e-06,
"loss": 0.2834,
"step": 873
},
{
"epoch": 4.332094175960347,
"grad_norm": 0.08132773635051033,
"learning_rate": 4.074005934282368e-06,
"loss": 0.2836,
"step": 874
},
{
"epoch": 4.337050805452292,
"grad_norm": 0.08585364564919164,
"learning_rate": 4.013102411444752e-06,
"loss": 0.2808,
"step": 875
},
{
"epoch": 4.342007434944238,
"grad_norm": 0.08217709909623434,
"learning_rate": 3.95263350575287e-06,
"loss": 0.284,
"step": 876
},
{
"epoch": 4.346964064436183,
"grad_norm": 0.08717986026942609,
"learning_rate": 3.892599947495379e-06,
"loss": 0.2875,
"step": 877
},
{
"epoch": 4.351920693928129,
"grad_norm": 0.0908634629774549,
"learning_rate": 3.8330024617031906e-06,
"loss": 0.2834,
"step": 878
},
{
"epoch": 4.356877323420075,
"grad_norm": 0.08175206838736869,
"learning_rate": 3.7738417681407647e-06,
"loss": 0.2806,
"step": 879
},
{
"epoch": 4.36183395291202,
"grad_norm": 0.0787428947081775,
"learning_rate": 3.7151185812973435e-06,
"loss": 0.2818,
"step": 880
},
{
"epoch": 4.366790582403965,
"grad_norm": 0.08111202314877745,
"learning_rate": 3.656833610378394e-06,
"loss": 0.2839,
"step": 881
},
{
"epoch": 4.371747211895911,
"grad_norm": 0.08482710438193812,
"learning_rate": 3.5989875592969694e-06,
"loss": 0.2869,
"step": 882
},
{
"epoch": 4.376703841387856,
"grad_norm": 0.08182735521141883,
"learning_rate": 3.5415811266652856e-06,
"loss": 0.2821,
"step": 883
},
{
"epoch": 4.381660470879802,
"grad_norm": 0.08194368399664302,
"learning_rate": 3.4846150057862115e-06,
"loss": 0.2792,
"step": 884
},
{
"epoch": 4.386617100371748,
"grad_norm": 0.08181551891620019,
"learning_rate": 3.428089884644954e-06,
"loss": 0.2794,
"step": 885
},
{
"epoch": 4.391573729863692,
"grad_norm": 0.08743666320985356,
"learning_rate": 3.3720064459007218e-06,
"loss": 0.28,
"step": 886
},
{
"epoch": 4.396530359355638,
"grad_norm": 0.07883732835031036,
"learning_rate": 3.316365366878471e-06,
"loss": 0.2814,
"step": 887
},
{
"epoch": 4.401486988847584,
"grad_norm": 0.084162773543217,
"learning_rate": 3.261167319560734e-06,
"loss": 0.2857,
"step": 888
},
{
"epoch": 4.406443618339529,
"grad_norm": 0.07717717817978705,
"learning_rate": 3.2064129705795266e-06,
"loss": 0.2857,
"step": 889
},
{
"epoch": 4.411400247831475,
"grad_norm": 0.07800427937716604,
"learning_rate": 3.1521029812082803e-06,
"loss": 0.2866,
"step": 890
},
{
"epoch": 4.41635687732342,
"grad_norm": 0.07848859258578157,
"learning_rate": 3.098238007353831e-06,
"loss": 0.2828,
"step": 891
},
{
"epoch": 4.421313506815365,
"grad_norm": 0.0904568854051929,
"learning_rate": 3.0448186995485307e-06,
"loss": 0.2864,
"step": 892
},
{
"epoch": 4.426270136307311,
"grad_norm": 0.07851394101041086,
"learning_rate": 2.991845702942389e-06,
"loss": 0.2824,
"step": 893
},
{
"epoch": 4.431226765799257,
"grad_norm": 0.08119789855176456,
"learning_rate": 2.939319657295263e-06,
"loss": 0.2803,
"step": 894
},
{
"epoch": 4.436183395291202,
"grad_norm": 0.08619128184770572,
"learning_rate": 2.8872411969691527e-06,
"loss": 0.2828,
"step": 895
},
{
"epoch": 4.441140024783148,
"grad_norm": 0.08498957696639169,
"learning_rate": 2.8356109509205e-06,
"loss": 0.2817,
"step": 896
},
{
"epoch": 4.446096654275093,
"grad_norm": 0.07444800742896455,
"learning_rate": 2.7844295426926593e-06,
"loss": 0.2848,
"step": 897
},
{
"epoch": 4.451053283767038,
"grad_norm": 0.07560633490699613,
"learning_rate": 2.7336975904083085e-06,
"loss": 0.2811,
"step": 898
},
{
"epoch": 4.456009913258984,
"grad_norm": 0.07621442311288926,
"learning_rate": 2.683415706762e-06,
"loss": 0.2779,
"step": 899
},
{
"epoch": 4.4609665427509295,
"grad_norm": 0.07853324700844574,
"learning_rate": 2.6335844990127646e-06,
"loss": 0.281,
"step": 900
},
{
"epoch": 4.465923172242875,
"grad_norm": 0.08060513652848875,
"learning_rate": 2.5842045689767935e-06,
"loss": 0.281,
"step": 901
},
{
"epoch": 4.47087980173482,
"grad_norm": 0.0770124294811412,
"learning_rate": 2.535276513020142e-06,
"loss": 0.2872,
"step": 902
},
{
"epoch": 4.4758364312267656,
"grad_norm": 0.07941564944704892,
"learning_rate": 2.48680092205154e-06,
"loss": 0.2812,
"step": 903
},
{
"epoch": 4.480793060718711,
"grad_norm": 0.07746658082235451,
"learning_rate": 2.4387783815152634e-06,
"loss": 0.2831,
"step": 904
},
{
"epoch": 4.485749690210657,
"grad_norm": 0.07801782052601454,
"learning_rate": 2.3912094713840395e-06,
"loss": 0.2864,
"step": 905
},
{
"epoch": 4.4907063197026025,
"grad_norm": 0.07813726856842436,
"learning_rate": 2.3440947661520763e-06,
"loss": 0.2801,
"step": 906
},
{
"epoch": 4.495662949194548,
"grad_norm": 0.07746692349365746,
"learning_rate": 2.297434834828094e-06,
"loss": 0.281,
"step": 907
},
{
"epoch": 4.500619578686493,
"grad_norm": 0.07816724100489567,
"learning_rate": 2.2512302409284724e-06,
"loss": 0.286,
"step": 908
},
{
"epoch": 4.5055762081784385,
"grad_norm": 0.07670669793398054,
"learning_rate": 2.2054815424704447e-06,
"loss": 0.2834,
"step": 909
},
{
"epoch": 4.510532837670384,
"grad_norm": 0.07813664769713757,
"learning_rate": 2.1601892919653223e-06,
"loss": 0.2833,
"step": 910
},
{
"epoch": 4.51548946716233,
"grad_norm": 0.07854983906424601,
"learning_rate": 2.1153540364118895e-06,
"loss": 0.2864,
"step": 911
},
{
"epoch": 4.520446096654275,
"grad_norm": 0.07823949746195673,
"learning_rate": 2.0709763172897366e-06,
"loss": 0.2829,
"step": 912
},
{
"epoch": 4.52540272614622,
"grad_norm": 0.07798016395999526,
"learning_rate": 2.027056670552767e-06,
"loss": 0.2865,
"step": 913
},
{
"epoch": 4.530359355638166,
"grad_norm": 0.07835829227806566,
"learning_rate": 1.9835956266226564e-06,
"loss": 0.281,
"step": 914
},
{
"epoch": 4.5353159851301115,
"grad_norm": 0.07473839377544324,
"learning_rate": 1.9405937103825323e-06,
"loss": 0.2794,
"step": 915
},
{
"epoch": 4.540272614622057,
"grad_norm": 0.07424740849428764,
"learning_rate": 1.8980514411705764e-06,
"loss": 0.2829,
"step": 916
},
{
"epoch": 4.545229244114003,
"grad_norm": 0.07948180946158458,
"learning_rate": 1.8559693327737881e-06,
"loss": 0.2872,
"step": 917
},
{
"epoch": 4.5501858736059475,
"grad_norm": 0.07490654186327057,
"learning_rate": 1.814347893421733e-06,
"loss": 0.2815,
"step": 918
},
{
"epoch": 4.555142503097893,
"grad_norm": 0.07699939647242027,
"learning_rate": 1.7731876257804436e-06,
"loss": 0.284,
"step": 919
},
{
"epoch": 4.560099132589839,
"grad_norm": 0.07804147212637814,
"learning_rate": 1.7324890269463513e-06,
"loss": 0.2814,
"step": 920
},
{
"epoch": 4.565055762081784,
"grad_norm": 0.07825477347641324,
"learning_rate": 1.692252588440262e-06,
"loss": 0.2845,
"step": 921
},
{
"epoch": 4.57001239157373,
"grad_norm": 0.07437512487598033,
"learning_rate": 1.6524787962014244e-06,
"loss": 0.2798,
"step": 922
},
{
"epoch": 4.574969021065676,
"grad_norm": 0.07557314641029611,
"learning_rate": 1.6131681305816637e-06,
"loss": 0.2873,
"step": 923
},
{
"epoch": 4.5799256505576205,
"grad_norm": 0.07775126489770805,
"learning_rate": 1.5743210663395813e-06,
"loss": 0.2847,
"step": 924
},
{
"epoch": 4.584882280049566,
"grad_norm": 0.07766485367642273,
"learning_rate": 1.5359380726348394e-06,
"loss": 0.2821,
"step": 925
},
{
"epoch": 4.589838909541512,
"grad_norm": 0.07875944113674188,
"learning_rate": 1.49801961302245e-06,
"loss": 0.2812,
"step": 926
},
{
"epoch": 4.594795539033457,
"grad_norm": 0.07557815096974223,
"learning_rate": 1.4605661454472153e-06,
"loss": 0.2827,
"step": 927
},
{
"epoch": 4.599752168525403,
"grad_norm": 0.07912225031104035,
"learning_rate": 1.4235781222381895e-06,
"loss": 0.284,
"step": 928
},
{
"epoch": 4.604708798017349,
"grad_norm": 0.07482374187052634,
"learning_rate": 1.3870559901031987e-06,
"loss": 0.2862,
"step": 929
},
{
"epoch": 4.609665427509293,
"grad_norm": 0.07746410057512453,
"learning_rate": 1.3510001901234725e-06,
"loss": 0.2878,
"step": 930
},
{
"epoch": 4.614622057001239,
"grad_norm": 0.07418347712699255,
"learning_rate": 1.315411157748301e-06,
"loss": 0.2831,
"step": 931
},
{
"epoch": 4.619578686493185,
"grad_norm": 0.07796826236037331,
"learning_rate": 1.2802893227897672e-06,
"loss": 0.2907,
"step": 932
},
{
"epoch": 4.62453531598513,
"grad_norm": 0.07337212609853155,
"learning_rate": 1.2456351094175756e-06,
"loss": 0.2814,
"step": 933
},
{
"epoch": 4.629491945477076,
"grad_norm": 0.07760397710295686,
"learning_rate": 1.2114489361539205e-06,
"loss": 0.2819,
"step": 934
},
{
"epoch": 4.634448574969021,
"grad_norm": 0.07402370372538353,
"learning_rate": 1.1777312158684339e-06,
"loss": 0.2866,
"step": 935
},
{
"epoch": 4.639405204460966,
"grad_norm": 0.07319655425900132,
"learning_rate": 1.1444823557731887e-06,
"loss": 0.2826,
"step": 936
},
{
"epoch": 4.644361833952912,
"grad_norm": 0.08008983797140948,
"learning_rate": 1.1117027574177918e-06,
"loss": 0.2842,
"step": 937
},
{
"epoch": 4.649318463444858,
"grad_norm": 0.07566854910903925,
"learning_rate": 1.0793928166845436e-06,
"loss": 0.2811,
"step": 938
},
{
"epoch": 4.654275092936803,
"grad_norm": 0.07349105620187181,
"learning_rate": 1.0475529237836325e-06,
"loss": 0.2845,
"step": 939
},
{
"epoch": 4.659231722428748,
"grad_norm": 0.07639825926061859,
"learning_rate": 1.0161834632484368e-06,
"loss": 0.2823,
"step": 940
},
{
"epoch": 4.664188351920694,
"grad_norm": 0.07596374087858862,
"learning_rate": 9.8528481393088e-07,
"loss": 0.282,
"step": 941
},
{
"epoch": 4.669144981412639,
"grad_norm": 0.07448678341002254,
"learning_rate": 9.54857348996856e-07,
"loss": 0.2899,
"step": 942
},
{
"epoch": 4.674101610904585,
"grad_norm": 0.07508881677292457,
"learning_rate": 9.249014359217256e-07,
"loss": 0.2853,
"step": 943
},
{
"epoch": 4.679058240396531,
"grad_norm": 0.07395798034861722,
"learning_rate": 8.954174364858548e-07,
"loss": 0.2832,
"step": 944
},
{
"epoch": 4.684014869888475,
"grad_norm": 0.07501163398901284,
"learning_rate": 8.664057067702924e-07,
"loss": 0.2815,
"step": 945
},
{
"epoch": 4.688971499380421,
"grad_norm": 0.07709895170244532,
"learning_rate": 8.378665971524147e-07,
"loss": 0.2855,
"step": 946
},
{
"epoch": 4.693928128872367,
"grad_norm": 0.07216153858578365,
"learning_rate": 8.098004523017367e-07,
"loss": 0.2825,
"step": 947
},
{
"epoch": 4.698884758364312,
"grad_norm": 0.0714764773582895,
"learning_rate": 7.822076111757205e-07,
"loss": 0.2803,
"step": 948
},
{
"epoch": 4.703841387856258,
"grad_norm": 0.07349501739897359,
"learning_rate": 7.55088407015716e-07,
"loss": 0.283,
"step": 949
},
{
"epoch": 4.708798017348204,
"grad_norm": 0.07253316663670173,
"learning_rate": 7.284431673428937e-07,
"loss": 0.288,
"step": 950
},
{
"epoch": 4.713754646840148,
"grad_norm": 0.07338883975150082,
"learning_rate": 7.022722139543225e-07,
"loss": 0.2834,
"step": 951
},
{
"epoch": 4.718711276332094,
"grad_norm": 0.07911708566826511,
"learning_rate": 6.765758629190578e-07,
"loss": 0.2866,
"step": 952
},
{
"epoch": 4.72366790582404,
"grad_norm": 0.07502515318559184,
"learning_rate": 6.513544245743575e-07,
"loss": 0.2856,
"step": 953
},
{
"epoch": 4.728624535315985,
"grad_norm": 0.07421022982739711,
"learning_rate": 6.26608203521899e-07,
"loss": 0.2831,
"step": 954
},
{
"epoch": 4.733581164807931,
"grad_norm": 0.07497763527285664,
"learning_rate": 6.023374986241193e-07,
"loss": 0.2809,
"step": 955
},
{
"epoch": 4.7385377942998765,
"grad_norm": 0.07670570586677927,
"learning_rate": 5.785426030006091e-07,
"loss": 0.2856,
"step": 956
},
{
"epoch": 4.743494423791821,
"grad_norm": 0.07480580777663845,
"learning_rate": 5.552238040245516e-07,
"loss": 0.2821,
"step": 957
},
{
"epoch": 4.748451053283767,
"grad_norm": 0.07308664128147804,
"learning_rate": 5.323813833192848e-07,
"loss": 0.2821,
"step": 958
},
{
"epoch": 4.753407682775713,
"grad_norm": 0.07166740014206943,
"learning_rate": 5.100156167548642e-07,
"loss": 0.2857,
"step": 959
},
{
"epoch": 4.758364312267658,
"grad_norm": 0.0750156333319193,
"learning_rate": 4.881267744447548e-07,
"loss": 0.2807,
"step": 960
},
{
"epoch": 4.763320941759604,
"grad_norm": 0.0760232819168498,
"learning_rate": 4.6671512074256686e-07,
"loss": 0.2835,
"step": 961
},
{
"epoch": 4.768277571251549,
"grad_norm": 0.07186700445060604,
"learning_rate": 4.4578091423885804e-07,
"loss": 0.2851,
"step": 962
},
{
"epoch": 4.773234200743494,
"grad_norm": 0.0699929775880697,
"learning_rate": 4.253244077580032e-07,
"loss": 0.2887,
"step": 963
},
{
"epoch": 4.77819083023544,
"grad_norm": 0.07177034127350093,
"learning_rate": 4.05345848355152e-07,
"loss": 0.2814,
"step": 964
},
{
"epoch": 4.7831474597273855,
"grad_norm": 0.07318619478686327,
"learning_rate": 3.858454773132492e-07,
"loss": 0.2851,
"step": 965
},
{
"epoch": 4.788104089219331,
"grad_norm": 0.07196381092321871,
"learning_rate": 3.66823530140108e-07,
"loss": 0.2806,
"step": 966
},
{
"epoch": 4.793060718711276,
"grad_norm": 0.07269362276261772,
"learning_rate": 3.4828023656557687e-07,
"loss": 0.2848,
"step": 967
},
{
"epoch": 4.798017348203222,
"grad_norm": 0.07207708326827385,
"learning_rate": 3.302158205387507e-07,
"loss": 0.2826,
"step": 968
},
{
"epoch": 4.802973977695167,
"grad_norm": 0.07133213853387121,
"learning_rate": 3.1263050022528385e-07,
"loss": 0.2805,
"step": 969
},
{
"epoch": 4.807930607187113,
"grad_norm": 0.07251252362084308,
"learning_rate": 2.955244880047392e-07,
"loss": 0.2839,
"step": 970
},
{
"epoch": 4.8128872366790585,
"grad_norm": 0.070298461918812,
"learning_rate": 2.7889799046803446e-07,
"loss": 0.2781,
"step": 971
},
{
"epoch": 4.817843866171003,
"grad_norm": 0.07430381244740472,
"learning_rate": 2.6275120841495083e-07,
"loss": 0.2837,
"step": 972
},
{
"epoch": 4.822800495662949,
"grad_norm": 0.06981369751740535,
"learning_rate": 2.4708433685169064e-07,
"loss": 0.2809,
"step": 973
},
{
"epoch": 4.8277571251548945,
"grad_norm": 0.07352139801671918,
"learning_rate": 2.31897564988528e-07,
"loss": 0.2868,
"step": 974
},
{
"epoch": 4.83271375464684,
"grad_norm": 0.07432448801089282,
"learning_rate": 2.1719107623753955e-07,
"loss": 0.2797,
"step": 975
},
{
"epoch": 4.837670384138786,
"grad_norm": 0.07063237991275789,
"learning_rate": 2.0296504821037067e-07,
"loss": 0.2865,
"step": 976
},
{
"epoch": 4.8426270136307314,
"grad_norm": 0.07228098025327327,
"learning_rate": 1.8921965271610387e-07,
"loss": 0.2818,
"step": 977
},
{
"epoch": 4.847583643122676,
"grad_norm": 0.07637299443039416,
"learning_rate": 1.759550557591716e-07,
"loss": 0.283,
"step": 978
},
{
"epoch": 4.852540272614622,
"grad_norm": 0.07208686982109491,
"learning_rate": 1.6317141753735334e-07,
"loss": 0.2812,
"step": 979
},
{
"epoch": 4.8574969021065675,
"grad_norm": 0.07114579990275795,
"learning_rate": 1.5086889243985715e-07,
"loss": 0.2805,
"step": 980
},
{
"epoch": 4.862453531598513,
"grad_norm": 0.07434434344635836,
"learning_rate": 1.390476290454279e-07,
"loss": 0.284,
"step": 981
},
{
"epoch": 4.867410161090459,
"grad_norm": 0.07430905888830036,
"learning_rate": 1.2770777012057978e-07,
"loss": 0.2862,
"step": 982
},
{
"epoch": 4.872366790582404,
"grad_norm": 0.07079802955341113,
"learning_rate": 1.1684945261785541e-07,
"loss": 0.2831,
"step": 983
},
{
"epoch": 4.877323420074349,
"grad_norm": 0.07119983463830963,
"learning_rate": 1.064728076741739e-07,
"loss": 0.2838,
"step": 984
},
{
"epoch": 4.882280049566295,
"grad_norm": 0.07152021858357932,
"learning_rate": 9.657796060925429e-08,
"loss": 0.2854,
"step": 985
},
{
"epoch": 4.88723667905824,
"grad_norm": 0.0727452791489002,
"learning_rate": 8.716503092409679e-08,
"loss": 0.2842,
"step": 986
},
{
"epoch": 4.892193308550186,
"grad_norm": 0.07330856909647866,
"learning_rate": 7.823413229953058e-08,
"loss": 0.2835,
"step": 987
},
{
"epoch": 4.897149938042132,
"grad_norm": 0.07217490209474922,
"learning_rate": 6.97853725948594e-08,
"loss": 0.2834,
"step": 988
},
{
"epoch": 4.9021065675340765,
"grad_norm": 0.07249892909698981,
"learning_rate": 6.181885384656028e-08,
"loss": 0.281,
"step": 989
},
{
"epoch": 4.907063197026022,
"grad_norm": 0.07439509854947902,
"learning_rate": 5.43346722670135e-08,
"loss": 0.2817,
"step": 990
},
{
"epoch": 4.912019826517968,
"grad_norm": 0.07152145735511466,
"learning_rate": 4.733291824339237e-08,
"loss": 0.2832,
"step": 991
},
{
"epoch": 4.916976456009913,
"grad_norm": 0.07325341342974692,
"learning_rate": 4.0813676336539656e-08,
"loss": 0.2844,
"step": 992
},
{
"epoch": 4.921933085501859,
"grad_norm": 0.0703233636733742,
"learning_rate": 3.4777025279950635e-08,
"loss": 0.2814,
"step": 993
},
{
"epoch": 4.926889714993804,
"grad_norm": 0.07009377049264516,
"learning_rate": 2.9223037978822755e-08,
"loss": 0.2838,
"step": 994
},
{
"epoch": 4.931846344485749,
"grad_norm": 0.0735843894676601,
"learning_rate": 2.415178150918962e-08,
"loss": 0.2838,
"step": 995
},
{
"epoch": 4.936802973977695,
"grad_norm": 0.07197069283776517,
"learning_rate": 1.9563317117090585e-08,
"loss": 0.2846,
"step": 996
},
{
"epoch": 4.941759603469641,
"grad_norm": 0.07023145383915408,
"learning_rate": 1.545770021783799e-08,
"loss": 0.2812,
"step": 997
},
{
"epoch": 4.946716232961586,
"grad_norm": 0.07391522734344161,
"learning_rate": 1.1834980395359907e-08,
"loss": 0.2829,
"step": 998
},
{
"epoch": 4.951672862453531,
"grad_norm": 0.07202531152737036,
"learning_rate": 8.695201401578424e-09,
"loss": 0.2786,
"step": 999
},
{
"epoch": 4.956629491945477,
"grad_norm": 0.07119570489216936,
"learning_rate": 6.038401155903373e-09,
"loss": 0.2809,
"step": 1000
},
{
"epoch": 4.961586121437422,
"grad_norm": 0.07309534984608951,
"learning_rate": 3.864611744757163e-09,
"loss": 0.2808,
"step": 1001
},
{
"epoch": 4.966542750929368,
"grad_norm": 0.07040039615225531,
"learning_rate": 2.1738594212061816e-09,
"loss": 0.2832,
"step": 1002
},
{
"epoch": 4.971499380421314,
"grad_norm": 0.06981510822264225,
"learning_rate": 9.661646046144057e-10,
"loss": 0.2799,
"step": 1003
},
{
"epoch": 4.976456009913259,
"grad_norm": 0.0715282791329122,
"learning_rate": 2.415418804346814e-10,
"loss": 0.2867,
"step": 1004
},
{
"epoch": 4.981412639405205,
"grad_norm": 0.0720799754466833,
"learning_rate": 0.0,
"loss": 0.2831,
"step": 1005
},
{
"epoch": 4.981412639405205,
"step": 1005,
"total_flos": 2.069268270658722e+19,
"train_loss": 0.35086224974684455,
"train_runtime": 52973.0269,
"train_samples_per_second": 9.749,
"train_steps_per_second": 0.019
}
],
"logging_steps": 1.0,
"max_steps": 1005,
"num_input_tokens_seen": 0,
"num_train_epochs": 5,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 2.069268270658722e+19,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}