PEFT
Safetensors
cadquery_create_basic_forms / trainer_state.json
rumike7's picture
Upload 13 files
70377c6 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 9.547446153846154,
"eval_steps": 500,
"global_step": 19400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.32,
"grad_norm": 0.48755398392677307,
"learning_rate": 9e-06,
"loss": 2.9601,
"mean_token_accuracy": 0.4284199851565063,
"num_tokens": 15763.0,
"step": 10
},
{
"epoch": 0.64,
"grad_norm": 0.7431631088256836,
"learning_rate": 1.9e-05,
"loss": 3.1197,
"mean_token_accuracy": 0.42047689845785496,
"num_tokens": 30510.0,
"step": 20
},
{
"epoch": 0.96,
"grad_norm": 0.9281144142150879,
"learning_rate": 1.9999959867760483e-05,
"loss": 3.0392,
"mean_token_accuracy": 0.41553077606949956,
"num_tokens": 44066.0,
"step": 30
},
{
"epoch": 1.256,
"grad_norm": 1.2322081327438354,
"learning_rate": 1.999982113944484e-05,
"loss": 3.3331,
"mean_token_accuracy": 0.4135566469583962,
"num_tokens": 58784.0,
"step": 40
},
{
"epoch": 1.576,
"grad_norm": 7.1544270515441895,
"learning_rate": 1.9999583320967683e-05,
"loss": 3.092,
"mean_token_accuracy": 0.42304785093292596,
"num_tokens": 74305.0,
"step": 50
},
{
"epoch": 1.896,
"grad_norm": 0.8600693345069885,
"learning_rate": 1.99992464146856e-05,
"loss": 2.8575,
"mean_token_accuracy": 0.4428128655999899,
"num_tokens": 88582.0,
"step": 60
},
{
"epoch": 2.192,
"grad_norm": 1.3089476823806763,
"learning_rate": 1.999881042393706e-05,
"loss": 3.2266,
"mean_token_accuracy": 0.43124929654437144,
"num_tokens": 101701.0,
"step": 70
},
{
"epoch": 2.512,
"grad_norm": 0.6948946714401245,
"learning_rate": 1.9998275353042377e-05,
"loss": 2.6626,
"mean_token_accuracy": 0.45563504602760074,
"num_tokens": 116845.0,
"step": 80
},
{
"epoch": 2.832,
"grad_norm": 1.0951130390167236,
"learning_rate": 1.999764120730368e-05,
"loss": 2.6061,
"mean_token_accuracy": 0.4799388902261853,
"num_tokens": 131642.0,
"step": 90
},
{
"epoch": 3.128,
"grad_norm": 2.2256667613983154,
"learning_rate": 1.9996907993004836e-05,
"loss": 2.5437,
"mean_token_accuracy": 0.4792116602530351,
"num_tokens": 144071.0,
"step": 100
},
{
"epoch": 3.448,
"grad_norm": 1.3083274364471436,
"learning_rate": 1.9996075717411405e-05,
"loss": 2.3134,
"mean_token_accuracy": 0.49251377964392307,
"num_tokens": 160671.0,
"step": 110
},
{
"epoch": 3.768,
"grad_norm": 1.0546499490737915,
"learning_rate": 1.9995144388770577e-05,
"loss": 2.5219,
"mean_token_accuracy": 0.49237193521112205,
"num_tokens": 174208.0,
"step": 120
},
{
"epoch": 4.064,
"grad_norm": 1.3395127058029175,
"learning_rate": 1.9994114016311053e-05,
"loss": 2.6405,
"mean_token_accuracy": 0.494740814977401,
"num_tokens": 189174.0,
"step": 130
},
{
"epoch": 4.384,
"grad_norm": 0.9926703572273254,
"learning_rate": 1.9992984610243006e-05,
"loss": 2.2636,
"mean_token_accuracy": 0.5148700190708041,
"num_tokens": 205614.0,
"step": 140
},
{
"epoch": 4.704,
"grad_norm": 2.0030810832977295,
"learning_rate": 1.9991756181757936e-05,
"loss": 2.2209,
"mean_token_accuracy": 0.5199183862656355,
"num_tokens": 220053.0,
"step": 150
},
{
"epoch": 5.0,
"grad_norm": 3.6073784828186035,
"learning_rate": 1.999042874302857e-05,
"loss": 2.3257,
"mean_token_accuracy": 0.5087871091389978,
"num_tokens": 232090.0,
"step": 160
},
{
"epoch": 5.32,
"grad_norm": 0.9835549592971802,
"learning_rate": 1.9989002307208767e-05,
"loss": 2.0461,
"mean_token_accuracy": 0.5411571308970451,
"num_tokens": 247494.0,
"step": 170
},
{
"epoch": 5.64,
"grad_norm": 1.3350774049758911,
"learning_rate": 1.998747688843335e-05,
"loss": 2.1222,
"mean_token_accuracy": 0.5401662968099117,
"num_tokens": 261394.0,
"step": 180
},
{
"epoch": 5.96,
"grad_norm": 1.4606289863586426,
"learning_rate": 1.9985852501817985e-05,
"loss": 2.0423,
"mean_token_accuracy": 0.5442505508661271,
"num_tokens": 276061.0,
"step": 190
},
{
"epoch": 6.256,
"grad_norm": 2.1242387294769287,
"learning_rate": 1.998412916345904e-05,
"loss": 2.1295,
"mean_token_accuracy": 0.5391830210951535,
"num_tokens": 289693.0,
"step": 200
},
{
"epoch": 6.576,
"grad_norm": 1.4135829210281372,
"learning_rate": 1.99823068904334e-05,
"loss": 1.9505,
"mean_token_accuracy": 0.5633615963160992,
"num_tokens": 304290.0,
"step": 210
},
{
"epoch": 6.896,
"grad_norm": 1.05637526512146,
"learning_rate": 1.998038570079833e-05,
"loss": 2.0059,
"mean_token_accuracy": 0.569461640715599,
"num_tokens": 319354.0,
"step": 220
},
{
"epoch": 7.192,
"grad_norm": 1.409026026725769,
"learning_rate": 1.9978365613591263e-05,
"loss": 1.8998,
"mean_token_accuracy": 0.5667525610408267,
"num_tokens": 333444.0,
"step": 230
},
{
"epoch": 7.5120000000000005,
"grad_norm": 0.9068111181259155,
"learning_rate": 1.9976246648829636e-05,
"loss": 1.6992,
"mean_token_accuracy": 0.6070717331022024,
"num_tokens": 349035.0,
"step": 240
},
{
"epoch": 7.832,
"grad_norm": 1.7179079055786133,
"learning_rate": 1.997402882751068e-05,
"loss": 1.898,
"mean_token_accuracy": 0.5741809576749801,
"num_tokens": 363648.0,
"step": 250
},
{
"epoch": 8.128,
"grad_norm": 2.5260181427001953,
"learning_rate": 1.997171217161122e-05,
"loss": 1.8262,
"mean_token_accuracy": 0.6014102785973936,
"num_tokens": 377215.0,
"step": 260
},
{
"epoch": 8.448,
"grad_norm": 1.0469582080841064,
"learning_rate": 1.996929670408744e-05,
"loss": 1.7423,
"mean_token_accuracy": 0.6009339291602374,
"num_tokens": 394237.0,
"step": 270
},
{
"epoch": 8.768,
"grad_norm": 2.0109641551971436,
"learning_rate": 1.996678244887468e-05,
"loss": 1.844,
"mean_token_accuracy": 0.5853462919592858,
"num_tokens": 407239.0,
"step": 280
},
{
"epoch": 9.064,
"grad_norm": 2.3061084747314453,
"learning_rate": 1.9964169430887174e-05,
"loss": 1.7935,
"mean_token_accuracy": 0.5961252535517151,
"num_tokens": 420770.0,
"step": 290
},
{
"epoch": 9.384,
"grad_norm": 1.5985437631607056,
"learning_rate": 1.9961457676017833e-05,
"loss": 1.703,
"mean_token_accuracy": 0.6001830734312534,
"num_tokens": 435005.0,
"step": 300
},
{
"epoch": 9.704,
"grad_norm": 1.0775606632232666,
"learning_rate": 1.9958647211137952e-05,
"loss": 1.7104,
"mean_token_accuracy": 0.6070772130973637,
"num_tokens": 449192.0,
"step": 310
},
{
"epoch": 10.0,
"grad_norm": 4.900282859802246,
"learning_rate": 1.9955738064096975e-05,
"loss": 1.7131,
"mean_token_accuracy": 0.6122549969602276,
"num_tokens": 464180.0,
"step": 320
},
{
"epoch": 10.32,
"grad_norm": 4.266648769378662,
"learning_rate": 1.9952730263722205e-05,
"loss": 1.6526,
"mean_token_accuracy": 0.6316778633743525,
"num_tokens": 480765.0,
"step": 330
},
{
"epoch": 10.64,
"grad_norm": 1.3835431337356567,
"learning_rate": 1.994962383981851e-05,
"loss": 1.5473,
"mean_token_accuracy": 0.6267588481307029,
"num_tokens": 495929.0,
"step": 340
},
{
"epoch": 10.96,
"grad_norm": 0.9430285096168518,
"learning_rate": 1.9946418823168053e-05,
"loss": 1.7158,
"mean_token_accuracy": 0.6107403030619025,
"num_tokens": 508555.0,
"step": 350
},
{
"epoch": 11.256,
"grad_norm": 1.7163220643997192,
"learning_rate": 1.994311524552996e-05,
"loss": 1.557,
"mean_token_accuracy": 0.6259392479787002,
"num_tokens": 522011.0,
"step": 360
},
{
"epoch": 11.576,
"grad_norm": 1.8388807773590088,
"learning_rate": 1.993971313964002e-05,
"loss": 1.6451,
"mean_token_accuracy": 0.6252246461808681,
"num_tokens": 535811.0,
"step": 370
},
{
"epoch": 11.896,
"grad_norm": 2.878235101699829,
"learning_rate": 1.993621253921036e-05,
"loss": 1.6015,
"mean_token_accuracy": 0.6341628909111023,
"num_tokens": 551848.0,
"step": 380
},
{
"epoch": 12.192,
"grad_norm": 8.73257064819336,
"learning_rate": 1.9932613478929103e-05,
"loss": 1.6087,
"mean_token_accuracy": 0.6341016663087381,
"num_tokens": 566187.0,
"step": 390
},
{
"epoch": 12.512,
"grad_norm": 1.6711736917495728,
"learning_rate": 1.9928915994460037e-05,
"loss": 1.4662,
"mean_token_accuracy": 0.641170359775424,
"num_tokens": 579340.0,
"step": 400
},
{
"epoch": 12.832,
"grad_norm": 1.2355554103851318,
"learning_rate": 1.9925120122442253e-05,
"loss": 1.4234,
"mean_token_accuracy": 0.6438105596229434,
"num_tokens": 595449.0,
"step": 410
},
{
"epoch": 13.128,
"grad_norm": 1.1492340564727783,
"learning_rate": 1.9921225900489776e-05,
"loss": 1.5106,
"mean_token_accuracy": 0.6468948456081184,
"num_tokens": 610726.0,
"step": 420
},
{
"epoch": 13.448,
"grad_norm": 2.4454128742218018,
"learning_rate": 1.9917233367191205e-05,
"loss": 1.4301,
"mean_token_accuracy": 0.6485198132693768,
"num_tokens": 624770.0,
"step": 430
},
{
"epoch": 13.768,
"grad_norm": 1.809606909751892,
"learning_rate": 1.9913142562109328e-05,
"loss": 1.4744,
"mean_token_accuracy": 0.656716751307249,
"num_tokens": 640635.0,
"step": 440
},
{
"epoch": 14.064,
"grad_norm": 1.8572089672088623,
"learning_rate": 1.990895352578072e-05,
"loss": 1.4747,
"mean_token_accuracy": 0.6568980690192532,
"num_tokens": 652969.0,
"step": 450
},
{
"epoch": 14.384,
"grad_norm": 1.4431779384613037,
"learning_rate": 1.9904666299715357e-05,
"loss": 1.3779,
"mean_token_accuracy": 0.6651028156280517,
"num_tokens": 667518.0,
"step": 460
},
{
"epoch": 14.704,
"grad_norm": 1.1659435033798218,
"learning_rate": 1.9900280926396186e-05,
"loss": 1.4853,
"mean_token_accuracy": 0.6542218446731567,
"num_tokens": 682093.0,
"step": 470
},
{
"epoch": 15.0,
"grad_norm": 2.4632649421691895,
"learning_rate": 1.989579744927872e-05,
"loss": 1.5292,
"mean_token_accuracy": 0.6373760857292123,
"num_tokens": 696270.0,
"step": 480
},
{
"epoch": 15.32,
"grad_norm": 1.2648308277130127,
"learning_rate": 1.98912159127906e-05,
"loss": 1.5282,
"mean_token_accuracy": 0.6369021199643612,
"num_tokens": 711645.0,
"step": 490
},
{
"epoch": 15.64,
"grad_norm": 1.6591901779174805,
"learning_rate": 1.988653636233116e-05,
"loss": 1.3674,
"mean_token_accuracy": 0.6757966171950102,
"num_tokens": 724986.0,
"step": 500
},
{
"epoch": 15.96,
"grad_norm": 1.3664172887802124,
"learning_rate": 1.988175884427097e-05,
"loss": 1.3619,
"mean_token_accuracy": 0.6697750940918923,
"num_tokens": 740314.0,
"step": 510
},
{
"epoch": 16.256,
"grad_norm": 1.3944075107574463,
"learning_rate": 1.9876883405951378e-05,
"loss": 1.4199,
"mean_token_accuracy": 0.6569653801821373,
"num_tokens": 754126.0,
"step": 520
},
{
"epoch": 16.576,
"grad_norm": 1.846074104309082,
"learning_rate": 1.987191009568405e-05,
"loss": 1.3542,
"mean_token_accuracy": 0.6702545773237943,
"num_tokens": 770447.0,
"step": 530
},
{
"epoch": 16.896,
"grad_norm": 1.4818017482757568,
"learning_rate": 1.9866838962750473e-05,
"loss": 1.3385,
"mean_token_accuracy": 0.673089163005352,
"num_tokens": 784145.0,
"step": 540
},
{
"epoch": 17.192,
"grad_norm": 1.6503247022628784,
"learning_rate": 1.986167005740149e-05,
"loss": 1.4058,
"mean_token_accuracy": 0.6697660020074329,
"num_tokens": 797632.0,
"step": 550
},
{
"epoch": 17.512,
"grad_norm": 1.7759121656417847,
"learning_rate": 1.985640343085678e-05,
"loss": 1.485,
"mean_token_accuracy": 0.6630586348474026,
"num_tokens": 812748.0,
"step": 560
},
{
"epoch": 17.832,
"grad_norm": 1.6328964233398438,
"learning_rate": 1.9851039135304366e-05,
"loss": 1.3464,
"mean_token_accuracy": 0.673164501786232,
"num_tokens": 827388.0,
"step": 570
},
{
"epoch": 18.128,
"grad_norm": 1.394505500793457,
"learning_rate": 1.9845577223900087e-05,
"loss": 1.3223,
"mean_token_accuracy": 0.6847520865298606,
"num_tokens": 842219.0,
"step": 580
},
{
"epoch": 18.448,
"grad_norm": 1.541831374168396,
"learning_rate": 1.984001775076708e-05,
"loss": 1.3222,
"mean_token_accuracy": 0.6771992217749357,
"num_tokens": 857904.0,
"step": 590
},
{
"epoch": 18.768,
"grad_norm": 1.4715123176574707,
"learning_rate": 1.983436077099524e-05,
"loss": 1.3623,
"mean_token_accuracy": 0.6771474566310645,
"num_tokens": 871758.0,
"step": 600
},
{
"epoch": 19.064,
"grad_norm": 1.244395136833191,
"learning_rate": 1.9828606340640678e-05,
"loss": 1.3194,
"mean_token_accuracy": 0.6701785076308895,
"num_tokens": 885177.0,
"step": 610
},
{
"epoch": 19.384,
"grad_norm": 3.2594940662384033,
"learning_rate": 1.9822754516725148e-05,
"loss": 1.3986,
"mean_token_accuracy": 0.6804742000997066,
"num_tokens": 900412.0,
"step": 620
},
{
"epoch": 19.704,
"grad_norm": 1.9484755992889404,
"learning_rate": 1.9816805357235512e-05,
"loss": 1.3087,
"mean_token_accuracy": 0.675427176989615,
"num_tokens": 914923.0,
"step": 630
},
{
"epoch": 20.0,
"grad_norm": 4.795617580413818,
"learning_rate": 1.981075892112314e-05,
"loss": 1.3261,
"mean_token_accuracy": 0.6869603467954172,
"num_tokens": 928360.0,
"step": 640
},
{
"epoch": 20.32,
"grad_norm": 2.1296019554138184,
"learning_rate": 1.980461526830334e-05,
"loss": 1.3365,
"mean_token_accuracy": 0.6707998286932707,
"num_tokens": 942431.0,
"step": 650
},
{
"epoch": 20.64,
"grad_norm": 2.041980743408203,
"learning_rate": 1.979837445965475e-05,
"loss": 1.4148,
"mean_token_accuracy": 0.6774610493332147,
"num_tokens": 957665.0,
"step": 660
},
{
"epoch": 20.96,
"grad_norm": 2.0277955532073975,
"learning_rate": 1.979203655701875e-05,
"loss": 1.1934,
"mean_token_accuracy": 0.7059706412255764,
"num_tokens": 972341.0,
"step": 670
},
{
"epoch": 21.256,
"grad_norm": 1.4342715740203857,
"learning_rate": 1.978560162319885e-05,
"loss": 1.2645,
"mean_token_accuracy": 0.6948015895244237,
"num_tokens": 987917.0,
"step": 680
},
{
"epoch": 21.576,
"grad_norm": 2.365342140197754,
"learning_rate": 1.9779069721960046e-05,
"loss": 1.356,
"mean_token_accuracy": 0.6770768724381924,
"num_tokens": 1001846.0,
"step": 690
},
{
"epoch": 21.896,
"grad_norm": 1.4183971881866455,
"learning_rate": 1.9772440918028217e-05,
"loss": 1.3372,
"mean_token_accuracy": 0.6928307216614484,
"num_tokens": 1016810.0,
"step": 700
},
{
"epoch": 22.192,
"grad_norm": 1.5157005786895752,
"learning_rate": 1.9765715277089458e-05,
"loss": 1.2262,
"mean_token_accuracy": 0.6972452486689026,
"num_tokens": 1032507.0,
"step": 710
},
{
"epoch": 22.512,
"grad_norm": 1.4448522329330444,
"learning_rate": 1.9758892865789445e-05,
"loss": 1.2261,
"mean_token_accuracy": 0.6949771210551262,
"num_tokens": 1047386.0,
"step": 720
},
{
"epoch": 22.832,
"grad_norm": 2.33046817779541,
"learning_rate": 1.9751973751732775e-05,
"loss": 1.2348,
"mean_token_accuracy": 0.6982233498245478,
"num_tokens": 1061351.0,
"step": 730
},
{
"epoch": 23.128,
"grad_norm": 1.8830664157867432,
"learning_rate": 1.9744958003482285e-05,
"loss": 1.2979,
"mean_token_accuracy": 0.6971497769291336,
"num_tokens": 1073148.0,
"step": 740
},
{
"epoch": 23.448,
"grad_norm": 1.466878056526184,
"learning_rate": 1.9737845690558385e-05,
"loss": 1.3683,
"mean_token_accuracy": 0.680212589353323,
"num_tokens": 1088218.0,
"step": 750
},
{
"epoch": 23.768,
"grad_norm": 1.5701245069503784,
"learning_rate": 1.973063688343835e-05,
"loss": 1.1505,
"mean_token_accuracy": 0.7072769150137901,
"num_tokens": 1102836.0,
"step": 760
},
{
"epoch": 24.064,
"grad_norm": 1.6687356233596802,
"learning_rate": 1.9723331653555653e-05,
"loss": 1.2474,
"mean_token_accuracy": 0.6967680285105834,
"num_tokens": 1116942.0,
"step": 770
},
{
"epoch": 24.384,
"grad_norm": 1.3728556632995605,
"learning_rate": 1.9715930073299227e-05,
"loss": 1.2448,
"mean_token_accuracy": 0.7040290288627148,
"num_tokens": 1132054.0,
"step": 780
},
{
"epoch": 24.704,
"grad_norm": 1.4181838035583496,
"learning_rate": 1.970843221601276e-05,
"loss": 1.1969,
"mean_token_accuracy": 0.6944498892873525,
"num_tokens": 1148041.0,
"step": 790
},
{
"epoch": 25.0,
"grad_norm": 4.3919596672058105,
"learning_rate": 1.9700838155993972e-05,
"loss": 1.1934,
"mean_token_accuracy": 0.7042354522524653,
"num_tokens": 1160450.0,
"step": 800
},
{
"epoch": 25.32,
"grad_norm": 1.5123074054718018,
"learning_rate": 1.9693147968493872e-05,
"loss": 1.2369,
"mean_token_accuracy": 0.692409698665142,
"num_tokens": 1174351.0,
"step": 810
},
{
"epoch": 25.64,
"grad_norm": 1.278221845626831,
"learning_rate": 1.9685361729716014e-05,
"loss": 1.1829,
"mean_token_accuracy": 0.7174848213791847,
"num_tokens": 1190213.0,
"step": 820
},
{
"epoch": 25.96,
"grad_norm": 2.224332094192505,
"learning_rate": 1.967747951681575e-05,
"loss": 1.2224,
"mean_token_accuracy": 0.7052119519561529,
"num_tokens": 1205508.0,
"step": 830
},
{
"epoch": 26.256,
"grad_norm": 1.548086166381836,
"learning_rate": 1.966950140789944e-05,
"loss": 1.2254,
"mean_token_accuracy": 0.7041690679820808,
"num_tokens": 1219047.0,
"step": 840
},
{
"epoch": 26.576,
"grad_norm": 2.0900254249572754,
"learning_rate": 1.9661427482023718e-05,
"loss": 1.1557,
"mean_token_accuracy": 0.7088660508394241,
"num_tokens": 1231738.0,
"step": 850
},
{
"epoch": 26.896,
"grad_norm": 1.9919354915618896,
"learning_rate": 1.965325781919467e-05,
"loss": 1.1962,
"mean_token_accuracy": 0.7142665989696979,
"num_tokens": 1248062.0,
"step": 860
},
{
"epoch": 27.192,
"grad_norm": 2.2158303260803223,
"learning_rate": 1.9644992500367072e-05,
"loss": 1.2078,
"mean_token_accuracy": 0.7049629523141964,
"num_tokens": 1261738.0,
"step": 870
},
{
"epoch": 27.512,
"grad_norm": 1.830531358718872,
"learning_rate": 1.9636631607443565e-05,
"loss": 1.2142,
"mean_token_accuracy": 0.7097026702016592,
"num_tokens": 1278012.0,
"step": 880
},
{
"epoch": 27.832,
"grad_norm": 2.0944063663482666,
"learning_rate": 1.9628175223273847e-05,
"loss": 1.1368,
"mean_token_accuracy": 0.7265028398483991,
"num_tokens": 1292725.0,
"step": 890
},
{
"epoch": 28.128,
"grad_norm": 1.4445384740829468,
"learning_rate": 1.9619623431653872e-05,
"loss": 1.2329,
"mean_token_accuracy": 0.6941638359346906,
"num_tokens": 1305912.0,
"step": 900
},
{
"epoch": 28.448,
"grad_norm": 2.084064245223999,
"learning_rate": 1.9610976317324993e-05,
"loss": 1.1324,
"mean_token_accuracy": 0.7086500860750675,
"num_tokens": 1320269.0,
"step": 910
},
{
"epoch": 28.768,
"grad_norm": 1.5166538953781128,
"learning_rate": 1.9602233965973145e-05,
"loss": 1.215,
"mean_token_accuracy": 0.7056132420897484,
"num_tokens": 1336877.0,
"step": 920
},
{
"epoch": 29.064,
"grad_norm": 1.324559211730957,
"learning_rate": 1.9593396464227964e-05,
"loss": 1.1762,
"mean_token_accuracy": 0.7244789052653957,
"num_tokens": 1349855.0,
"step": 930
},
{
"epoch": 29.384,
"grad_norm": 1.3715434074401855,
"learning_rate": 1.9584463899661975e-05,
"loss": 1.1323,
"mean_token_accuracy": 0.7216422040015459,
"num_tokens": 1364729.0,
"step": 940
},
{
"epoch": 29.704,
"grad_norm": 1.782844066619873,
"learning_rate": 1.9575436360789687e-05,
"loss": 1.2588,
"mean_token_accuracy": 0.7018849883228541,
"num_tokens": 1378903.0,
"step": 950
},
{
"epoch": 30.0,
"grad_norm": 3.4414260387420654,
"learning_rate": 1.9566313937066727e-05,
"loss": 1.1545,
"mean_token_accuracy": 0.7196269961627754,
"num_tokens": 1392540.0,
"step": 960
},
{
"epoch": 30.32,
"grad_norm": 3.570629835128784,
"learning_rate": 1.9557096718888956e-05,
"loss": 1.1217,
"mean_token_accuracy": 0.7212486552074552,
"num_tokens": 1406295.0,
"step": 970
},
{
"epoch": 30.64,
"grad_norm": 1.5852808952331543,
"learning_rate": 1.9547784797591565e-05,
"loss": 1.1959,
"mean_token_accuracy": 0.7164284475147724,
"num_tokens": 1422592.0,
"step": 980
},
{
"epoch": 30.96,
"grad_norm": 1.5355671644210815,
"learning_rate": 1.9538378265448195e-05,
"loss": 1.1813,
"mean_token_accuracy": 0.710675698518753,
"num_tokens": 1437502.0,
"step": 990
},
{
"epoch": 31.256,
"grad_norm": 1.5741212368011475,
"learning_rate": 1.9528877215669983e-05,
"loss": 1.1143,
"mean_token_accuracy": 0.7233139457734855,
"num_tokens": 1452056.0,
"step": 1000
},
{
"epoch": 32.576,
"grad_norm": 1.7357654571533203,
"learning_rate": 1.8116046949409032e-05,
"loss": 1.2445,
"mean_token_accuracy": 0.7005614548921585,
"num_tokens": 14368.0,
"step": 1010
},
{
"epoch": 32.896,
"grad_norm": 2.1830084323883057,
"learning_rate": 1.807903147537074e-05,
"loss": 1.1743,
"mean_token_accuracy": 0.7173698712140322,
"num_tokens": 31135.0,
"step": 1020
},
{
"epoch": 33.224,
"grad_norm": 2.0015718936920166,
"learning_rate": 1.8041694488049716e-05,
"loss": 1.2619,
"mean_token_accuracy": 0.715624163063561,
"num_tokens": 46347.0,
"step": 1030
},
{
"epoch": 33.544,
"grad_norm": 1.7311397790908813,
"learning_rate": 1.8004037473309373e-05,
"loss": 1.2331,
"mean_token_accuracy": 0.7106888771057129,
"num_tokens": 63240.0,
"step": 1040
},
{
"epoch": 33.864,
"grad_norm": 1.8815584182739258,
"learning_rate": 1.7966061929748968e-05,
"loss": 1.2194,
"mean_token_accuracy": 0.7109542470425367,
"num_tokens": 79655.0,
"step": 1050
},
{
"epoch": 34.16,
"grad_norm": 1.684423565864563,
"learning_rate": 1.7927769368643904e-05,
"loss": 1.0667,
"mean_token_accuracy": 0.7348488770626687,
"num_tokens": 95318.0,
"step": 1060
},
{
"epoch": 34.48,
"grad_norm": 1.6687734127044678,
"learning_rate": 1.788916131388564e-05,
"loss": 1.1796,
"mean_token_accuracy": 0.7278237771242857,
"num_tokens": 111213.0,
"step": 1070
},
{
"epoch": 34.8,
"grad_norm": 1.7268950939178467,
"learning_rate": 1.785023930192103e-05,
"loss": 1.1723,
"mean_token_accuracy": 0.7138827528804541,
"num_tokens": 126943.0,
"step": 1080
},
{
"epoch": 35.096,
"grad_norm": 1.5416665077209473,
"learning_rate": 1.781100488169115e-05,
"loss": 1.0286,
"mean_token_accuracy": 0.7333241834028347,
"num_tokens": 142073.0,
"step": 1090
},
{
"epoch": 35.416,
"grad_norm": 1.7402383089065552,
"learning_rate": 1.777145961456971e-05,
"loss": 1.0884,
"mean_token_accuracy": 0.7265842445194721,
"num_tokens": 158263.0,
"step": 1100
},
{
"epoch": 35.736,
"grad_norm": 1.4736402034759521,
"learning_rate": 1.773160507430087e-05,
"loss": 1.1012,
"mean_token_accuracy": 0.727820971608162,
"num_tokens": 172637.0,
"step": 1110
},
{
"epoch": 36.032,
"grad_norm": 2.027437448501587,
"learning_rate": 1.7691442846936643e-05,
"loss": 1.1525,
"mean_token_accuracy": 0.7281997264237017,
"num_tokens": 189288.0,
"step": 1120
},
{
"epoch": 36.352,
"grad_norm": 2.058610439300537,
"learning_rate": 1.7650974530773745e-05,
"loss": 1.147,
"mean_token_accuracy": 0.7228171911090613,
"num_tokens": 204429.0,
"step": 1130
},
{
"epoch": 36.672,
"grad_norm": 1.47328519821167,
"learning_rate": 1.7610201736290022e-05,
"loss": 1.1293,
"mean_token_accuracy": 0.7266111556440592,
"num_tokens": 220109.0,
"step": 1140
},
{
"epoch": 36.992,
"grad_norm": 1.4244815111160278,
"learning_rate": 1.7569126086080342e-05,
"loss": 1.0312,
"mean_token_accuracy": 0.7415647856891155,
"num_tokens": 236961.0,
"step": 1150
},
{
"epoch": 37.288,
"grad_norm": 1.6087596416473389,
"learning_rate": 1.7527749214792023e-05,
"loss": 1.1148,
"mean_token_accuracy": 0.722566624348228,
"num_tokens": 251116.0,
"step": 1160
},
{
"epoch": 37.608,
"grad_norm": 1.6909428834915161,
"learning_rate": 1.7486072769059785e-05,
"loss": 1.1283,
"mean_token_accuracy": 0.7359607569873333,
"num_tokens": 267570.0,
"step": 1170
},
{
"epoch": 37.928,
"grad_norm": 1.646548867225647,
"learning_rate": 1.7444098407440218e-05,
"loss": 1.0572,
"mean_token_accuracy": 0.7344494730234146,
"num_tokens": 282974.0,
"step": 1180
},
{
"epoch": 38.224,
"grad_norm": 1.5249629020690918,
"learning_rate": 1.740182780034577e-05,
"loss": 0.9779,
"mean_token_accuracy": 0.7474351501142656,
"num_tokens": 298664.0,
"step": 1190
},
{
"epoch": 38.544,
"grad_norm": 1.8203458786010742,
"learning_rate": 1.7359262629978286e-05,
"loss": 1.044,
"mean_token_accuracy": 0.7267404418438673,
"num_tokens": 313932.0,
"step": 1200
},
{
"epoch": 38.864,
"grad_norm": 1.960335612297058,
"learning_rate": 1.731640459026206e-05,
"loss": 1.0537,
"mean_token_accuracy": 0.7449462197721004,
"num_tokens": 330427.0,
"step": 1210
},
{
"epoch": 39.16,
"grad_norm": 2.152423620223999,
"learning_rate": 1.727325538677642e-05,
"loss": 1.1988,
"mean_token_accuracy": 0.7341888015334671,
"num_tokens": 344595.0,
"step": 1220
},
{
"epoch": 39.48,
"grad_norm": 1.9269284009933472,
"learning_rate": 1.722981673668784e-05,
"loss": 1.0929,
"mean_token_accuracy": 0.7354621075093746,
"num_tokens": 361903.0,
"step": 1230
},
{
"epoch": 39.8,
"grad_norm": 2.627488374710083,
"learning_rate": 1.7186090368681625e-05,
"loss": 1.0304,
"mean_token_accuracy": 0.7406851584091783,
"num_tokens": 378158.0,
"step": 1240
},
{
"epoch": 40.096,
"grad_norm": 1.340135931968689,
"learning_rate": 1.714207802289311e-05,
"loss": 0.9831,
"mean_token_accuracy": 0.7508459779861811,
"num_tokens": 393086.0,
"step": 1250
},
{
"epoch": 40.416,
"grad_norm": 1.5764344930648804,
"learning_rate": 1.7097781450838408e-05,
"loss": 1.0411,
"mean_token_accuracy": 0.7428241446614265,
"num_tokens": 408865.0,
"step": 1260
},
{
"epoch": 40.736,
"grad_norm": 2.27480149269104,
"learning_rate": 1.7053202415344693e-05,
"loss": 1.1553,
"mean_token_accuracy": 0.7261891044676304,
"num_tokens": 422941.0,
"step": 1270
},
{
"epoch": 41.032,
"grad_norm": 2.0869431495666504,
"learning_rate": 1.7008342690480075e-05,
"loss": 1.0776,
"mean_token_accuracy": 0.7442273002218556,
"num_tokens": 438615.0,
"step": 1280
},
{
"epoch": 41.352,
"grad_norm": 1.6138980388641357,
"learning_rate": 1.6963204061482972e-05,
"loss": 0.9933,
"mean_token_accuracy": 0.7366263665258884,
"num_tokens": 454742.0,
"step": 1290
},
{
"epoch": 41.672,
"grad_norm": 2.201198101043701,
"learning_rate": 1.6917788324691083e-05,
"loss": 1.12,
"mean_token_accuracy": 0.7349841587245465,
"num_tokens": 471732.0,
"step": 1300
},
{
"epoch": 41.992,
"grad_norm": 2.3492226600646973,
"learning_rate": 1.687209728746989e-05,
"loss": 1.0594,
"mean_token_accuracy": 0.745047352835536,
"num_tokens": 487349.0,
"step": 1310
},
{
"epoch": 42.288,
"grad_norm": 1.7862104177474976,
"learning_rate": 1.6826132768140735e-05,
"loss": 0.9756,
"mean_token_accuracy": 0.7570219249338717,
"num_tokens": 502115.0,
"step": 1320
},
{
"epoch": 42.608,
"grad_norm": 2.4716343879699707,
"learning_rate": 1.6779896595908462e-05,
"loss": 1.0208,
"mean_token_accuracy": 0.7443521052598954,
"num_tokens": 517825.0,
"step": 1330
},
{
"epoch": 42.928,
"grad_norm": 2.661140203475952,
"learning_rate": 1.6733390610788622e-05,
"loss": 1.0313,
"mean_token_accuracy": 0.7418102856725455,
"num_tokens": 534561.0,
"step": 1340
},
{
"epoch": 43.224,
"grad_norm": 1.9998219013214111,
"learning_rate": 1.668661666353423e-05,
"loss": 1.0699,
"mean_token_accuracy": 0.7479387578126546,
"num_tokens": 548327.0,
"step": 1350
},
{
"epoch": 43.544,
"grad_norm": 2.4526405334472656,
"learning_rate": 1.6639576615562143e-05,
"loss": 0.9673,
"mean_token_accuracy": 0.7542693041265011,
"num_tokens": 565164.0,
"step": 1360
},
{
"epoch": 43.864,
"grad_norm": 1.7199647426605225,
"learning_rate": 1.6592272338878963e-05,
"loss": 1.0644,
"mean_token_accuracy": 0.743690374866128,
"num_tokens": 580754.0,
"step": 1370
},
{
"epoch": 44.16,
"grad_norm": 1.7065895795822144,
"learning_rate": 1.6544705716006537e-05,
"loss": 0.9511,
"mean_token_accuracy": 0.7495483123772854,
"num_tokens": 595953.0,
"step": 1380
},
{
"epoch": 44.48,
"grad_norm": 1.5984984636306763,
"learning_rate": 1.649687863990705e-05,
"loss": 1.0901,
"mean_token_accuracy": 0.7480962604284287,
"num_tokens": 611850.0,
"step": 1390
},
{
"epoch": 44.8,
"grad_norm": 2.719882011413574,
"learning_rate": 1.644879301390769e-05,
"loss": 0.9664,
"mean_token_accuracy": 0.7527227349579334,
"num_tokens": 627428.0,
"step": 1400
},
{
"epoch": 45.096,
"grad_norm": 2.04146409034729,
"learning_rate": 1.6400450751624897e-05,
"loss": 0.9673,
"mean_token_accuracy": 0.7482488421169488,
"num_tokens": 641538.0,
"step": 1410
},
{
"epoch": 45.416,
"grad_norm": 2.128373384475708,
"learning_rate": 1.6351853776888214e-05,
"loss": 0.9908,
"mean_token_accuracy": 0.7453075967729091,
"num_tokens": 658145.0,
"step": 1420
},
{
"epoch": 45.736,
"grad_norm": 1.9842469692230225,
"learning_rate": 1.630300402366373e-05,
"loss": 1.0387,
"mean_token_accuracy": 0.7478526467457414,
"num_tokens": 675926.0,
"step": 1430
},
{
"epoch": 46.032,
"grad_norm": 1.5507521629333496,
"learning_rate": 1.6253903435977103e-05,
"loss": 0.959,
"mean_token_accuracy": 0.7599469971012425,
"num_tokens": 689837.0,
"step": 1440
},
{
"epoch": 46.352,
"grad_norm": 2.250763416290283,
"learning_rate": 1.6204553967836216e-05,
"loss": 1.0544,
"mean_token_accuracy": 0.7468490976840257,
"num_tokens": 705912.0,
"step": 1450
},
{
"epoch": 46.672,
"grad_norm": 1.7809251546859741,
"learning_rate": 1.6154957583153388e-05,
"loss": 1.034,
"mean_token_accuracy": 0.7534692898392678,
"num_tokens": 722631.0,
"step": 1460
},
{
"epoch": 46.992,
"grad_norm": 2.4856886863708496,
"learning_rate": 1.6105116255667246e-05,
"loss": 0.9083,
"mean_token_accuracy": 0.7516257427632809,
"num_tokens": 737649.0,
"step": 1470
},
{
"epoch": 47.288,
"grad_norm": 2.1131696701049805,
"learning_rate": 1.605503196886416e-05,
"loss": 0.9908,
"mean_token_accuracy": 0.7506888621562237,
"num_tokens": 754612.0,
"step": 1480
},
{
"epoch": 47.608,
"grad_norm": 1.3065401315689087,
"learning_rate": 1.600470671589931e-05,
"loss": 0.9346,
"mean_token_accuracy": 0.757453129440546,
"num_tokens": 771515.0,
"step": 1490
},
{
"epoch": 47.928,
"grad_norm": 2.0020365715026855,
"learning_rate": 1.5954142499517377e-05,
"loss": 1.0396,
"mean_token_accuracy": 0.7508561560884118,
"num_tokens": 785665.0,
"step": 1500
},
{
"epoch": 48.224,
"grad_norm": 1.8463741540908813,
"learning_rate": 1.5903341331972832e-05,
"loss": 0.9151,
"mean_token_accuracy": 0.7590098671011023,
"num_tokens": 799293.0,
"step": 1510
},
{
"epoch": 48.544,
"grad_norm": 1.851616382598877,
"learning_rate": 1.585230523494985e-05,
"loss": 0.9102,
"mean_token_accuracy": 0.7564024582505227,
"num_tokens": 813555.0,
"step": 1520
},
{
"epoch": 48.864,
"grad_norm": 1.4981343746185303,
"learning_rate": 1.580103623948188e-05,
"loss": 1.0654,
"mean_token_accuracy": 0.748985405266285,
"num_tokens": 831868.0,
"step": 1530
},
{
"epoch": 49.16,
"grad_norm": 1.8819829225540161,
"learning_rate": 1.574953638587079e-05,
"loss": 0.993,
"mean_token_accuracy": 0.7556418059645472,
"num_tokens": 846798.0,
"step": 1540
},
{
"epoch": 49.48,
"grad_norm": 2.24092960357666,
"learning_rate": 1.569780772360568e-05,
"loss": 0.9818,
"mean_token_accuracy": 0.7535504069179296,
"num_tokens": 862063.0,
"step": 1550
},
{
"epoch": 49.8,
"grad_norm": 1.7873568534851074,
"learning_rate": 1.5645852311281343e-05,
"loss": 1.0086,
"mean_token_accuracy": 0.7555014498531818,
"num_tokens": 878215.0,
"step": 1560
},
{
"epoch": 50.096,
"grad_norm": 2.5300111770629883,
"learning_rate": 1.559367221651629e-05,
"loss": 0.8826,
"mean_token_accuracy": 0.7630251637987189,
"num_tokens": 893320.0,
"step": 1570
},
{
"epoch": 50.416,
"grad_norm": 1.9504714012145996,
"learning_rate": 1.554126951587053e-05,
"loss": 0.9572,
"mean_token_accuracy": 0.7577113211154938,
"num_tokens": 908230.0,
"step": 1580
},
{
"epoch": 50.736,
"grad_norm": 1.8482609987258911,
"learning_rate": 1.548864629476288e-05,
"loss": 0.9715,
"mean_token_accuracy": 0.7632556769996881,
"num_tokens": 925533.0,
"step": 1590
},
{
"epoch": 51.032,
"grad_norm": 1.7342660427093506,
"learning_rate": 1.5435804647388003e-05,
"loss": 1.0049,
"mean_token_accuracy": 0.753706334410487,
"num_tokens": 940557.0,
"step": 1600
},
{
"epoch": 51.352,
"grad_norm": 1.7231630086898804,
"learning_rate": 1.5382746676633053e-05,
"loss": 0.9577,
"mean_token_accuracy": 0.7602146591991186,
"num_tokens": 955898.0,
"step": 1610
},
{
"epoch": 51.672,
"grad_norm": 1.9401224851608276,
"learning_rate": 1.5329474493993984e-05,
"loss": 0.9607,
"mean_token_accuracy": 0.7621455781161786,
"num_tokens": 972435.0,
"step": 1620
},
{
"epoch": 51.992,
"grad_norm": 2.089966297149658,
"learning_rate": 1.5275990219491553e-05,
"loss": 0.9482,
"mean_token_accuracy": 0.762396826967597,
"num_tokens": 988434.0,
"step": 1630
},
{
"epoch": 52.288,
"grad_norm": 1.7538946866989136,
"learning_rate": 1.522229598158691e-05,
"loss": 0.9943,
"mean_token_accuracy": 0.7541912862577954,
"num_tokens": 1001536.0,
"step": 1640
},
{
"epoch": 52.608,
"grad_norm": 1.8982934951782227,
"learning_rate": 1.5168393917096917e-05,
"loss": 0.9258,
"mean_token_accuracy": 0.7704043008387089,
"num_tokens": 1018633.0,
"step": 1650
},
{
"epoch": 52.928,
"grad_norm": 2.195676803588867,
"learning_rate": 1.5114286171109109e-05,
"loss": 0.9363,
"mean_token_accuracy": 0.7571658097207546,
"num_tokens": 1035378.0,
"step": 1660
},
{
"epoch": 53.224,
"grad_norm": 1.6514253616333008,
"learning_rate": 1.5059974896896324e-05,
"loss": 1.0186,
"mean_token_accuracy": 0.7523992178407876,
"num_tokens": 1050349.0,
"step": 1670
},
{
"epoch": 53.544,
"grad_norm": 2.6811511516571045,
"learning_rate": 1.5005462255831014e-05,
"loss": 1.0254,
"mean_token_accuracy": 0.7484087854623794,
"num_tokens": 1064873.0,
"step": 1680
},
{
"epoch": 53.864,
"grad_norm": 2.0554141998291016,
"learning_rate": 1.4950750417299227e-05,
"loss": 0.878,
"mean_token_accuracy": 0.7748427361249923,
"num_tokens": 1082105.0,
"step": 1690
},
{
"epoch": 54.16,
"grad_norm": 1.614017128944397,
"learning_rate": 1.489584155861428e-05,
"loss": 0.9688,
"mean_token_accuracy": 0.7686513742885074,
"num_tokens": 1097886.0,
"step": 1700
},
{
"epoch": 54.48,
"grad_norm": 1.7928838729858398,
"learning_rate": 1.4840737864930106e-05,
"loss": 0.8874,
"mean_token_accuracy": 0.7716922122985125,
"num_tokens": 1112624.0,
"step": 1710
},
{
"epoch": 54.8,
"grad_norm": 1.688085675239563,
"learning_rate": 1.4785441529154294e-05,
"loss": 0.9361,
"mean_token_accuracy": 0.767570473998785,
"num_tokens": 1129549.0,
"step": 1720
},
{
"epoch": 55.096,
"grad_norm": 1.3455687761306763,
"learning_rate": 1.4729954751860827e-05,
"loss": 1.0524,
"mean_token_accuracy": 0.7470491971518542,
"num_tokens": 1145039.0,
"step": 1730
},
{
"epoch": 55.416,
"grad_norm": 1.7406009435653687,
"learning_rate": 1.4674279741202495e-05,
"loss": 0.8839,
"mean_token_accuracy": 0.7727594949305058,
"num_tokens": 1159810.0,
"step": 1740
},
{
"epoch": 55.736,
"grad_norm": 2.1520540714263916,
"learning_rate": 1.4618418712823028e-05,
"loss": 0.9652,
"mean_token_accuracy": 0.7532628539949655,
"num_tokens": 1176245.0,
"step": 1750
},
{
"epoch": 56.032,
"grad_norm": 1.581739902496338,
"learning_rate": 1.4562373889768927e-05,
"loss": 0.9332,
"mean_token_accuracy": 0.7696672396079914,
"num_tokens": 1191008.0,
"step": 1760
},
{
"epoch": 56.352,
"grad_norm": 1.6474453210830688,
"learning_rate": 1.4506147502400977e-05,
"loss": 0.8376,
"mean_token_accuracy": 0.772033654898405,
"num_tokens": 1205755.0,
"step": 1770
},
{
"epoch": 56.672,
"grad_norm": 1.8299458026885986,
"learning_rate": 1.4449741788305514e-05,
"loss": 0.9889,
"mean_token_accuracy": 0.760890544205904,
"num_tokens": 1221863.0,
"step": 1780
},
{
"epoch": 56.992,
"grad_norm": 1.6759440898895264,
"learning_rate": 1.4393158992205348e-05,
"loss": 0.9799,
"mean_token_accuracy": 0.7623420935124159,
"num_tokens": 1238647.0,
"step": 1790
},
{
"epoch": 57.288,
"grad_norm": 2.1239564418792725,
"learning_rate": 1.4336401365870466e-05,
"loss": 0.9944,
"mean_token_accuracy": 0.7618524045557589,
"num_tokens": 1253030.0,
"step": 1800
},
{
"epoch": 57.608,
"grad_norm": 2.75298810005188,
"learning_rate": 1.4279471168028382e-05,
"loss": 0.9822,
"mean_token_accuracy": 0.7654153741896152,
"num_tokens": 1269147.0,
"step": 1810
},
{
"epoch": 57.928,
"grad_norm": 1.8775372505187988,
"learning_rate": 1.422237066427429e-05,
"loss": 0.8866,
"mean_token_accuracy": 0.7653848383575678,
"num_tokens": 1285368.0,
"step": 1820
},
{
"epoch": 58.224,
"grad_norm": 1.6810104846954346,
"learning_rate": 1.416510212698086e-05,
"loss": 0.9072,
"mean_token_accuracy": 0.7690872151303936,
"num_tokens": 1300660.0,
"step": 1830
},
{
"epoch": 58.544,
"grad_norm": 1.914070725440979,
"learning_rate": 1.4107667835207844e-05,
"loss": 1.0272,
"mean_token_accuracy": 0.7550359651446342,
"num_tokens": 1317143.0,
"step": 1840
},
{
"epoch": 58.864,
"grad_norm": 2.164189338684082,
"learning_rate": 1.4050070074611355e-05,
"loss": 0.9304,
"mean_token_accuracy": 0.7650556772947311,
"num_tokens": 1332705.0,
"step": 1850
},
{
"epoch": 59.16,
"grad_norm": 2.7804877758026123,
"learning_rate": 1.3992311137352918e-05,
"loss": 0.8424,
"mean_token_accuracy": 0.7625659327652003,
"num_tokens": 1345993.0,
"step": 1860
},
{
"epoch": 59.48,
"grad_norm": 1.7922106981277466,
"learning_rate": 1.3934393322008241e-05,
"loss": 0.8732,
"mean_token_accuracy": 0.7774093203246594,
"num_tokens": 1362688.0,
"step": 1870
},
{
"epoch": 59.8,
"grad_norm": 1.39845609664917,
"learning_rate": 1.387631893347575e-05,
"loss": 0.8986,
"mean_token_accuracy": 0.7775574192404747,
"num_tokens": 1379021.0,
"step": 1880
},
{
"epoch": 60.096,
"grad_norm": 2.3520147800445557,
"learning_rate": 1.3818090282884869e-05,
"loss": 0.9055,
"mean_token_accuracy": 0.7666742781529555,
"num_tokens": 1394388.0,
"step": 1890
},
{
"epoch": 60.416,
"grad_norm": 1.9911949634552002,
"learning_rate": 1.3759709687504022e-05,
"loss": 0.9495,
"mean_token_accuracy": 0.7690058574080467,
"num_tokens": 1410943.0,
"step": 1900
},
{
"epoch": 60.736,
"grad_norm": 2.0429329872131348,
"learning_rate": 1.3701179470648444e-05,
"loss": 0.9081,
"mean_token_accuracy": 0.764681476354599,
"num_tokens": 1428993.0,
"step": 1910
},
{
"epoch": 61.032,
"grad_norm": 1.7519456148147583,
"learning_rate": 1.36425019615877e-05,
"loss": 0.9026,
"mean_token_accuracy": 0.7673927166977444,
"num_tokens": 1441530.0,
"step": 1920
},
{
"epoch": 61.352,
"grad_norm": 2.105077028274536,
"learning_rate": 1.3583679495453e-05,
"loss": 0.8834,
"mean_token_accuracy": 0.7748925991356372,
"num_tokens": 1459071.0,
"step": 1930
},
{
"epoch": 61.672,
"grad_norm": 1.9322600364685059,
"learning_rate": 1.3524714413144282e-05,
"loss": 0.91,
"mean_token_accuracy": 0.7671246759593486,
"num_tokens": 1474214.0,
"step": 1940
},
{
"epoch": 61.992,
"grad_norm": 2.1808035373687744,
"learning_rate": 1.346560906123702e-05,
"loss": 0.915,
"mean_token_accuracy": 0.7675775479525327,
"num_tokens": 1489457.0,
"step": 1950
},
{
"epoch": 62.288,
"grad_norm": 1.765626311302185,
"learning_rate": 1.3406365791888865e-05,
"loss": 1.0076,
"mean_token_accuracy": 0.7589444365050342,
"num_tokens": 1504842.0,
"step": 1960
},
{
"epoch": 62.608,
"grad_norm": 2.268444061279297,
"learning_rate": 1.3346986962746038e-05,
"loss": 0.8381,
"mean_token_accuracy": 0.780813368782401,
"num_tokens": 1519276.0,
"step": 1970
},
{
"epoch": 62.928,
"grad_norm": 1.4477503299713135,
"learning_rate": 1.32874749368495e-05,
"loss": 0.925,
"mean_token_accuracy": 0.7658030860126018,
"num_tokens": 1535446.0,
"step": 1980
},
{
"epoch": 63.224,
"grad_norm": 2.732478618621826,
"learning_rate": 1.3227832082540908e-05,
"loss": 0.9051,
"mean_token_accuracy": 0.7650254467451895,
"num_tokens": 1550674.0,
"step": 1990
},
{
"epoch": 63.544,
"grad_norm": 2.2961671352386475,
"learning_rate": 1.3168060773368375e-05,
"loss": 0.9873,
"mean_token_accuracy": 0.7675742536783219,
"num_tokens": 1564485.0,
"step": 2000
},
{
"epoch": 64.832,
"grad_norm": 2.223515272140503,
"learning_rate": 1.3108163387991993e-05,
"loss": 0.8791,
"mean_token_accuracy": 0.7629961850121617,
"num_tokens": 16127.0,
"step": 2010
},
{
"epoch": 65.16,
"grad_norm": 2.646225690841675,
"learning_rate": 1.30481423100892e-05,
"loss": 0.9661,
"mean_token_accuracy": 0.768963757811523,
"num_tokens": 31851.0,
"step": 2020
},
{
"epoch": 65.48,
"grad_norm": 2.350883722305298,
"learning_rate": 1.2987999928259897e-05,
"loss": 0.9412,
"mean_token_accuracy": 0.7736104667186737,
"num_tokens": 47921.0,
"step": 2030
},
{
"epoch": 65.8,
"grad_norm": 1.8255304098129272,
"learning_rate": 1.2927738635931402e-05,
"loss": 0.9436,
"mean_token_accuracy": 0.7672818608582019,
"num_tokens": 64830.0,
"step": 2040
},
{
"epoch": 66.096,
"grad_norm": 1.562624454498291,
"learning_rate": 1.2867360831263191e-05,
"loss": 0.8713,
"mean_token_accuracy": 0.7805772717740085,
"num_tokens": 79811.0,
"step": 2050
},
{
"epoch": 66.416,
"grad_norm": 2.139047145843506,
"learning_rate": 1.280686891705147e-05,
"loss": 0.9338,
"mean_token_accuracy": 0.7658140640705824,
"num_tokens": 95253.0,
"step": 2060
},
{
"epoch": 66.736,
"grad_norm": 1.71339750289917,
"learning_rate": 1.2746265300633556e-05,
"loss": 0.8785,
"mean_token_accuracy": 0.7795989379286766,
"num_tokens": 110479.0,
"step": 2070
},
{
"epoch": 67.032,
"grad_norm": 2.284088134765625,
"learning_rate": 1.268555239379206e-05,
"loss": 0.9184,
"mean_token_accuracy": 0.7648406020692877,
"num_tokens": 125681.0,
"step": 2080
},
{
"epoch": 67.352,
"grad_norm": 1.8913801908493042,
"learning_rate": 1.2624732612658923e-05,
"loss": 0.8725,
"mean_token_accuracy": 0.7707512844353914,
"num_tokens": 141796.0,
"step": 2090
},
{
"epoch": 67.672,
"grad_norm": 1.9446955919265747,
"learning_rate": 1.2563808377619253e-05,
"loss": 1.0064,
"mean_token_accuracy": 0.7678989730775356,
"num_tokens": 158724.0,
"step": 2100
},
{
"epoch": 67.992,
"grad_norm": 2.2045912742614746,
"learning_rate": 1.250278211321501e-05,
"loss": 0.7989,
"mean_token_accuracy": 0.7780600219964982,
"num_tokens": 173629.0,
"step": 2110
},
{
"epoch": 68.288,
"grad_norm": 1.9509689807891846,
"learning_rate": 1.244165624804852e-05,
"loss": 0.8634,
"mean_token_accuracy": 0.7794965231740797,
"num_tokens": 188518.0,
"step": 2120
},
{
"epoch": 68.608,
"grad_norm": 2.1073553562164307,
"learning_rate": 1.2380433214685813e-05,
"loss": 0.8981,
"mean_token_accuracy": 0.7774934440851211,
"num_tokens": 205654.0,
"step": 2130
},
{
"epoch": 68.928,
"grad_norm": 1.652787208557129,
"learning_rate": 1.2319115449559835e-05,
"loss": 0.8801,
"mean_token_accuracy": 0.7723641652613878,
"num_tokens": 220311.0,
"step": 2140
},
{
"epoch": 69.224,
"grad_norm": 2.534707546234131,
"learning_rate": 1.2257705392873476e-05,
"loss": 0.8723,
"mean_token_accuracy": 0.7854163485604364,
"num_tokens": 236282.0,
"step": 2150
},
{
"epoch": 69.544,
"grad_norm": 1.578347086906433,
"learning_rate": 1.2196205488502463e-05,
"loss": 0.8169,
"mean_token_accuracy": 0.7866261303424835,
"num_tokens": 252837.0,
"step": 2160
},
{
"epoch": 69.864,
"grad_norm": 2.228119373321533,
"learning_rate": 1.2134618183898105e-05,
"loss": 0.9254,
"mean_token_accuracy": 0.7748822212219239,
"num_tokens": 267785.0,
"step": 2170
},
{
"epoch": 70.16,
"grad_norm": 2.410616159439087,
"learning_rate": 1.2072945929989888e-05,
"loss": 0.8046,
"mean_token_accuracy": 0.7787431329488754,
"num_tokens": 281535.0,
"step": 2180
},
{
"epoch": 70.48,
"grad_norm": 1.7590594291687012,
"learning_rate": 1.201119118108794e-05,
"loss": 0.8912,
"mean_token_accuracy": 0.7787077182903885,
"num_tokens": 298775.0,
"step": 2190
},
{
"epoch": 70.8,
"grad_norm": 3.3293755054473877,
"learning_rate": 1.1949356394785373e-05,
"loss": 0.9112,
"mean_token_accuracy": 0.7765318274497985,
"num_tokens": 314484.0,
"step": 2200
},
{
"epoch": 71.096,
"grad_norm": 2.363255739212036,
"learning_rate": 1.1887444031860456e-05,
"loss": 0.9063,
"mean_token_accuracy": 0.776000738546655,
"num_tokens": 327608.0,
"step": 2210
},
{
"epoch": 71.416,
"grad_norm": 1.7942370176315308,
"learning_rate": 1.1825456556178705e-05,
"loss": 0.8095,
"mean_token_accuracy": 0.7899976089596749,
"num_tokens": 345798.0,
"step": 2220
},
{
"epoch": 71.736,
"grad_norm": 1.9774558544158936,
"learning_rate": 1.1763396434594823e-05,
"loss": 0.9154,
"mean_token_accuracy": 0.7691428020596505,
"num_tokens": 361462.0,
"step": 2230
},
{
"epoch": 72.032,
"grad_norm": 1.6556707620620728,
"learning_rate": 1.1701266136854532e-05,
"loss": 0.8829,
"mean_token_accuracy": 0.7704721173724612,
"num_tokens": 376304.0,
"step": 2240
},
{
"epoch": 72.352,
"grad_norm": 2.80587100982666,
"learning_rate": 1.1639068135496285e-05,
"loss": 0.9485,
"mean_token_accuracy": 0.7709558174014092,
"num_tokens": 390379.0,
"step": 2250
},
{
"epoch": 72.672,
"grad_norm": 2.0841872692108154,
"learning_rate": 1.1576804905752873e-05,
"loss": 0.9589,
"mean_token_accuracy": 0.7605574566870927,
"num_tokens": 408237.0,
"step": 2260
},
{
"epoch": 72.992,
"grad_norm": 2.8403215408325195,
"learning_rate": 1.1514478925452905e-05,
"loss": 0.7252,
"mean_token_accuracy": 0.7972878247499466,
"num_tokens": 423763.0,
"step": 2270
},
{
"epoch": 73.288,
"grad_norm": 6.484622955322266,
"learning_rate": 1.1452092674922224e-05,
"loss": 0.9519,
"mean_token_accuracy": 0.7691420135465828,
"num_tokens": 437835.0,
"step": 2280
},
{
"epoch": 73.608,
"grad_norm": 2.27260160446167,
"learning_rate": 1.1389648636885186e-05,
"loss": 0.8394,
"mean_token_accuracy": 0.7912575013935566,
"num_tokens": 455397.0,
"step": 2290
},
{
"epoch": 73.928,
"grad_norm": 1.805159091949463,
"learning_rate": 1.132714929636586e-05,
"loss": 0.8545,
"mean_token_accuracy": 0.7838539175689221,
"num_tokens": 471371.0,
"step": 2300
},
{
"epoch": 74.224,
"grad_norm": 2.250121593475342,
"learning_rate": 1.1264597140589127e-05,
"loss": 0.8243,
"mean_token_accuracy": 0.7824344949142353,
"num_tokens": 486629.0,
"step": 2310
},
{
"epoch": 74.544,
"grad_norm": 3.2095444202423096,
"learning_rate": 1.120199465888171e-05,
"loss": 0.8556,
"mean_token_accuracy": 0.7762523703277111,
"num_tokens": 501461.0,
"step": 2320
},
{
"epoch": 74.864,
"grad_norm": 2.3047547340393066,
"learning_rate": 1.1139344342573106e-05,
"loss": 0.8754,
"mean_token_accuracy": 0.7786926485598087,
"num_tokens": 516976.0,
"step": 2330
},
{
"epoch": 75.16,
"grad_norm": 2.0419108867645264,
"learning_rate": 1.1076648684896441e-05,
"loss": 0.8008,
"mean_token_accuracy": 0.7848166005836951,
"num_tokens": 532021.0,
"step": 2340
},
{
"epoch": 75.48,
"grad_norm": 2.602372646331787,
"learning_rate": 1.101391018088923e-05,
"loss": 0.9487,
"mean_token_accuracy": 0.7746396526694298,
"num_tokens": 546596.0,
"step": 2350
},
{
"epoch": 75.8,
"grad_norm": 1.9730421304702759,
"learning_rate": 1.0951131327294123e-05,
"loss": 0.8744,
"mean_token_accuracy": 0.7984356313943863,
"num_tokens": 563545.0,
"step": 2360
},
{
"epoch": 76.096,
"grad_norm": 2.331416130065918,
"learning_rate": 1.0888314622459509e-05,
"loss": 0.8102,
"mean_token_accuracy": 0.7831854063111383,
"num_tokens": 578977.0,
"step": 2370
},
{
"epoch": 76.416,
"grad_norm": 2.8027427196502686,
"learning_rate": 1.082546256624011e-05,
"loss": 0.8598,
"mean_token_accuracy": 0.7751214955002069,
"num_tokens": 594479.0,
"step": 2380
},
{
"epoch": 76.736,
"grad_norm": 1.8376470804214478,
"learning_rate": 1.0762577659897495e-05,
"loss": 0.8722,
"mean_token_accuracy": 0.7737262137234211,
"num_tokens": 611581.0,
"step": 2390
},
{
"epoch": 77.032,
"grad_norm": 2.3731982707977295,
"learning_rate": 1.0699662406000533e-05,
"loss": 0.8581,
"mean_token_accuracy": 0.7886427938938141,
"num_tokens": 626188.0,
"step": 2400
},
{
"epoch": 77.352,
"grad_norm": 1.711204171180725,
"learning_rate": 1.0636719308325803e-05,
"loss": 0.9216,
"mean_token_accuracy": 0.7730351705104113,
"num_tokens": 643408.0,
"step": 2410
},
{
"epoch": 77.672,
"grad_norm": 1.7660971879959106,
"learning_rate": 1.0573750871757965e-05,
"loss": 0.7626,
"mean_token_accuracy": 0.7915604203939438,
"num_tokens": 657604.0,
"step": 2420
},
{
"epoch": 77.992,
"grad_norm": 2.0509514808654785,
"learning_rate": 1.0510759602190055e-05,
"loss": 0.8603,
"mean_token_accuracy": 0.784786606580019,
"num_tokens": 674373.0,
"step": 2430
},
{
"epoch": 78.288,
"grad_norm": 2.348026752471924,
"learning_rate": 1.0447748006423775e-05,
"loss": 0.8823,
"mean_token_accuracy": 0.7760254515183939,
"num_tokens": 690196.0,
"step": 2440
},
{
"epoch": 78.608,
"grad_norm": 2.094943046569824,
"learning_rate": 1.0384718592069733e-05,
"loss": 0.8474,
"mean_token_accuracy": 0.7716075176373124,
"num_tokens": 706149.0,
"step": 2450
},
{
"epoch": 78.928,
"grad_norm": 2.465407609939575,
"learning_rate": 1.0321673867447642e-05,
"loss": 0.8644,
"mean_token_accuracy": 0.786153320223093,
"num_tokens": 721536.0,
"step": 2460
},
{
"epoch": 79.224,
"grad_norm": 2.3234193325042725,
"learning_rate": 1.0258616341486505e-05,
"loss": 0.9199,
"mean_token_accuracy": 0.7744305520444303,
"num_tokens": 737605.0,
"step": 2470
},
{
"epoch": 79.544,
"grad_norm": 1.9042166471481323,
"learning_rate": 1.019554852362476e-05,
"loss": 0.8054,
"mean_token_accuracy": 0.7926479011774064,
"num_tokens": 753913.0,
"step": 2480
},
{
"epoch": 79.864,
"grad_norm": 2.5160131454467773,
"learning_rate": 1.0132472923710437e-05,
"loss": 0.8329,
"mean_token_accuracy": 0.7762512426823378,
"num_tokens": 769204.0,
"step": 2490
},
{
"epoch": 80.16,
"grad_norm": 2.8922526836395264,
"learning_rate": 1.0069392051901241e-05,
"loss": 0.8492,
"mean_token_accuracy": 0.7814656487993292,
"num_tokens": 784216.0,
"step": 2500
},
{
"epoch": 80.48,
"grad_norm": 2.763730049133301,
"learning_rate": 1.0006308418564697e-05,
"loss": 0.8454,
"mean_token_accuracy": 0.7843520522117615,
"num_tokens": 800421.0,
"step": 2510
},
{
"epoch": 80.8,
"grad_norm": 2.41654372215271,
"learning_rate": 9.94322453417821e-06,
"loss": 0.7333,
"mean_token_accuracy": 0.8014784809201956,
"num_tokens": 815977.0,
"step": 2520
},
{
"epoch": 81.096,
"grad_norm": 2.7866134643554688,
"learning_rate": 9.880142909229188e-06,
"loss": 0.8167,
"mean_token_accuracy": 0.7932786933473639,
"num_tokens": 829350.0,
"step": 2530
},
{
"epoch": 81.416,
"grad_norm": 1.8219573497772217,
"learning_rate": 9.817066054115117e-06,
"loss": 0.8731,
"mean_token_accuracy": 0.7871743485331535,
"num_tokens": 845363.0,
"step": 2540
},
{
"epoch": 81.736,
"grad_norm": 1.9417917728424072,
"learning_rate": 9.753996479043672e-06,
"loss": 0.8828,
"mean_token_accuracy": 0.7732684839516878,
"num_tokens": 862288.0,
"step": 2550
},
{
"epoch": 82.032,
"grad_norm": 1.784688949584961,
"learning_rate": 9.690936693932793e-06,
"loss": 0.8167,
"mean_token_accuracy": 0.791683446716618,
"num_tokens": 876947.0,
"step": 2560
},
{
"epoch": 82.352,
"grad_norm": 2.9179623126983643,
"learning_rate": 9.627889208310831e-06,
"loss": 0.8395,
"mean_token_accuracy": 0.7839712589979172,
"num_tokens": 891614.0,
"step": 2570
},
{
"epoch": 82.672,
"grad_norm": 1.7839528322219849,
"learning_rate": 9.564856531216666e-06,
"loss": 0.8015,
"mean_token_accuracy": 0.8007228754460811,
"num_tokens": 909761.0,
"step": 2580
},
{
"epoch": 82.992,
"grad_norm": 2.202512741088867,
"learning_rate": 9.50184117109986e-06,
"loss": 0.8684,
"mean_token_accuracy": 0.7817719358950853,
"num_tokens": 925239.0,
"step": 2590
},
{
"epoch": 83.288,
"grad_norm": 2.0895087718963623,
"learning_rate": 9.438845635720817e-06,
"loss": 0.8603,
"mean_token_accuracy": 0.7858564757012032,
"num_tokens": 940941.0,
"step": 2600
},
{
"epoch": 84.192,
"grad_norm": 1.9542677402496338,
"learning_rate": 9.375872432051006e-06,
"loss": 0.9471,
"mean_token_accuracy": 0.780240989312893,
"num_tokens": 17808.0,
"step": 2610
},
{
"epoch": 84.512,
"grad_norm": 2.257493257522583,
"learning_rate": 9.312924066173178e-06,
"loss": 0.8627,
"mean_token_accuracy": 0.7828688979148865,
"num_tokens": 33397.0,
"step": 2620
},
{
"epoch": 84.832,
"grad_norm": 1.9773480892181396,
"learning_rate": 9.25000304318164e-06,
"loss": 0.8219,
"mean_token_accuracy": 0.7853579100221395,
"num_tokens": 49526.0,
"step": 2630
},
{
"epoch": 85.128,
"grad_norm": 2.3270950317382812,
"learning_rate": 9.187111867082568e-06,
"loss": 0.8709,
"mean_token_accuracy": 0.7843060026297698,
"num_tokens": 64196.0,
"step": 2640
},
{
"epoch": 85.448,
"grad_norm": 1.751816749572754,
"learning_rate": 9.124253040694334e-06,
"loss": 0.8058,
"mean_token_accuracy": 0.7842564310878515,
"num_tokens": 82640.0,
"step": 2650
},
{
"epoch": 85.768,
"grad_norm": 2.2109670639038086,
"learning_rate": 9.061429065547933e-06,
"loss": 0.7968,
"mean_token_accuracy": 0.7830525517463685,
"num_tokens": 96760.0,
"step": 2660
},
{
"epoch": 86.064,
"grad_norm": 1.9558287858963013,
"learning_rate": 8.998642441787417e-06,
"loss": 0.7927,
"mean_token_accuracy": 0.7956994892777624,
"num_tokens": 110140.0,
"step": 2670
},
{
"epoch": 86.384,
"grad_norm": 2.14022159576416,
"learning_rate": 8.935895668070405e-06,
"loss": 0.8324,
"mean_token_accuracy": 0.7832688026130199,
"num_tokens": 125468.0,
"step": 2680
},
{
"epoch": 86.704,
"grad_norm": 2.9838459491729736,
"learning_rate": 8.873191241468631e-06,
"loss": 0.8433,
"mean_token_accuracy": 0.7785748850554228,
"num_tokens": 141001.0,
"step": 2690
},
{
"epoch": 87.0,
"grad_norm": 3.1861932277679443,
"learning_rate": 8.810531657368594e-06,
"loss": 0.8058,
"mean_token_accuracy": 0.794649675891206,
"num_tokens": 156865.0,
"step": 2700
},
{
"epoch": 87.32,
"grad_norm": 1.9827327728271484,
"learning_rate": 8.747919409372236e-06,
"loss": 0.8459,
"mean_token_accuracy": 0.7892976485192775,
"num_tokens": 173221.0,
"step": 2710
},
{
"epoch": 87.64,
"grad_norm": 2.972670078277588,
"learning_rate": 8.685356989197717e-06,
"loss": 0.8232,
"mean_token_accuracy": 0.7798152294009923,
"num_tokens": 188954.0,
"step": 2720
},
{
"epoch": 87.96,
"grad_norm": 2.67842173576355,
"learning_rate": 8.62284688658023e-06,
"loss": 0.8046,
"mean_token_accuracy": 0.79255036637187,
"num_tokens": 205013.0,
"step": 2730
},
{
"epoch": 88.256,
"grad_norm": 1.817650556564331,
"learning_rate": 8.56039158917296e-06,
"loss": 0.8276,
"mean_token_accuracy": 0.7901485256246619,
"num_tokens": 219723.0,
"step": 2740
},
{
"epoch": 88.576,
"grad_norm": 1.7845501899719238,
"learning_rate": 8.497993582448044e-06,
"loss": 0.8554,
"mean_token_accuracy": 0.7930382348597049,
"num_tokens": 235112.0,
"step": 2750
},
{
"epoch": 88.896,
"grad_norm": 2.3108439445495605,
"learning_rate": 8.43565534959769e-06,
"loss": 0.8519,
"mean_token_accuracy": 0.7890813775360584,
"num_tokens": 252361.0,
"step": 2760
},
{
"epoch": 89.192,
"grad_norm": 1.8335771560668945,
"learning_rate": 8.373379371435346e-06,
"loss": 0.7812,
"mean_token_accuracy": 0.7960183253964862,
"num_tokens": 265617.0,
"step": 2770
},
{
"epoch": 89.512,
"grad_norm": 2.717653512954712,
"learning_rate": 8.31116812629696e-06,
"loss": 0.8319,
"mean_token_accuracy": 0.7901519671082496,
"num_tokens": 281770.0,
"step": 2780
},
{
"epoch": 89.832,
"grad_norm": 1.9572986364364624,
"learning_rate": 8.249024089942364e-06,
"loss": 0.7733,
"mean_token_accuracy": 0.7931222733110189,
"num_tokens": 298511.0,
"step": 2790
},
{
"epoch": 90.128,
"grad_norm": 1.8655132055282593,
"learning_rate": 8.186949735456758e-06,
"loss": 0.9238,
"mean_token_accuracy": 0.7922740490049929,
"num_tokens": 312957.0,
"step": 2800
},
{
"epoch": 90.448,
"grad_norm": 2.0918149948120117,
"learning_rate": 8.12494753315228e-06,
"loss": 0.8428,
"mean_token_accuracy": 0.7884520322084427,
"num_tokens": 330412.0,
"step": 2810
},
{
"epoch": 90.768,
"grad_norm": 1.7944889068603516,
"learning_rate": 8.063019950469688e-06,
"loss": 0.8145,
"mean_token_accuracy": 0.7932636447250843,
"num_tokens": 345474.0,
"step": 2820
},
{
"epoch": 91.064,
"grad_norm": 1.7774523496627808,
"learning_rate": 8.001169451880186e-06,
"loss": 0.7867,
"mean_token_accuracy": 0.7842674186906299,
"num_tokens": 360670.0,
"step": 2830
},
{
"epoch": 91.384,
"grad_norm": 2.441330909729004,
"learning_rate": 7.939398498787332e-06,
"loss": 0.835,
"mean_token_accuracy": 0.7940668806433677,
"num_tokens": 375578.0,
"step": 2840
},
{
"epoch": 91.704,
"grad_norm": 1.986222505569458,
"learning_rate": 7.877709549429092e-06,
"loss": 0.8162,
"mean_token_accuracy": 0.7950244933366776,
"num_tokens": 392683.0,
"step": 2850
},
{
"epoch": 92.0,
"grad_norm": 4.878885269165039,
"learning_rate": 7.816105058780019e-06,
"loss": 0.788,
"mean_token_accuracy": 0.7837782482037673,
"num_tokens": 407330.0,
"step": 2860
},
{
"epoch": 92.32,
"grad_norm": 2.343815326690674,
"learning_rate": 7.754587478453528e-06,
"loss": 0.7753,
"mean_token_accuracy": 0.7878943778574466,
"num_tokens": 420579.0,
"step": 2870
},
{
"epoch": 92.64,
"grad_norm": 2.5471577644348145,
"learning_rate": 7.69315925660436e-06,
"loss": 0.88,
"mean_token_accuracy": 0.791867159307003,
"num_tokens": 438517.0,
"step": 2880
},
{
"epoch": 92.96,
"grad_norm": 2.2550160884857178,
"learning_rate": 7.631822837831143e-06,
"loss": 0.8228,
"mean_token_accuracy": 0.7879139900207519,
"num_tokens": 455707.0,
"step": 2890
},
{
"epoch": 93.256,
"grad_norm": 2.0642154216766357,
"learning_rate": 7.570580663079114e-06,
"loss": 0.8605,
"mean_token_accuracy": 0.7856367556629954,
"num_tokens": 469780.0,
"step": 2900
},
{
"epoch": 93.576,
"grad_norm": 2.1604714393615723,
"learning_rate": 7.509435169542961e-06,
"loss": 0.7849,
"mean_token_accuracy": 0.7887919537723065,
"num_tokens": 484586.0,
"step": 2910
},
{
"epoch": 93.896,
"grad_norm": 2.2268590927124023,
"learning_rate": 7.448388790569851e-06,
"loss": 0.8657,
"mean_token_accuracy": 0.7843763899058104,
"num_tokens": 502557.0,
"step": 2920
},
{
"epoch": 94.192,
"grad_norm": 1.8110442161560059,
"learning_rate": 7.387443955562586e-06,
"loss": 0.7889,
"mean_token_accuracy": 0.7898652823390188,
"num_tokens": 516331.0,
"step": 2930
},
{
"epoch": 94.512,
"grad_norm": 2.456662178039551,
"learning_rate": 7.326603089882925e-06,
"loss": 0.7788,
"mean_token_accuracy": 0.7980688564479351,
"num_tokens": 532511.0,
"step": 2940
},
{
"epoch": 94.832,
"grad_norm": 2.060681104660034,
"learning_rate": 7.26586861475506e-06,
"loss": 0.7585,
"mean_token_accuracy": 0.7954543896019459,
"num_tokens": 549222.0,
"step": 2950
},
{
"epoch": 95.128,
"grad_norm": 2.5429089069366455,
"learning_rate": 7.205242947169258e-06,
"loss": 0.8637,
"mean_token_accuracy": 0.7921945170776264,
"num_tokens": 563980.0,
"step": 2960
},
{
"epoch": 95.448,
"grad_norm": 2.3039979934692383,
"learning_rate": 7.144728499785683e-06,
"loss": 0.7492,
"mean_token_accuracy": 0.801618828624487,
"num_tokens": 579326.0,
"step": 2970
},
{
"epoch": 95.768,
"grad_norm": 1.8464511632919312,
"learning_rate": 7.0843276808383785e-06,
"loss": 0.8439,
"mean_token_accuracy": 0.7837361056357622,
"num_tokens": 596726.0,
"step": 2980
},
{
"epoch": 96.064,
"grad_norm": 2.409407377243042,
"learning_rate": 7.024042894039434e-06,
"loss": 0.7315,
"mean_token_accuracy": 0.7905531976674054,
"num_tokens": 611478.0,
"step": 2990
},
{
"epoch": 96.384,
"grad_norm": 3.4677658081054688,
"learning_rate": 6.963876538483305e-06,
"loss": 0.7926,
"mean_token_accuracy": 0.7856792386621236,
"num_tokens": 626726.0,
"step": 3000
},
{
"epoch": 96.704,
"grad_norm": 2.2152152061462402,
"learning_rate": 6.9038310085513716e-06,
"loss": 0.8723,
"mean_token_accuracy": 0.781861812621355,
"num_tokens": 641499.0,
"step": 3010
},
{
"epoch": 97.0,
"grad_norm": 2.4535887241363525,
"learning_rate": 6.843908693816627e-06,
"loss": 0.8416,
"mean_token_accuracy": 0.8028259905608924,
"num_tokens": 657795.0,
"step": 3020
},
{
"epoch": 97.32,
"grad_norm": 1.937121033668518,
"learning_rate": 6.784111978948596e-06,
"loss": 0.746,
"mean_token_accuracy": 0.7986438237130642,
"num_tokens": 673802.0,
"step": 3030
},
{
"epoch": 97.64,
"grad_norm": 1.616132140159607,
"learning_rate": 6.724443243618421e-06,
"loss": 0.8305,
"mean_token_accuracy": 0.7848228823393584,
"num_tokens": 690896.0,
"step": 3040
},
{
"epoch": 97.96,
"grad_norm": 2.3996787071228027,
"learning_rate": 6.664904862404175e-06,
"loss": 0.8508,
"mean_token_accuracy": 0.7884074129164219,
"num_tokens": 705680.0,
"step": 3050
},
{
"epoch": 98.256,
"grad_norm": 3.018188714981079,
"learning_rate": 6.605499204696351e-06,
"loss": 0.8035,
"mean_token_accuracy": 0.801042732354757,
"num_tokens": 720238.0,
"step": 3060
},
{
"epoch": 98.576,
"grad_norm": 2.550436496734619,
"learning_rate": 6.546228634603578e-06,
"loss": 0.7711,
"mean_token_accuracy": 0.798908605799079,
"num_tokens": 735457.0,
"step": 3070
},
{
"epoch": 98.896,
"grad_norm": 3.060084819793701,
"learning_rate": 6.487095510858543e-06,
"loss": 0.9337,
"mean_token_accuracy": 0.7785589572042226,
"num_tokens": 752742.0,
"step": 3080
},
{
"epoch": 99.192,
"grad_norm": 2.1915123462677,
"learning_rate": 6.428102186724101e-06,
"loss": 0.9185,
"mean_token_accuracy": 0.7807568505003646,
"num_tokens": 765549.0,
"step": 3090
},
{
"epoch": 99.512,
"grad_norm": 2.3755106925964355,
"learning_rate": 6.369251009899644e-06,
"loss": 0.7954,
"mean_token_accuracy": 0.788112024590373,
"num_tokens": 782597.0,
"step": 3100
},
{
"epoch": 99.832,
"grad_norm": 1.9347033500671387,
"learning_rate": 6.310544322427674e-06,
"loss": 0.8488,
"mean_token_accuracy": 0.8023913279175758,
"num_tokens": 799203.0,
"step": 3110
},
{
"epoch": 100.128,
"grad_norm": 2.046133279800415,
"learning_rate": 6.251984460600588e-06,
"loss": 0.7156,
"mean_token_accuracy": 0.7995543536302205,
"num_tokens": 813931.0,
"step": 3120
},
{
"epoch": 100.448,
"grad_norm": 2.557436943054199,
"learning_rate": 6.193573754867708e-06,
"loss": 0.7914,
"mean_token_accuracy": 0.8036689855158329,
"num_tokens": 830433.0,
"step": 3130
},
{
"epoch": 100.768,
"grad_norm": 2.666550636291504,
"learning_rate": 6.135314529742529e-06,
"loss": 0.8148,
"mean_token_accuracy": 0.79065520465374,
"num_tokens": 846129.0,
"step": 3140
},
{
"epoch": 101.064,
"grad_norm": 2.4647037982940674,
"learning_rate": 6.077209103710232e-06,
"loss": 0.8138,
"mean_token_accuracy": 0.7805173554130502,
"num_tokens": 860395.0,
"step": 3150
},
{
"epoch": 101.384,
"grad_norm": 1.9933632612228394,
"learning_rate": 6.019259789135404e-06,
"loss": 0.7916,
"mean_token_accuracy": 0.7982403136789799,
"num_tokens": 878034.0,
"step": 3160
},
{
"epoch": 101.704,
"grad_norm": 2.3307456970214844,
"learning_rate": 5.961468892170016e-06,
"loss": 0.7931,
"mean_token_accuracy": 0.7907839316874743,
"num_tokens": 892819.0,
"step": 3170
},
{
"epoch": 102.0,
"grad_norm": 4.047188758850098,
"learning_rate": 5.903838712661647e-06,
"loss": 0.7685,
"mean_token_accuracy": 0.7972375758596368,
"num_tokens": 908260.0,
"step": 3180
},
{
"epoch": 102.32,
"grad_norm": 1.9516690969467163,
"learning_rate": 5.846371544061962e-06,
"loss": 0.7943,
"mean_token_accuracy": 0.7980046071112156,
"num_tokens": 924521.0,
"step": 3190
},
{
"epoch": 102.64,
"grad_norm": 2.3500168323516846,
"learning_rate": 5.789069673335446e-06,
"loss": 0.7704,
"mean_token_accuracy": 0.8008730575442314,
"num_tokens": 940805.0,
"step": 3200
},
{
"epoch": 102.96,
"grad_norm": 1.9596396684646606,
"learning_rate": 5.731935380868381e-06,
"loss": 0.816,
"mean_token_accuracy": 0.7914111088961363,
"num_tokens": 957150.0,
"step": 3210
},
{
"epoch": 103.256,
"grad_norm": 2.2512779235839844,
"learning_rate": 5.674970940378102e-06,
"loss": 0.7422,
"mean_token_accuracy": 0.80112284542741,
"num_tokens": 970896.0,
"step": 3220
},
{
"epoch": 103.576,
"grad_norm": 2.6935369968414307,
"learning_rate": 5.618178618822512e-06,
"loss": 0.856,
"mean_token_accuracy": 0.7918466597795486,
"num_tokens": 986051.0,
"step": 3230
},
{
"epoch": 103.896,
"grad_norm": 2.1991372108459473,
"learning_rate": 5.561560676309874e-06,
"loss": 0.7392,
"mean_token_accuracy": 0.7981615476310253,
"num_tokens": 1001657.0,
"step": 3240
},
{
"epoch": 104.192,
"grad_norm": 2.4802706241607666,
"learning_rate": 5.505119366008847e-06,
"loss": 0.8709,
"mean_token_accuracy": 0.7797639261226397,
"num_tokens": 1018539.0,
"step": 3250
},
{
"epoch": 104.512,
"grad_norm": 2.416335105895996,
"learning_rate": 5.448856934058837e-06,
"loss": 0.7811,
"mean_token_accuracy": 0.802381145209074,
"num_tokens": 1035770.0,
"step": 3260
},
{
"epoch": 104.832,
"grad_norm": 1.9266993999481201,
"learning_rate": 5.392775619480606e-06,
"loss": 0.7597,
"mean_token_accuracy": 0.801979061216116,
"num_tokens": 1050287.0,
"step": 3270
},
{
"epoch": 105.128,
"grad_norm": 3.1635992527008057,
"learning_rate": 5.336877654087161e-06,
"loss": 0.8394,
"mean_token_accuracy": 0.7894677262048464,
"num_tokens": 1063888.0,
"step": 3280
},
{
"epoch": 105.448,
"grad_norm": 2.317321300506592,
"learning_rate": 5.281165262394938e-06,
"loss": 0.8313,
"mean_token_accuracy": 0.7858642000705004,
"num_tokens": 1080743.0,
"step": 3290
},
{
"epoch": 105.768,
"grad_norm": 1.9168007373809814,
"learning_rate": 5.2256406615353015e-06,
"loss": 0.8397,
"mean_token_accuracy": 0.7893419295549393,
"num_tokens": 1097525.0,
"step": 3300
},
{
"epoch": 106.064,
"grad_norm": 1.7733817100524902,
"learning_rate": 5.170306061166254e-06,
"loss": 0.6765,
"mean_token_accuracy": 0.8171853680868406,
"num_tokens": 1112336.0,
"step": 3310
},
{
"epoch": 106.384,
"grad_norm": 2.349670648574829,
"learning_rate": 5.115163663384563e-06,
"loss": 0.7588,
"mean_token_accuracy": 0.789124884083867,
"num_tokens": 1126428.0,
"step": 3320
},
{
"epoch": 106.704,
"grad_norm": 1.7135353088378906,
"learning_rate": 5.060215662638084e-06,
"loss": 0.7926,
"mean_token_accuracy": 0.7968744553625584,
"num_tokens": 1142993.0,
"step": 3330
},
{
"epoch": 107.0,
"grad_norm": 6.969696044921875,
"learning_rate": 5.005464245638447e-06,
"loss": 0.8879,
"mean_token_accuracy": 0.790745651399767,
"num_tokens": 1158725.0,
"step": 3340
},
{
"epoch": 107.32,
"grad_norm": 2.188507080078125,
"learning_rate": 4.9509115912740445e-06,
"loss": 0.7252,
"mean_token_accuracy": 0.8085566960275173,
"num_tokens": 1174330.0,
"step": 3350
},
{
"epoch": 107.64,
"grad_norm": 2.8108561038970947,
"learning_rate": 4.896559870523279e-06,
"loss": 0.887,
"mean_token_accuracy": 0.7786224085837603,
"num_tokens": 1188830.0,
"step": 3360
},
{
"epoch": 107.96,
"grad_norm": 2.438131093978882,
"learning_rate": 4.842411246368226e-06,
"loss": 0.795,
"mean_token_accuracy": 0.8030483074486255,
"num_tokens": 1207364.0,
"step": 3370
},
{
"epoch": 108.256,
"grad_norm": 2.4335777759552,
"learning_rate": 4.788467873708508e-06,
"loss": 0.8032,
"mean_token_accuracy": 0.7993817514664417,
"num_tokens": 1223655.0,
"step": 3380
},
{
"epoch": 108.576,
"grad_norm": 2.748537540435791,
"learning_rate": 4.734731899275557e-06,
"loss": 0.8288,
"mean_token_accuracy": 0.7918653458356857,
"num_tokens": 1238999.0,
"step": 3390
},
{
"epoch": 108.896,
"grad_norm": 2.6951160430908203,
"learning_rate": 4.681205461547187e-06,
"loss": 0.7515,
"mean_token_accuracy": 0.8007099393755197,
"num_tokens": 1253439.0,
"step": 3400
},
{
"epoch": 109.992,
"grad_norm": 2.926764726638794,
"learning_rate": 4.62789069066248e-06,
"loss": 0.8692,
"mean_token_accuracy": 0.7860011033713817,
"num_tokens": 16823.0,
"step": 3410
},
{
"epoch": 110.32,
"grad_norm": 2.0920846462249756,
"learning_rate": 4.574789708337018e-06,
"loss": 0.9181,
"mean_token_accuracy": 0.7867580187029954,
"num_tokens": 31381.0,
"step": 3420
},
{
"epoch": 110.64,
"grad_norm": 2.0056655406951904,
"learning_rate": 4.521904627778463e-06,
"loss": 0.7652,
"mean_token_accuracy": 0.801980373263359,
"num_tokens": 48922.0,
"step": 3430
},
{
"epoch": 110.96,
"grad_norm": 2.2784852981567383,
"learning_rate": 4.469237553602433e-06,
"loss": 0.7703,
"mean_token_accuracy": 0.7932860311120749,
"num_tokens": 65112.0,
"step": 3440
},
{
"epoch": 111.256,
"grad_norm": 2.1628239154815674,
"learning_rate": 4.416790581748766e-06,
"loss": 0.7087,
"mean_token_accuracy": 0.8054195183354456,
"num_tokens": 80534.0,
"step": 3450
},
{
"epoch": 111.576,
"grad_norm": 2.2064616680145264,
"learning_rate": 4.364565799398102e-06,
"loss": 0.7648,
"mean_token_accuracy": 0.7998192355036735,
"num_tokens": 97437.0,
"step": 3460
},
{
"epoch": 111.896,
"grad_norm": 2.387873888015747,
"learning_rate": 4.312565284888819e-06,
"loss": 0.81,
"mean_token_accuracy": 0.7902990553528071,
"num_tokens": 111663.0,
"step": 3470
},
{
"epoch": 112.192,
"grad_norm": 2.14197039604187,
"learning_rate": 4.2607911076343455e-06,
"loss": 0.7631,
"mean_token_accuracy": 0.7986521447027052,
"num_tokens": 127399.0,
"step": 3480
},
{
"epoch": 112.512,
"grad_norm": 2.459407091140747,
"learning_rate": 4.2092453280407605e-06,
"loss": 0.7647,
"mean_token_accuracy": 0.7940516691654921,
"num_tokens": 143237.0,
"step": 3490
},
{
"epoch": 112.832,
"grad_norm": 2.606563091278076,
"learning_rate": 4.157929997424853e-06,
"loss": 0.7958,
"mean_token_accuracy": 0.8024938710033893,
"num_tokens": 159484.0,
"step": 3500
},
{
"epoch": 113.128,
"grad_norm": 2.4822981357574463,
"learning_rate": 4.106847157932445e-06,
"loss": 0.7599,
"mean_token_accuracy": 0.8050464319216238,
"num_tokens": 172111.0,
"step": 3510
},
{
"epoch": 113.448,
"grad_norm": 2.0524537563323975,
"learning_rate": 4.0559988424571365e-06,
"loss": 0.7676,
"mean_token_accuracy": 0.797309584543109,
"num_tokens": 189542.0,
"step": 3520
},
{
"epoch": 113.768,
"grad_norm": 2.3543918132781982,
"learning_rate": 4.005387074559421e-06,
"loss": 0.8929,
"mean_token_accuracy": 0.783655048161745,
"num_tokens": 207117.0,
"step": 3530
},
{
"epoch": 114.064,
"grad_norm": 2.1995062828063965,
"learning_rate": 3.9550138683861184e-06,
"loss": 0.7502,
"mean_token_accuracy": 0.8102532826565407,
"num_tokens": 220475.0,
"step": 3540
},
{
"epoch": 114.384,
"grad_norm": 2.4602413177490234,
"learning_rate": 3.904881228590253e-06,
"loss": 0.7807,
"mean_token_accuracy": 0.7911395899951458,
"num_tokens": 235641.0,
"step": 3550
},
{
"epoch": 114.704,
"grad_norm": 1.7863750457763672,
"learning_rate": 3.854991150251271e-06,
"loss": 0.8174,
"mean_token_accuracy": 0.7878696542233229,
"num_tokens": 252557.0,
"step": 3560
},
{
"epoch": 115.0,
"grad_norm": 2.476003885269165,
"learning_rate": 3.8053456187956315e-06,
"loss": 0.8084,
"mean_token_accuracy": 0.7975163347012287,
"num_tokens": 267658.0,
"step": 3570
},
{
"epoch": 115.32,
"grad_norm": 2.536450147628784,
"learning_rate": 3.7559466099178e-06,
"loss": 0.8019,
"mean_token_accuracy": 0.7918349288403987,
"num_tokens": 283710.0,
"step": 3580
},
{
"epoch": 115.64,
"grad_norm": 1.9077178239822388,
"learning_rate": 3.7067960895016277e-06,
"loss": 0.8318,
"mean_token_accuracy": 0.7928959406912327,
"num_tokens": 299210.0,
"step": 3590
},
{
"epoch": 115.96,
"grad_norm": 1.9973210096359253,
"learning_rate": 3.6578960135421117e-06,
"loss": 0.7243,
"mean_token_accuracy": 0.8063468877226114,
"num_tokens": 315732.0,
"step": 3600
},
{
"epoch": 116.256,
"grad_norm": 2.727466344833374,
"learning_rate": 3.6092483280675683e-06,
"loss": 0.7522,
"mean_token_accuracy": 0.814963810347222,
"num_tokens": 329521.0,
"step": 3610
},
{
"epoch": 116.576,
"grad_norm": 5.467685222625732,
"learning_rate": 3.5608549690621562e-06,
"loss": 0.8147,
"mean_token_accuracy": 0.7913577631115913,
"num_tokens": 346404.0,
"step": 3620
},
{
"epoch": 116.896,
"grad_norm": 2.106499433517456,
"learning_rate": 3.512717862388876e-06,
"loss": 0.8025,
"mean_token_accuracy": 0.7868743006139993,
"num_tokens": 362461.0,
"step": 3630
},
{
"epoch": 117.192,
"grad_norm": 1.8872570991516113,
"learning_rate": 3.464838923712891e-06,
"loss": 0.8173,
"mean_token_accuracy": 0.7879228088501338,
"num_tokens": 378441.0,
"step": 3640
},
{
"epoch": 117.512,
"grad_norm": 2.7338979244232178,
"learning_rate": 3.4172200584253077e-06,
"loss": 0.7152,
"mean_token_accuracy": 0.8083719074726105,
"num_tokens": 392976.0,
"step": 3650
},
{
"epoch": 117.832,
"grad_norm": 2.1551365852355957,
"learning_rate": 3.369863161567363e-06,
"loss": 0.8131,
"mean_token_accuracy": 0.7942818276584148,
"num_tokens": 409377.0,
"step": 3660
},
{
"epoch": 118.128,
"grad_norm": 2.206017255783081,
"learning_rate": 3.322770117754963e-06,
"loss": 0.7659,
"mean_token_accuracy": 0.8010066038853413,
"num_tokens": 423629.0,
"step": 3670
},
{
"epoch": 118.448,
"grad_norm": 2.915071964263916,
"learning_rate": 3.2759428011037454e-06,
"loss": 0.7366,
"mean_token_accuracy": 0.7961924949660897,
"num_tokens": 438934.0,
"step": 3680
},
{
"epoch": 118.768,
"grad_norm": 2.040217876434326,
"learning_rate": 3.229383075154445e-06,
"loss": 0.7838,
"mean_token_accuracy": 0.8073360413312912,
"num_tokens": 456515.0,
"step": 3690
},
{
"epoch": 119.064,
"grad_norm": 2.2024896144866943,
"learning_rate": 3.18309279279876e-06,
"loss": 0.8328,
"mean_token_accuracy": 0.7914658515034495,
"num_tokens": 471476.0,
"step": 3700
},
{
"epoch": 119.384,
"grad_norm": 2.4495859146118164,
"learning_rate": 3.137073796205601e-06,
"loss": 0.8175,
"mean_token_accuracy": 0.8003936596214771,
"num_tokens": 486665.0,
"step": 3710
},
{
"epoch": 119.704,
"grad_norm": 2.1028919219970703,
"learning_rate": 3.0913279167477916e-06,
"loss": 0.8224,
"mean_token_accuracy": 0.8022113911807537,
"num_tokens": 504053.0,
"step": 3720
},
{
"epoch": 120.0,
"grad_norm": 4.113481521606445,
"learning_rate": 3.0458569749291743e-06,
"loss": 0.7272,
"mean_token_accuracy": 0.8031640326654589,
"num_tokens": 518123.0,
"step": 3730
},
{
"epoch": 120.32,
"grad_norm": 2.537456750869751,
"learning_rate": 3.000662780312178e-06,
"loss": 0.7771,
"mean_token_accuracy": 0.7974813230335712,
"num_tokens": 534453.0,
"step": 3740
},
{
"epoch": 120.64,
"grad_norm": 2.442348003387451,
"learning_rate": 2.9557471314457866e-06,
"loss": 0.7646,
"mean_token_accuracy": 0.8017565876245498,
"num_tokens": 549839.0,
"step": 3750
},
{
"epoch": 120.96,
"grad_norm": 1.9900853633880615,
"learning_rate": 2.9111118157939745e-06,
"loss": 0.8154,
"mean_token_accuracy": 0.8000222038477659,
"num_tokens": 566121.0,
"step": 3760
},
{
"epoch": 121.256,
"grad_norm": 2.138803005218506,
"learning_rate": 2.866758609664572e-06,
"loss": 0.7823,
"mean_token_accuracy": 0.8003743641279839,
"num_tokens": 581873.0,
"step": 3770
},
{
"epoch": 121.576,
"grad_norm": 2.2369134426116943,
"learning_rate": 2.8226892781385673e-06,
"loss": 0.8084,
"mean_token_accuracy": 0.7941499546170234,
"num_tokens": 597192.0,
"step": 3780
},
{
"epoch": 121.896,
"grad_norm": 2.110830783843994,
"learning_rate": 2.7789055749998863e-06,
"loss": 0.8422,
"mean_token_accuracy": 0.7905905708670616,
"num_tokens": 612868.0,
"step": 3790
},
{
"epoch": 122.192,
"grad_norm": 1.9145809412002563,
"learning_rate": 2.7354092426655565e-06,
"loss": 0.737,
"mean_token_accuracy": 0.8060021779021701,
"num_tokens": 628187.0,
"step": 3800
},
{
"epoch": 122.512,
"grad_norm": 2.281447649002075,
"learning_rate": 2.6922020121164182e-06,
"loss": 0.7426,
"mean_token_accuracy": 0.8067140795290471,
"num_tokens": 645037.0,
"step": 3810
},
{
"epoch": 122.832,
"grad_norm": 2.424180030822754,
"learning_rate": 2.6492856028281956e-06,
"loss": 0.7792,
"mean_token_accuracy": 0.8041324406862259,
"num_tokens": 660729.0,
"step": 3820
},
{
"epoch": 123.128,
"grad_norm": 2.1080105304718018,
"learning_rate": 2.606661722703084e-06,
"loss": 0.8608,
"mean_token_accuracy": 0.7844757274598688,
"num_tokens": 675782.0,
"step": 3830
},
{
"epoch": 123.448,
"grad_norm": 1.995034098625183,
"learning_rate": 2.5643320680018012e-06,
"loss": 0.7897,
"mean_token_accuracy": 0.8024365931749344,
"num_tokens": 693109.0,
"step": 3840
},
{
"epoch": 123.768,
"grad_norm": 2.0973665714263916,
"learning_rate": 2.522298323276039e-06,
"loss": 0.7599,
"mean_token_accuracy": 0.7944286055862904,
"num_tokens": 707242.0,
"step": 3850
},
{
"epoch": 124.064,
"grad_norm": 2.0112831592559814,
"learning_rate": 2.480562161301464e-06,
"loss": 0.7615,
"mean_token_accuracy": 0.8035188607267432,
"num_tokens": 721297.0,
"step": 3860
},
{
"epoch": 124.384,
"grad_norm": 2.4737842082977295,
"learning_rate": 2.4391252430111388e-06,
"loss": 0.8339,
"mean_token_accuracy": 0.7951025106012821,
"num_tokens": 738725.0,
"step": 3870
},
{
"epoch": 124.704,
"grad_norm": 2.4330711364746094,
"learning_rate": 2.3979892174294105e-06,
"loss": 0.8477,
"mean_token_accuracy": 0.7900912150740623,
"num_tokens": 754563.0,
"step": 3880
},
{
"epoch": 125.0,
"grad_norm": 5.609644412994385,
"learning_rate": 2.3571557216062967e-06,
"loss": 0.659,
"mean_token_accuracy": 0.8116728329175228,
"num_tokens": 768588.0,
"step": 3890
},
{
"epoch": 125.32,
"grad_norm": 2.646641254425049,
"learning_rate": 2.316626380552337e-06,
"loss": 0.7614,
"mean_token_accuracy": 0.8065689295530319,
"num_tokens": 785761.0,
"step": 3900
},
{
"epoch": 125.64,
"grad_norm": 2.1917035579681396,
"learning_rate": 2.2764028071739162e-06,
"loss": 0.7887,
"mean_token_accuracy": 0.7956257075071335,
"num_tokens": 800456.0,
"step": 3910
},
{
"epoch": 125.96,
"grad_norm": 2.0703041553497314,
"learning_rate": 2.236486602209097e-06,
"loss": 0.7823,
"mean_token_accuracy": 0.7975949931889772,
"num_tokens": 817034.0,
"step": 3920
},
{
"epoch": 126.256,
"grad_norm": 3.7488138675689697,
"learning_rate": 2.1968793541638877e-06,
"loss": 0.7804,
"mean_token_accuracy": 0.7973396097486084,
"num_tokens": 831010.0,
"step": 3930
},
{
"epoch": 126.576,
"grad_norm": 1.8520132303237915,
"learning_rate": 2.1575826392490507e-06,
"loss": 0.8349,
"mean_token_accuracy": 0.7970767199993134,
"num_tokens": 847034.0,
"step": 3940
},
{
"epoch": 126.896,
"grad_norm": 2.562690258026123,
"learning_rate": 2.118598021317362e-06,
"loss": 0.7205,
"mean_token_accuracy": 0.8056481756269932,
"num_tokens": 862562.0,
"step": 3950
},
{
"epoch": 127.192,
"grad_norm": 2.3542673587799072,
"learning_rate": 2.07992705180138e-06,
"loss": 0.7441,
"mean_token_accuracy": 0.7991788826278738,
"num_tokens": 876729.0,
"step": 3960
},
{
"epoch": 127.512,
"grad_norm": 2.78625750541687,
"learning_rate": 2.0415712696517155e-06,
"loss": 0.8173,
"mean_token_accuracy": 0.7964733302593231,
"num_tokens": 895143.0,
"step": 3970
},
{
"epoch": 127.832,
"grad_norm": 2.283376455307007,
"learning_rate": 2.00353220127576e-06,
"loss": 0.7863,
"mean_token_accuracy": 0.8018839418888092,
"num_tokens": 911917.0,
"step": 3980
},
{
"epoch": 128.128,
"grad_norm": 2.6434993743896484,
"learning_rate": 1.965811360476967e-06,
"loss": 0.784,
"mean_token_accuracy": 0.7953068952302675,
"num_tokens": 924458.0,
"step": 3990
},
{
"epoch": 128.448,
"grad_norm": 2.869091033935547,
"learning_rate": 1.9284102483946042e-06,
"loss": 0.7912,
"mean_token_accuracy": 0.7926684629172087,
"num_tokens": 939111.0,
"step": 4000
},
{
"epoch": 128.768,
"grad_norm": 1.8905816078186035,
"learning_rate": 1.8913303534440019e-06,
"loss": 0.7592,
"mean_token_accuracy": 0.8064320608973503,
"num_tokens": 956948.0,
"step": 4010
},
{
"epoch": 129.064,
"grad_norm": 2.6615140438079834,
"learning_rate": 1.8545731512573317e-06,
"loss": 0.725,
"mean_token_accuracy": 0.8079512683120934,
"num_tokens": 973125.0,
"step": 4020
},
{
"epoch": 129.384,
"grad_norm": 1.934066653251648,
"learning_rate": 1.8181401046248748e-06,
"loss": 0.8115,
"mean_token_accuracy": 0.7953744523227215,
"num_tokens": 988824.0,
"step": 4030
},
{
"epoch": 129.704,
"grad_norm": 2.660801410675049,
"learning_rate": 1.7820326634368124e-06,
"loss": 0.7548,
"mean_token_accuracy": 0.8010114066302776,
"num_tokens": 1003868.0,
"step": 4040
},
{
"epoch": 130.0,
"grad_norm": 5.5373663902282715,
"learning_rate": 1.7462522646255319e-06,
"loss": 0.8302,
"mean_token_accuracy": 0.7884616557810757,
"num_tokens": 1019053.0,
"step": 4050
},
{
"epoch": 130.32,
"grad_norm": 2.5071728229522705,
"learning_rate": 1.7108003321084299e-06,
"loss": 0.7475,
"mean_token_accuracy": 0.80089957639575,
"num_tokens": 1035776.0,
"step": 4060
},
{
"epoch": 130.64,
"grad_norm": 3.022608518600464,
"learning_rate": 1.675678276731253e-06,
"loss": 0.7242,
"mean_token_accuracy": 0.7966047372668982,
"num_tokens": 1049970.0,
"step": 4070
},
{
"epoch": 130.96,
"grad_norm": 2.2591309547424316,
"learning_rate": 1.6408874962119526e-06,
"loss": 0.8193,
"mean_token_accuracy": 0.7990051701664924,
"num_tokens": 1066724.0,
"step": 4080
},
{
"epoch": 131.256,
"grad_norm": 1.9211347103118896,
"learning_rate": 1.606429375085058e-06,
"loss": 0.8557,
"mean_token_accuracy": 0.7852608090316927,
"num_tokens": 1081511.0,
"step": 4090
},
{
"epoch": 131.576,
"grad_norm": 2.843675136566162,
"learning_rate": 1.572305284646587e-06,
"loss": 0.744,
"mean_token_accuracy": 0.8146931059658528,
"num_tokens": 1097216.0,
"step": 4100
},
{
"epoch": 131.896,
"grad_norm": 1.981909990310669,
"learning_rate": 1.538516582899453e-06,
"loss": 0.7557,
"mean_token_accuracy": 0.80064931884408,
"num_tokens": 1114422.0,
"step": 4110
},
{
"epoch": 132.192,
"grad_norm": 2.5102710723876953,
"learning_rate": 1.505064614499443e-06,
"loss": 0.9181,
"mean_token_accuracy": 0.7895874824072864,
"num_tokens": 1130322.0,
"step": 4120
},
{
"epoch": 132.512,
"grad_norm": 2.168246269226074,
"learning_rate": 1.4719507107017005e-06,
"loss": 0.7341,
"mean_token_accuracy": 0.8004750736057759,
"num_tokens": 1145434.0,
"step": 4130
},
{
"epoch": 132.832,
"grad_norm": 2.0837020874023438,
"learning_rate": 1.439176189307735e-06,
"loss": 0.7871,
"mean_token_accuracy": 0.8010189373046159,
"num_tokens": 1161082.0,
"step": 4140
},
{
"epoch": 133.128,
"grad_norm": 2.729126453399658,
"learning_rate": 1.406742354613e-06,
"loss": 0.7355,
"mean_token_accuracy": 0.8093010356297364,
"num_tokens": 1176663.0,
"step": 4150
},
{
"epoch": 133.448,
"grad_norm": 2.5754497051239014,
"learning_rate": 1.3746504973549613e-06,
"loss": 0.7523,
"mean_token_accuracy": 0.8098891712725163,
"num_tokens": 1192788.0,
"step": 4160
},
{
"epoch": 133.768,
"grad_norm": 3.8127379417419434,
"learning_rate": 1.34290189466175e-06,
"loss": 0.7298,
"mean_token_accuracy": 0.7972260743379593,
"num_tokens": 1207918.0,
"step": 4170
},
{
"epoch": 134.064,
"grad_norm": 2.3844289779663086,
"learning_rate": 1.3114978100013376e-06,
"loss": 0.8014,
"mean_token_accuracy": 0.7910445159351503,
"num_tokens": 1222595.0,
"step": 4180
},
{
"epoch": 134.384,
"grad_norm": 2.5769917964935303,
"learning_rate": 1.2804394931312446e-06,
"loss": 0.6971,
"mean_token_accuracy": 0.8134579740464687,
"num_tokens": 1238530.0,
"step": 4190
},
{
"epoch": 134.704,
"grad_norm": 2.293607234954834,
"learning_rate": 1.2497281800488092e-06,
"loss": 0.8715,
"mean_token_accuracy": 0.7885241828858852,
"num_tokens": 1255860.0,
"step": 4200
},
{
"epoch": 135.0,
"grad_norm": 9.035529136657715,
"learning_rate": 1.219365092942003e-06,
"loss": 0.7438,
"mean_token_accuracy": 0.7974654679765573,
"num_tokens": 1269518.0,
"step": 4210
},
{
"epoch": 135.32,
"grad_norm": 2.2957992553710938,
"learning_rate": 1.189351440140788e-06,
"loss": 0.8218,
"mean_token_accuracy": 0.7884160943329335,
"num_tokens": 1285778.0,
"step": 4220
},
{
"epoch": 135.64,
"grad_norm": 2.6251449584960938,
"learning_rate": 1.159688416069038e-06,
"loss": 0.7602,
"mean_token_accuracy": 0.8091854326426983,
"num_tokens": 1301508.0,
"step": 4230
},
{
"epoch": 135.96,
"grad_norm": 2.2512965202331543,
"learning_rate": 1.1303772011969928e-06,
"loss": 0.7572,
"mean_token_accuracy": 0.8062782268971205,
"num_tokens": 1317471.0,
"step": 4240
},
{
"epoch": 136.256,
"grad_norm": 2.4756977558135986,
"learning_rate": 1.1014189619942905e-06,
"loss": 0.7439,
"mean_token_accuracy": 0.8019913362490164,
"num_tokens": 1333505.0,
"step": 4250
},
{
"epoch": 136.576,
"grad_norm": 3.272853374481201,
"learning_rate": 1.0728148508835424e-06,
"loss": 0.8706,
"mean_token_accuracy": 0.7980553403496742,
"num_tokens": 1349049.0,
"step": 4260
},
{
"epoch": 136.896,
"grad_norm": 2.3439319133758545,
"learning_rate": 1.0445660061944684e-06,
"loss": 0.7531,
"mean_token_accuracy": 0.7939603064209223,
"num_tokens": 1365036.0,
"step": 4270
},
{
"epoch": 137.192,
"grad_norm": 2.6307787895202637,
"learning_rate": 1.01667355211861e-06,
"loss": 0.7414,
"mean_token_accuracy": 0.8034155070781708,
"num_tokens": 1379248.0,
"step": 4280
},
{
"epoch": 137.512,
"grad_norm": 2.3606624603271484,
"learning_rate": 9.891385986645675e-07,
"loss": 0.6851,
"mean_token_accuracy": 0.8086177695542573,
"num_tokens": 1394888.0,
"step": 4290
},
{
"epoch": 137.832,
"grad_norm": 2.0615415573120117,
"learning_rate": 9.619622416138475e-07,
"loss": 0.8681,
"mean_token_accuracy": 0.7964320003986358,
"num_tokens": 1410073.0,
"step": 4300
},
{
"epoch": 138.128,
"grad_norm": 2.324324131011963,
"learning_rate": 9.351455624772487e-07,
"loss": 0.7584,
"mean_token_accuracy": 0.8120040643859554,
"num_tokens": 1427006.0,
"step": 4310
},
{
"epoch": 138.448,
"grad_norm": 2.1296613216400146,
"learning_rate": 9.086896284518198e-07,
"loss": 0.7939,
"mean_token_accuracy": 0.802574060857296,
"num_tokens": 1442836.0,
"step": 4320
},
{
"epoch": 138.768,
"grad_norm": 2.3658032417297363,
"learning_rate": 8.825954923783875e-07,
"loss": 0.8059,
"mean_token_accuracy": 0.7980705320835113,
"num_tokens": 1457625.0,
"step": 4330
},
{
"epoch": 139.064,
"grad_norm": 2.4179608821868896,
"learning_rate": 8.568641926996646e-07,
"loss": 0.8155,
"mean_token_accuracy": 0.7932797682446402,
"num_tokens": 1473259.0,
"step": 4340
},
{
"epoch": 139.384,
"grad_norm": 2.3579256534576416,
"learning_rate": 8.314967534189166e-07,
"loss": 0.8503,
"mean_token_accuracy": 0.7952963810414075,
"num_tokens": 1490309.0,
"step": 4350
},
{
"epoch": 139.704,
"grad_norm": 2.3982760906219482,
"learning_rate": 8.064941840592178e-07,
"loss": 0.6937,
"mean_token_accuracy": 0.8161114897578955,
"num_tokens": 1505580.0,
"step": 4360
},
{
"epoch": 140.0,
"grad_norm": 3.710239887237549,
"learning_rate": 7.818574796232714e-07,
"loss": 0.776,
"mean_token_accuracy": 0.789946156579095,
"num_tokens": 1519983.0,
"step": 4370
},
{
"epoch": 140.32,
"grad_norm": 2.348114013671875,
"learning_rate": 7.575876205538113e-07,
"loss": 0.8371,
"mean_token_accuracy": 0.7868118450045586,
"num_tokens": 1535154.0,
"step": 4380
},
{
"epoch": 140.64,
"grad_norm": 1.7529124021530151,
"learning_rate": 7.336855726945891e-07,
"loss": 0.8106,
"mean_token_accuracy": 0.7890769924968482,
"num_tokens": 1552288.0,
"step": 4390
},
{
"epoch": 140.96,
"grad_norm": 2.1464691162109375,
"learning_rate": 7.101522872519306e-07,
"loss": 0.7481,
"mean_token_accuracy": 0.8147139415144921,
"num_tokens": 1567833.0,
"step": 4400
},
{
"epoch": 8.304941176470589,
"grad_norm": 3.626028537750244,
"learning_rate": 1.6054562751771983e-05,
"loss": 1.8624,
"mean_token_accuracy": 0.42639462910592557,
"num_tokens": 12548.0,
"step": 4410
},
{
"epoch": 8.323764705882352,
"grad_norm": 2.1800546646118164,
"learning_rate": 1.6037858352792722e-05,
"loss": 1.5835,
"mean_token_accuracy": 0.4775951974093914,
"num_tokens": 25755.0,
"step": 4420
},
{
"epoch": 8.342588235294118,
"grad_norm": 1.870821475982666,
"learning_rate": 1.602112739804461e-05,
"loss": 1.4872,
"mean_token_accuracy": 0.48393381759524345,
"num_tokens": 38667.0,
"step": 4430
},
{
"epoch": 8.361411764705883,
"grad_norm": 1.9865084886550903,
"learning_rate": 1.6004369961113897e-05,
"loss": 1.4383,
"mean_token_accuracy": 0.4954090975224972,
"num_tokens": 51649.0,
"step": 4440
},
{
"epoch": 8.380235294117647,
"grad_norm": 1.759982943534851,
"learning_rate": 1.5987586115703306e-05,
"loss": 1.4358,
"mean_token_accuracy": 0.5117561783641577,
"num_tokens": 66035.0,
"step": 4450
},
{
"epoch": 8.399058823529412,
"grad_norm": 1.9075089693069458,
"learning_rate": 1.5970775935631717e-05,
"loss": 1.3555,
"mean_token_accuracy": 0.5182104598730802,
"num_tokens": 79576.0,
"step": 4460
},
{
"epoch": 8.417882352941177,
"grad_norm": 1.3230409622192383,
"learning_rate": 1.5953939494833832e-05,
"loss": 1.3668,
"mean_token_accuracy": 0.519798369705677,
"num_tokens": 92225.0,
"step": 4470
},
{
"epoch": 8.43670588235294,
"grad_norm": 2.239945650100708,
"learning_rate": 1.5937076867359852e-05,
"loss": 1.3048,
"mean_token_accuracy": 0.5348641883581877,
"num_tokens": 105922.0,
"step": 4480
},
{
"epoch": 8.455529411764706,
"grad_norm": 1.7611688375473022,
"learning_rate": 1.5920188127375152e-05,
"loss": 1.3466,
"mean_token_accuracy": 0.521543862298131,
"num_tokens": 119527.0,
"step": 4490
},
{
"epoch": 8.47435294117647,
"grad_norm": 1.486075520515442,
"learning_rate": 1.5903273349159958e-05,
"loss": 1.3115,
"mean_token_accuracy": 0.5352868799120187,
"num_tokens": 133451.0,
"step": 4500
},
{
"epoch": 8.493176470588235,
"grad_norm": 1.6955538988113403,
"learning_rate": 1.5886332607109017e-05,
"loss": 1.3427,
"mean_token_accuracy": 0.5247942265123129,
"num_tokens": 147565.0,
"step": 4510
},
{
"epoch": 8.512,
"grad_norm": 1.5570602416992188,
"learning_rate": 1.5869365975731267e-05,
"loss": 1.2547,
"mean_token_accuracy": 0.5451988846063613,
"num_tokens": 160377.0,
"step": 4520
},
{
"epoch": 8.530823529411764,
"grad_norm": 1.4915376901626587,
"learning_rate": 1.585237352964952e-05,
"loss": 1.358,
"mean_token_accuracy": 0.526292197033763,
"num_tokens": 174242.0,
"step": 4530
},
{
"epoch": 8.54964705882353,
"grad_norm": 1.603037714958191,
"learning_rate": 1.583535534360012e-05,
"loss": 1.2699,
"mean_token_accuracy": 0.5363341204822063,
"num_tokens": 187399.0,
"step": 4540
},
{
"epoch": 8.568470588235295,
"grad_norm": 2.1536943912506104,
"learning_rate": 1.581831149243262e-05,
"loss": 1.2976,
"mean_token_accuracy": 0.5274909067898989,
"num_tokens": 200947.0,
"step": 4550
},
{
"epoch": 8.587294117647058,
"grad_norm": 1.611542820930481,
"learning_rate": 1.580124205110946e-05,
"loss": 1.2672,
"mean_token_accuracy": 0.5402051657438278,
"num_tokens": 214010.0,
"step": 4560
},
{
"epoch": 8.606117647058824,
"grad_norm": 1.5133346319198608,
"learning_rate": 1.578414709470562e-05,
"loss": 1.3097,
"mean_token_accuracy": 0.5325882468372584,
"num_tokens": 227829.0,
"step": 4570
},
{
"epoch": 8.624941176470589,
"grad_norm": 1.4743294715881348,
"learning_rate": 1.576702669840832e-05,
"loss": 1.2504,
"mean_token_accuracy": 0.5380570895969867,
"num_tokens": 240838.0,
"step": 4580
},
{
"epoch": 8.643764705882353,
"grad_norm": 1.4610170125961304,
"learning_rate": 1.5749880937516647e-05,
"loss": 1.2727,
"mean_token_accuracy": 0.5317132595926524,
"num_tokens": 253845.0,
"step": 4590
},
{
"epoch": 8.662588235294118,
"grad_norm": 1.8520996570587158,
"learning_rate": 1.573270988744126e-05,
"loss": 1.2746,
"mean_token_accuracy": 0.5325201127678156,
"num_tokens": 266058.0,
"step": 4600
},
{
"epoch": 8.681411764705881,
"grad_norm": 2.241778612136841,
"learning_rate": 1.5715513623704052e-05,
"loss": 1.2703,
"mean_token_accuracy": 0.5311647448688745,
"num_tokens": 279149.0,
"step": 4610
},
{
"epoch": 8.700235294117647,
"grad_norm": 1.921618938446045,
"learning_rate": 1.5698292221937787e-05,
"loss": 1.2823,
"mean_token_accuracy": 0.5341210236772895,
"num_tokens": 293451.0,
"step": 4620
},
{
"epoch": 8.719058823529412,
"grad_norm": 1.5892717838287354,
"learning_rate": 1.5681045757885817e-05,
"loss": 1.2531,
"mean_token_accuracy": 0.5422347262501717,
"num_tokens": 306138.0,
"step": 4630
},
{
"epoch": 8.737882352941176,
"grad_norm": 1.6042686700820923,
"learning_rate": 1.566377430740171e-05,
"loss": 1.2764,
"mean_token_accuracy": 0.5341788738965988,
"num_tokens": 319399.0,
"step": 4640
},
{
"epoch": 8.756705882352941,
"grad_norm": 2.2580060958862305,
"learning_rate": 1.5646477946448927e-05,
"loss": 1.2348,
"mean_token_accuracy": 0.5448929745703935,
"num_tokens": 332882.0,
"step": 4650
},
{
"epoch": 8.775529411764706,
"grad_norm": 1.2103066444396973,
"learning_rate": 1.5629156751100502e-05,
"loss": 1.2542,
"mean_token_accuracy": 0.545468046143651,
"num_tokens": 345343.0,
"step": 4660
},
{
"epoch": 8.79435294117647,
"grad_norm": 0.8362689018249512,
"learning_rate": 1.561181079753868e-05,
"loss": 1.2827,
"mean_token_accuracy": 0.5429604861885309,
"num_tokens": 358912.0,
"step": 4670
},
{
"epoch": 8.813176470588235,
"grad_norm": 1.158046841621399,
"learning_rate": 1.5594440162054615e-05,
"loss": 1.2471,
"mean_token_accuracy": 0.5337832469493151,
"num_tokens": 372248.0,
"step": 4680
},
{
"epoch": 8.832,
"grad_norm": 1.3598729372024536,
"learning_rate": 1.557704492104801e-05,
"loss": 1.3124,
"mean_token_accuracy": 0.5272687204182148,
"num_tokens": 386263.0,
"step": 4690
},
{
"epoch": 8.850823529411764,
"grad_norm": 1.7355713844299316,
"learning_rate": 1.5559625151026785e-05,
"loss": 1.3023,
"mean_token_accuracy": 0.524540626257658,
"num_tokens": 399314.0,
"step": 4700
},
{
"epoch": 8.86964705882353,
"grad_norm": 1.342244267463684,
"learning_rate": 1.5542180928606747e-05,
"loss": 1.2199,
"mean_token_accuracy": 0.5468841027468443,
"num_tokens": 413612.0,
"step": 4710
},
{
"epoch": 8.888470588235293,
"grad_norm": 1.322409987449646,
"learning_rate": 1.5524712330511246e-05,
"loss": 1.2383,
"mean_token_accuracy": 0.5588106140494347,
"num_tokens": 427389.0,
"step": 4720
},
{
"epoch": 8.907294117647059,
"grad_norm": 1.3516113758087158,
"learning_rate": 1.5507219433570848e-05,
"loss": 1.2482,
"mean_token_accuracy": 0.5358951542526483,
"num_tokens": 440751.0,
"step": 4730
},
{
"epoch": 8.926117647058824,
"grad_norm": 1.5260019302368164,
"learning_rate": 1.5489702314722986e-05,
"loss": 1.2168,
"mean_token_accuracy": 0.5595146797597408,
"num_tokens": 453892.0,
"step": 4740
},
{
"epoch": 8.944941176470588,
"grad_norm": 1.5382399559020996,
"learning_rate": 1.547216105101162e-05,
"loss": 1.2772,
"mean_token_accuracy": 0.531356817483902,
"num_tokens": 468069.0,
"step": 4750
},
{
"epoch": 8.963764705882353,
"grad_norm": 1.362877368927002,
"learning_rate": 1.5454595719586926e-05,
"loss": 1.2325,
"mean_token_accuracy": 0.5457029201090335,
"num_tokens": 480208.0,
"step": 4760
},
{
"epoch": 8.982588235294118,
"grad_norm": 1.0237706899642944,
"learning_rate": 1.543700639770491e-05,
"loss": 1.2282,
"mean_token_accuracy": 0.542092502117157,
"num_tokens": 493653.0,
"step": 4770
},
{
"epoch": 9.001882352941177,
"grad_norm": 3.654766082763672,
"learning_rate": 1.5419393162727105e-05,
"loss": 1.3508,
"mean_token_accuracy": 0.5442763832284183,
"num_tokens": 507301.0,
"step": 4780
},
{
"epoch": 9.02070588235294,
"grad_norm": 1.20900297164917,
"learning_rate": 1.5401756092120215e-05,
"loss": 1.2509,
"mean_token_accuracy": 0.5424667615443468,
"num_tokens": 520131.0,
"step": 4790
},
{
"epoch": 9.039529411764706,
"grad_norm": 1.2988379001617432,
"learning_rate": 1.5384095263455782e-05,
"loss": 1.2669,
"mean_token_accuracy": 0.5415636003017426,
"num_tokens": 533609.0,
"step": 4800
},
{
"epoch": 9.058352941176471,
"grad_norm": 2.150287628173828,
"learning_rate": 1.5366410754409837e-05,
"loss": 1.2693,
"mean_token_accuracy": 0.5377780050039291,
"num_tokens": 547255.0,
"step": 4810
},
{
"epoch": 9.077176470588235,
"grad_norm": 1.0066241025924683,
"learning_rate": 1.5348702642762563e-05,
"loss": 1.2117,
"mean_token_accuracy": 0.5584665209054946,
"num_tokens": 560812.0,
"step": 4820
},
{
"epoch": 9.096,
"grad_norm": 1.0327008962631226,
"learning_rate": 1.5330971006397962e-05,
"loss": 1.1753,
"mean_token_accuracy": 0.5651697292923927,
"num_tokens": 574553.0,
"step": 4830
},
{
"epoch": 9.114823529411765,
"grad_norm": 1.200286865234375,
"learning_rate": 1.5313215923303482e-05,
"loss": 1.2833,
"mean_token_accuracy": 0.5195233155041933,
"num_tokens": 587992.0,
"step": 4840
},
{
"epoch": 9.133647058823529,
"grad_norm": 0.9596078991889954,
"learning_rate": 1.5295437471569714e-05,
"loss": 1.2403,
"mean_token_accuracy": 0.538974242284894,
"num_tokens": 602053.0,
"step": 4850
},
{
"epoch": 9.152470588235294,
"grad_norm": 1.0736156702041626,
"learning_rate": 1.5277635729390022e-05,
"loss": 1.2346,
"mean_token_accuracy": 0.5428169660270215,
"num_tokens": 616596.0,
"step": 4860
},
{
"epoch": 9.171294117647058,
"grad_norm": 1.773108959197998,
"learning_rate": 1.5259810775060202e-05,
"loss": 1.2516,
"mean_token_accuracy": 0.5292404495179653,
"num_tokens": 629154.0,
"step": 4870
},
{
"epoch": 9.190117647058823,
"grad_norm": 1.4007513523101807,
"learning_rate": 1.524196268697815e-05,
"loss": 1.1987,
"mean_token_accuracy": 0.5568405143916607,
"num_tokens": 641946.0,
"step": 4880
},
{
"epoch": 9.208941176470589,
"grad_norm": 1.3242895603179932,
"learning_rate": 1.5224091543643504e-05,
"loss": 1.2781,
"mean_token_accuracy": 0.5264579340815544,
"num_tokens": 655771.0,
"step": 4890
},
{
"epoch": 9.227764705882352,
"grad_norm": 1.3015270233154297,
"learning_rate": 1.52061974236573e-05,
"loss": 1.1972,
"mean_token_accuracy": 0.5521455116569995,
"num_tokens": 669074.0,
"step": 4900
},
{
"epoch": 9.246588235294118,
"grad_norm": 1.4676063060760498,
"learning_rate": 1.5188280405721643e-05,
"loss": 1.2169,
"mean_token_accuracy": 0.5410921085625887,
"num_tokens": 682391.0,
"step": 4910
},
{
"epoch": 9.265411764705883,
"grad_norm": 1.505129098892212,
"learning_rate": 1.5170340568639335e-05,
"loss": 1.2445,
"mean_token_accuracy": 0.5468276925384998,
"num_tokens": 695279.0,
"step": 4920
},
{
"epoch": 9.284235294117646,
"grad_norm": 1.4586368799209595,
"learning_rate": 1.5152377991313547e-05,
"loss": 1.2183,
"mean_token_accuracy": 0.5493371106684208,
"num_tokens": 709036.0,
"step": 4930
},
{
"epoch": 9.303058823529412,
"grad_norm": 1.3103828430175781,
"learning_rate": 1.5134392752747469e-05,
"loss": 1.2207,
"mean_token_accuracy": 0.5371036138385534,
"num_tokens": 721600.0,
"step": 4940
},
{
"epoch": 9.321882352941177,
"grad_norm": 1.406219720840454,
"learning_rate": 1.5116384932043953e-05,
"loss": 1.2197,
"mean_token_accuracy": 0.5394637104123831,
"num_tokens": 734972.0,
"step": 4950
},
{
"epoch": 9.34070588235294,
"grad_norm": 1.3175715208053589,
"learning_rate": 1.5098354608405177e-05,
"loss": 1.3009,
"mean_token_accuracy": 0.5217017080634833,
"num_tokens": 749524.0,
"step": 4960
},
{
"epoch": 9.359529411764706,
"grad_norm": 1.1799266338348389,
"learning_rate": 1.5080301861132291e-05,
"loss": 1.233,
"mean_token_accuracy": 0.5553332667797803,
"num_tokens": 763976.0,
"step": 4970
},
{
"epoch": 9.378352941176471,
"grad_norm": 1.2330571413040161,
"learning_rate": 1.5062226769625068e-05,
"loss": 1.2127,
"mean_token_accuracy": 0.5426539558917284,
"num_tokens": 777548.0,
"step": 4980
},
{
"epoch": 9.397176470588235,
"grad_norm": 1.3530794382095337,
"learning_rate": 1.5044129413381551e-05,
"loss": 1.2137,
"mean_token_accuracy": 0.5432845208793878,
"num_tokens": 791104.0,
"step": 4990
},
{
"epoch": 9.416,
"grad_norm": 1.174985647201538,
"learning_rate": 1.5026009871997725e-05,
"loss": 1.1936,
"mean_token_accuracy": 0.5486832950264215,
"num_tokens": 804784.0,
"step": 5000
},
{
"epoch": 9.434823529411764,
"grad_norm": 0.9708495140075684,
"learning_rate": 1.5007868225167124e-05,
"loss": 1.2447,
"mean_token_accuracy": 0.5287159774452448,
"num_tokens": 817605.0,
"step": 5010
},
{
"epoch": 9.45364705882353,
"grad_norm": 1.4748586416244507,
"learning_rate": 1.4989704552680527e-05,
"loss": 1.1782,
"mean_token_accuracy": 0.5548595078289509,
"num_tokens": 830334.0,
"step": 5020
},
{
"epoch": 9.472470588235295,
"grad_norm": 1.4649749994277954,
"learning_rate": 1.497151893442558e-05,
"loss": 1.1558,
"mean_token_accuracy": 0.5786185275763274,
"num_tokens": 843520.0,
"step": 5030
},
{
"epoch": 9.491294117647058,
"grad_norm": 1.3614012002944946,
"learning_rate": 1.4953311450386447e-05,
"loss": 1.2294,
"mean_token_accuracy": 0.5436280608177185,
"num_tokens": 856605.0,
"step": 5040
},
{
"epoch": 9.510117647058824,
"grad_norm": 0.8162552714347839,
"learning_rate": 1.493508218064347e-05,
"loss": 1.1795,
"mean_token_accuracy": 0.5606917165219784,
"num_tokens": 869281.0,
"step": 5050
},
{
"epoch": 9.528941176470589,
"grad_norm": 1.1542294025421143,
"learning_rate": 1.4916831205372803e-05,
"loss": 1.283,
"mean_token_accuracy": 0.539304967597127,
"num_tokens": 883498.0,
"step": 5060
},
{
"epoch": 9.547764705882352,
"grad_norm": 1.3006714582443237,
"learning_rate": 1.4898558604846067e-05,
"loss": 1.2342,
"mean_token_accuracy": 0.5408715981990099,
"num_tokens": 897313.0,
"step": 5070
},
{
"epoch": 9.566588235294118,
"grad_norm": 0.9996142983436584,
"learning_rate": 1.488026445943e-05,
"loss": 1.2156,
"mean_token_accuracy": 0.5489041075110436,
"num_tokens": 910640.0,
"step": 5080
},
{
"epoch": 9.585411764705881,
"grad_norm": 2.1211931705474854,
"learning_rate": 1.486194884958609e-05,
"loss": 1.1633,
"mean_token_accuracy": 0.5579564660787583,
"num_tokens": 923363.0,
"step": 5090
},
{
"epoch": 9.604235294117647,
"grad_norm": 1.2634146213531494,
"learning_rate": 1.4843611855870235e-05,
"loss": 1.2593,
"mean_token_accuracy": 0.5273831244558096,
"num_tokens": 936250.0,
"step": 5100
},
{
"epoch": 9.623058823529412,
"grad_norm": 1.7456119060516357,
"learning_rate": 1.4825253558932386e-05,
"loss": 1.2228,
"mean_token_accuracy": 0.5505132492631674,
"num_tokens": 949552.0,
"step": 5110
},
{
"epoch": 9.641882352941176,
"grad_norm": 1.605895757675171,
"learning_rate": 1.480687403951619e-05,
"loss": 1.1788,
"mean_token_accuracy": 0.5624800592660903,
"num_tokens": 963342.0,
"step": 5120
},
{
"epoch": 9.660705882352941,
"grad_norm": 1.3311768770217896,
"learning_rate": 1.4788473378458626e-05,
"loss": 1.2062,
"mean_token_accuracy": 0.5582063946872949,
"num_tokens": 976717.0,
"step": 5130
},
{
"epoch": 9.679529411764706,
"grad_norm": 1.4497061967849731,
"learning_rate": 1.4770051656689672e-05,
"loss": 1.228,
"mean_token_accuracy": 0.5460193831473589,
"num_tokens": 989772.0,
"step": 5140
},
{
"epoch": 9.69835294117647,
"grad_norm": 1.1696816682815552,
"learning_rate": 1.4751608955231924e-05,
"loss": 1.1884,
"mean_token_accuracy": 0.5445575587451458,
"num_tokens": 1003123.0,
"step": 5150
},
{
"epoch": 9.717176470588235,
"grad_norm": 0.9232364892959595,
"learning_rate": 1.4733145355200255e-05,
"loss": 1.152,
"mean_token_accuracy": 0.5746063582599163,
"num_tokens": 1016187.0,
"step": 5160
},
{
"epoch": 9.736,
"grad_norm": 1.6106712818145752,
"learning_rate": 1.4714660937801461e-05,
"loss": 1.1799,
"mean_token_accuracy": 0.5663762982934714,
"num_tokens": 1029873.0,
"step": 5170
},
{
"epoch": 9.754823529411764,
"grad_norm": 1.334657073020935,
"learning_rate": 1.4696155784333885e-05,
"loss": 1.1942,
"mean_token_accuracy": 0.5546817529946566,
"num_tokens": 1043425.0,
"step": 5180
},
{
"epoch": 9.77364705882353,
"grad_norm": 0.8071675896644592,
"learning_rate": 1.467762997618708e-05,
"loss": 1.2465,
"mean_token_accuracy": 0.535656175762415,
"num_tokens": 1057319.0,
"step": 5190
},
{
"epoch": 9.792470588235295,
"grad_norm": 1.1653850078582764,
"learning_rate": 1.465908359484144e-05,
"loss": 1.2336,
"mean_token_accuracy": 0.5504809945821763,
"num_tokens": 1070725.0,
"step": 5200
},
{
"epoch": 9.811294117647058,
"grad_norm": 1.1270978450775146,
"learning_rate": 1.4640516721867843e-05,
"loss": 1.1989,
"mean_token_accuracy": 0.5558116808533669,
"num_tokens": 13834.0,
"step": 5210
},
{
"epoch": 9.830117647058824,
"grad_norm": 1.6317771673202515,
"learning_rate": 1.4621929438927299e-05,
"loss": 1.2151,
"mean_token_accuracy": 0.5420542072504759,
"num_tokens": 27298.0,
"step": 5220
},
{
"epoch": 9.848941176470587,
"grad_norm": 1.1831214427947998,
"learning_rate": 1.4603321827770578e-05,
"loss": 1.2075,
"mean_token_accuracy": 0.5548371035605669,
"num_tokens": 40750.0,
"step": 5230
},
{
"epoch": 9.867764705882353,
"grad_norm": 1.4046541452407837,
"learning_rate": 1.458469397023786e-05,
"loss": 1.2385,
"mean_token_accuracy": 0.5390195321291685,
"num_tokens": 54449.0,
"step": 5240
},
{
"epoch": 9.886588235294118,
"grad_norm": 1.1588149070739746,
"learning_rate": 1.4566045948258376e-05,
"loss": 1.2143,
"mean_token_accuracy": 0.5551448825746774,
"num_tokens": 67816.0,
"step": 5250
},
{
"epoch": 9.905411764705882,
"grad_norm": 1.3225456476211548,
"learning_rate": 1.4547377843850044e-05,
"loss": 1.2199,
"mean_token_accuracy": 0.5484016731381416,
"num_tokens": 81192.0,
"step": 5260
},
{
"epoch": 9.924235294117647,
"grad_norm": 0.9215822219848633,
"learning_rate": 1.45286897391191e-05,
"loss": 1.2136,
"mean_token_accuracy": 0.5466381188482046,
"num_tokens": 94106.0,
"step": 5270
},
{
"epoch": 9.943058823529412,
"grad_norm": 1.9844329357147217,
"learning_rate": 1.4509981716259762e-05,
"loss": 1.2251,
"mean_token_accuracy": 0.5436500191688538,
"num_tokens": 107211.0,
"step": 5280
},
{
"epoch": 9.961882352941176,
"grad_norm": 0.7866172194480896,
"learning_rate": 1.4491253857553838e-05,
"loss": 1.1928,
"mean_token_accuracy": 0.5536798264831304,
"num_tokens": 120603.0,
"step": 5290
},
{
"epoch": 9.980705882352941,
"grad_norm": 1.3284730911254883,
"learning_rate": 1.4472506245370382e-05,
"loss": 1.2201,
"mean_token_accuracy": 0.551696864143014,
"num_tokens": 135253.0,
"step": 5300
},
{
"epoch": 9.999529411764707,
"grad_norm": 0.8189272880554199,
"learning_rate": 1.445373896216533e-05,
"loss": 1.2535,
"mean_token_accuracy": 0.5395314753055572,
"num_tokens": 148070.0,
"step": 5310
},
{
"epoch": 10.018823529411765,
"grad_norm": 0.9590490460395813,
"learning_rate": 1.4434952090481135e-05,
"loss": 1.3331,
"mean_token_accuracy": 0.5544926153450478,
"num_tokens": 162263.0,
"step": 5320
},
{
"epoch": 10.037647058823529,
"grad_norm": 1.4627238512039185,
"learning_rate": 1.4416145712946406e-05,
"loss": 1.2488,
"mean_token_accuracy": 0.5324025351554156,
"num_tokens": 175371.0,
"step": 5330
},
{
"epoch": 10.056470588235294,
"grad_norm": 0.6929643154144287,
"learning_rate": 1.4397319912275535e-05,
"loss": 1.2071,
"mean_token_accuracy": 0.5509116105735302,
"num_tokens": 188794.0,
"step": 5340
},
{
"epoch": 10.07529411764706,
"grad_norm": 1.5115923881530762,
"learning_rate": 1.437847477126835e-05,
"loss": 1.1505,
"mean_token_accuracy": 0.5615889120846986,
"num_tokens": 201733.0,
"step": 5350
},
{
"epoch": 10.094117647058823,
"grad_norm": 1.651714563369751,
"learning_rate": 1.4359610372809739e-05,
"loss": 1.2233,
"mean_token_accuracy": 0.5453080747276544,
"num_tokens": 214934.0,
"step": 5360
},
{
"epoch": 10.112941176470589,
"grad_norm": 1.2535176277160645,
"learning_rate": 1.4340726799869283e-05,
"loss": 1.1925,
"mean_token_accuracy": 0.5584179207682609,
"num_tokens": 227831.0,
"step": 5370
},
{
"epoch": 10.131764705882352,
"grad_norm": 1.8965996503829956,
"learning_rate": 1.4321824135500904e-05,
"loss": 1.2347,
"mean_token_accuracy": 0.5445710398256779,
"num_tokens": 242053.0,
"step": 5380
},
{
"epoch": 10.150588235294117,
"grad_norm": 1.9367871284484863,
"learning_rate": 1.430290246284249e-05,
"loss": 1.2115,
"mean_token_accuracy": 0.5574517220258712,
"num_tokens": 256086.0,
"step": 5390
},
{
"epoch": 10.169411764705883,
"grad_norm": 0.6884622573852539,
"learning_rate": 1.4283961865115528e-05,
"loss": 1.2457,
"mean_token_accuracy": 0.5295402128249407,
"num_tokens": 269977.0,
"step": 5400
},
{
"epoch": 10.188235294117646,
"grad_norm": 0.7671216726303101,
"learning_rate": 1.426500242562474e-05,
"loss": 1.1288,
"mean_token_accuracy": 0.5702939372509718,
"num_tokens": 283412.0,
"step": 5410
},
{
"epoch": 10.207058823529412,
"grad_norm": 1.1199065446853638,
"learning_rate": 1.4246024227757735e-05,
"loss": 1.2184,
"mean_token_accuracy": 0.5337141178548336,
"num_tokens": 296574.0,
"step": 5420
},
{
"epoch": 10.225882352941177,
"grad_norm": 0.8241312503814697,
"learning_rate": 1.4227027354984602e-05,
"loss": 1.1945,
"mean_token_accuracy": 0.5481650296598672,
"num_tokens": 310305.0,
"step": 5430
},
{
"epoch": 10.24470588235294,
"grad_norm": 1.6059694290161133,
"learning_rate": 1.4208011890857577e-05,
"loss": 1.1322,
"mean_token_accuracy": 0.5755776699632407,
"num_tokens": 323670.0,
"step": 5440
},
{
"epoch": 10.263529411764706,
"grad_norm": 1.0941455364227295,
"learning_rate": 1.4188977919010664e-05,
"loss": 1.1634,
"mean_token_accuracy": 0.5623828198760747,
"num_tokens": 336409.0,
"step": 5450
},
{
"epoch": 10.282352941176471,
"grad_norm": 0.760979175567627,
"learning_rate": 1.4169925523159274e-05,
"loss": 1.2111,
"mean_token_accuracy": 0.5577297646552324,
"num_tokens": 349680.0,
"step": 5460
},
{
"epoch": 10.301176470588235,
"grad_norm": 1.41929292678833,
"learning_rate": 1.4150854787099836e-05,
"loss": 1.1846,
"mean_token_accuracy": 0.5624632347375155,
"num_tokens": 363183.0,
"step": 5470
},
{
"epoch": 10.32,
"grad_norm": 0.7982503771781921,
"learning_rate": 1.413176579470946e-05,
"loss": 1.2039,
"mean_token_accuracy": 0.5504359491169453,
"num_tokens": 376390.0,
"step": 5480
},
{
"epoch": 10.338823529411764,
"grad_norm": 1.3889517784118652,
"learning_rate": 1.4112658629945535e-05,
"loss": 1.1928,
"mean_token_accuracy": 0.5593543030321598,
"num_tokens": 389745.0,
"step": 5490
},
{
"epoch": 10.35764705882353,
"grad_norm": 1.3614208698272705,
"learning_rate": 1.409353337684539e-05,
"loss": 1.2334,
"mean_token_accuracy": 0.5366870552301407,
"num_tokens": 404220.0,
"step": 5500
},
{
"epoch": 10.376470588235295,
"grad_norm": 0.9981026649475098,
"learning_rate": 1.4074390119525898e-05,
"loss": 1.1739,
"mean_token_accuracy": 0.5642281893640757,
"num_tokens": 417700.0,
"step": 5510
},
{
"epoch": 10.395294117647058,
"grad_norm": 1.0381234884262085,
"learning_rate": 1.4055228942183128e-05,
"loss": 1.1977,
"mean_token_accuracy": 0.5563704077154398,
"num_tokens": 430901.0,
"step": 5520
},
{
"epoch": 10.414117647058823,
"grad_norm": 0.8158124089241028,
"learning_rate": 1.4036049929091964e-05,
"loss": 1.1914,
"mean_token_accuracy": 0.5571797143667936,
"num_tokens": 445094.0,
"step": 5530
},
{
"epoch": 10.432941176470589,
"grad_norm": 0.7652572393417358,
"learning_rate": 1.4016853164605728e-05,
"loss": 1.2376,
"mean_token_accuracy": 0.5498634003102779,
"num_tokens": 459543.0,
"step": 5540
},
{
"epoch": 10.451764705882352,
"grad_norm": 0.7951592206954956,
"learning_rate": 1.3997638733155822e-05,
"loss": 1.1997,
"mean_token_accuracy": 0.5588535733520985,
"num_tokens": 473275.0,
"step": 5550
},
{
"epoch": 10.470588235294118,
"grad_norm": 1.2788842916488647,
"learning_rate": 1.3978406719251352e-05,
"loss": 1.204,
"mean_token_accuracy": 0.5432504419237375,
"num_tokens": 485574.0,
"step": 5560
},
{
"epoch": 10.489411764705883,
"grad_norm": 1.9643447399139404,
"learning_rate": 1.3959157207478753e-05,
"loss": 1.1918,
"mean_token_accuracy": 0.5582812011241913,
"num_tokens": 498349.0,
"step": 5570
},
{
"epoch": 10.508235294117647,
"grad_norm": 1.2677149772644043,
"learning_rate": 1.3939890282501418e-05,
"loss": 1.2043,
"mean_token_accuracy": 0.5601174239069223,
"num_tokens": 511915.0,
"step": 5580
},
{
"epoch": 10.527058823529412,
"grad_norm": 1.0180656909942627,
"learning_rate": 1.3920606029059332e-05,
"loss": 1.2173,
"mean_token_accuracy": 0.5526633080095052,
"num_tokens": 524995.0,
"step": 5590
},
{
"epoch": 10.545882352941177,
"grad_norm": 1.1644375324249268,
"learning_rate": 1.3901304531968684e-05,
"loss": 1.1837,
"mean_token_accuracy": 0.5520532440394164,
"num_tokens": 537557.0,
"step": 5600
},
{
"epoch": 10.564705882352941,
"grad_norm": 1.3104006052017212,
"learning_rate": 1.388198587612152e-05,
"loss": 1.2209,
"mean_token_accuracy": 0.5339883405715227,
"num_tokens": 551827.0,
"step": 5610
},
{
"epoch": 10.583529411764706,
"grad_norm": 1.103053331375122,
"learning_rate": 1.386265014648534e-05,
"loss": 1.154,
"mean_token_accuracy": 0.5668028537184,
"num_tokens": 565218.0,
"step": 5620
},
{
"epoch": 10.60235294117647,
"grad_norm": 0.8747602105140686,
"learning_rate": 1.3843297428102742e-05,
"loss": 1.2476,
"mean_token_accuracy": 0.5371836949139833,
"num_tokens": 578938.0,
"step": 5630
},
{
"epoch": 10.621176470588235,
"grad_norm": 0.8349719047546387,
"learning_rate": 1.382392780609105e-05,
"loss": 1.1669,
"mean_token_accuracy": 0.5530536573380231,
"num_tokens": 592617.0,
"step": 5640
},
{
"epoch": 10.64,
"grad_norm": 1.4140478372573853,
"learning_rate": 1.3804541365641923e-05,
"loss": 1.2016,
"mean_token_accuracy": 0.5596294030547142,
"num_tokens": 606658.0,
"step": 5650
},
{
"epoch": 10.658823529411764,
"grad_norm": 1.2245830297470093,
"learning_rate": 1.3785138192021002e-05,
"loss": 1.1768,
"mean_token_accuracy": 0.5598421145230532,
"num_tokens": 619930.0,
"step": 5660
},
{
"epoch": 10.67764705882353,
"grad_norm": 1.3025885820388794,
"learning_rate": 1.3765718370567514e-05,
"loss": 1.1994,
"mean_token_accuracy": 0.5509582087397575,
"num_tokens": 633099.0,
"step": 5670
},
{
"epoch": 10.696470588235295,
"grad_norm": 0.9705594778060913,
"learning_rate": 1.3746281986693917e-05,
"loss": 1.1976,
"mean_token_accuracy": 0.5644174017012119,
"num_tokens": 647248.0,
"step": 5680
},
{
"epoch": 10.715294117647058,
"grad_norm": 1.8273649215698242,
"learning_rate": 1.3726829125885501e-05,
"loss": 1.1895,
"mean_token_accuracy": 0.5520309090614319,
"num_tokens": 660733.0,
"step": 5690
},
{
"epoch": 10.734117647058824,
"grad_norm": 0.9448793530464172,
"learning_rate": 1.370735987370004e-05,
"loss": 1.1612,
"mean_token_accuracy": 0.5580623522400856,
"num_tokens": 674331.0,
"step": 5700
},
{
"epoch": 10.75294117647059,
"grad_norm": 0.7382903099060059,
"learning_rate": 1.3687874315767388e-05,
"loss": 1.1906,
"mean_token_accuracy": 0.5549033779650927,
"num_tokens": 687329.0,
"step": 5710
},
{
"epoch": 10.771764705882353,
"grad_norm": 0.9180198907852173,
"learning_rate": 1.3668372537789122e-05,
"loss": 1.1708,
"mean_token_accuracy": 0.554550190269947,
"num_tokens": 701188.0,
"step": 5720
},
{
"epoch": 10.790588235294118,
"grad_norm": 1.3416258096694946,
"learning_rate": 1.3648854625538161e-05,
"loss": 1.2009,
"mean_token_accuracy": 0.5456226222217083,
"num_tokens": 715055.0,
"step": 5730
},
{
"epoch": 10.809411764705882,
"grad_norm": 0.9519694447517395,
"learning_rate": 1.3629320664858373e-05,
"loss": 1.188,
"mean_token_accuracy": 0.5580568216741085,
"num_tokens": 728299.0,
"step": 5740
},
{
"epoch": 10.828235294117647,
"grad_norm": 0.9768867492675781,
"learning_rate": 1.3609770741664225e-05,
"loss": 1.1748,
"mean_token_accuracy": 0.5568192675709724,
"num_tokens": 740400.0,
"step": 5750
},
{
"epoch": 10.847058823529412,
"grad_norm": 1.2277079820632935,
"learning_rate": 1.3590204941940384e-05,
"loss": 1.1883,
"mean_token_accuracy": 0.5541429404169321,
"num_tokens": 753926.0,
"step": 5760
},
{
"epoch": 10.865882352941176,
"grad_norm": 0.912382960319519,
"learning_rate": 1.3570623351741343e-05,
"loss": 1.2201,
"mean_token_accuracy": 0.5405797265470028,
"num_tokens": 767363.0,
"step": 5770
},
{
"epoch": 10.884705882352941,
"grad_norm": 1.2783665657043457,
"learning_rate": 1.3551026057191045e-05,
"loss": 1.2285,
"mean_token_accuracy": 0.5442549273371696,
"num_tokens": 780491.0,
"step": 5780
},
{
"epoch": 10.903529411764707,
"grad_norm": 0.789916455745697,
"learning_rate": 1.3531413144482512e-05,
"loss": 1.23,
"mean_token_accuracy": 0.5449609015136957,
"num_tokens": 793793.0,
"step": 5790
},
{
"epoch": 10.92235294117647,
"grad_norm": 1.2650339603424072,
"learning_rate": 1.351178469987745e-05,
"loss": 1.2049,
"mean_token_accuracy": 0.543266024813056,
"num_tokens": 807792.0,
"step": 5800
},
{
"epoch": 10.941176470588236,
"grad_norm": 0.9021736979484558,
"learning_rate": 1.3492140809705881e-05,
"loss": 1.1796,
"mean_token_accuracy": 0.5581005875021219,
"num_tokens": 821010.0,
"step": 5810
},
{
"epoch": 10.96,
"grad_norm": 0.8924301266670227,
"learning_rate": 1.3472481560365758e-05,
"loss": 1.1796,
"mean_token_accuracy": 0.5598813854157925,
"num_tokens": 833101.0,
"step": 5820
},
{
"epoch": 10.978823529411764,
"grad_norm": 1.8735415935516357,
"learning_rate": 1.3452807038322585e-05,
"loss": 1.1992,
"mean_token_accuracy": 0.5387950103729964,
"num_tokens": 845747.0,
"step": 5830
},
{
"epoch": 10.99764705882353,
"grad_norm": 0.8486454486846924,
"learning_rate": 1.3433117330109045e-05,
"loss": 1.1955,
"mean_token_accuracy": 0.550658929720521,
"num_tokens": 859896.0,
"step": 5840
},
{
"epoch": 11.015058823529412,
"grad_norm": 0.8041768670082092,
"learning_rate": 1.3413412522324609e-05,
"loss": 1.1699,
"mean_token_accuracy": 0.5523232479353208,
"num_tokens": 871831.0,
"step": 5850
},
{
"epoch": 11.033882352941177,
"grad_norm": 1.445483922958374,
"learning_rate": 1.3393692701635154e-05,
"loss": 1.214,
"mean_token_accuracy": 0.5537016060203314,
"num_tokens": 885406.0,
"step": 5860
},
{
"epoch": 11.05270588235294,
"grad_norm": 0.697123110294342,
"learning_rate": 1.33739579547726e-05,
"loss": 1.1622,
"mean_token_accuracy": 0.5540354669094085,
"num_tokens": 898421.0,
"step": 5870
},
{
"epoch": 11.071529411764706,
"grad_norm": 1.465420126914978,
"learning_rate": 1.3354208368534503e-05,
"loss": 1.2069,
"mean_token_accuracy": 0.551685893163085,
"num_tokens": 912642.0,
"step": 5880
},
{
"epoch": 11.090352941176471,
"grad_norm": 0.859109103679657,
"learning_rate": 1.333444402978369e-05,
"loss": 1.1826,
"mean_token_accuracy": 0.5513388890773058,
"num_tokens": 926366.0,
"step": 5890
},
{
"epoch": 11.109176470588235,
"grad_norm": 1.5829471349716187,
"learning_rate": 1.3314665025447876e-05,
"loss": 1.2038,
"mean_token_accuracy": 0.5505582805722952,
"num_tokens": 939684.0,
"step": 5900
},
{
"epoch": 11.128,
"grad_norm": 0.8772886991500854,
"learning_rate": 1.3294871442519271e-05,
"loss": 1.1629,
"mean_token_accuracy": 0.5473615158349275,
"num_tokens": 953085.0,
"step": 5910
},
{
"epoch": 11.146823529411765,
"grad_norm": 0.9280370473861694,
"learning_rate": 1.3275063368054208e-05,
"loss": 1.1943,
"mean_token_accuracy": 0.5425072379410267,
"num_tokens": 966596.0,
"step": 5920
},
{
"epoch": 11.165647058823529,
"grad_norm": 1.6158865690231323,
"learning_rate": 1.3255240889172764e-05,
"loss": 1.1669,
"mean_token_accuracy": 0.5687259271740913,
"num_tokens": 981302.0,
"step": 5930
},
{
"epoch": 11.184470588235294,
"grad_norm": 0.9575416445732117,
"learning_rate": 1.323540409305836e-05,
"loss": 1.1828,
"mean_token_accuracy": 0.5527924958616495,
"num_tokens": 993437.0,
"step": 5940
},
{
"epoch": 11.203294117647058,
"grad_norm": 0.8492655158042908,
"learning_rate": 1.3215553066957391e-05,
"loss": 1.1352,
"mean_token_accuracy": 0.5696950633078813,
"num_tokens": 1006394.0,
"step": 5950
},
{
"epoch": 11.222117647058823,
"grad_norm": 1.3289772272109985,
"learning_rate": 1.3195687898178837e-05,
"loss": 1.198,
"mean_token_accuracy": 0.5483724296092987,
"num_tokens": 1019972.0,
"step": 5960
},
{
"epoch": 11.240941176470589,
"grad_norm": 0.8151838779449463,
"learning_rate": 1.3175808674093882e-05,
"loss": 1.2229,
"mean_token_accuracy": 0.5452193580567837,
"num_tokens": 1033578.0,
"step": 5970
},
{
"epoch": 11.259764705882352,
"grad_norm": 1.242629051208496,
"learning_rate": 1.3155915482135528e-05,
"loss": 1.2035,
"mean_token_accuracy": 0.5487495046108961,
"num_tokens": 1046758.0,
"step": 5980
},
{
"epoch": 11.278588235294118,
"grad_norm": 1.238258719444275,
"learning_rate": 1.3136008409798214e-05,
"loss": 1.1993,
"mean_token_accuracy": 0.5489524565637112,
"num_tokens": 1061219.0,
"step": 5990
},
{
"epoch": 11.297411764705883,
"grad_norm": 1.2245213985443115,
"learning_rate": 1.3116087544637415e-05,
"loss": 1.1616,
"mean_token_accuracy": 0.5596596848219633,
"num_tokens": 1074213.0,
"step": 6000
},
{
"epoch": 11.316235294117647,
"grad_norm": 0.8657311797142029,
"learning_rate": 1.3096152974269289e-05,
"loss": 1.179,
"mean_token_accuracy": 0.5485074911266565,
"num_tokens": 1088151.0,
"step": 6010
},
{
"epoch": 11.335058823529412,
"grad_norm": 0.8435884714126587,
"learning_rate": 1.3076204786370256e-05,
"loss": 1.1862,
"mean_token_accuracy": 0.5667649589478969,
"num_tokens": 1101736.0,
"step": 6020
},
{
"epoch": 11.353882352941177,
"grad_norm": 4.664355278015137,
"learning_rate": 1.3056243068676637e-05,
"loss": 1.1899,
"mean_token_accuracy": 0.5602201897650957,
"num_tokens": 1115368.0,
"step": 6030
},
{
"epoch": 11.37270588235294,
"grad_norm": 0.7196553945541382,
"learning_rate": 1.3036267908984257e-05,
"loss": 1.2337,
"mean_token_accuracy": 0.5351801011711359,
"num_tokens": 1128875.0,
"step": 6040
},
{
"epoch": 11.391529411764706,
"grad_norm": 0.8050165772438049,
"learning_rate": 1.3016279395148067e-05,
"loss": 1.2082,
"mean_token_accuracy": 0.5454613540321589,
"num_tokens": 1141185.0,
"step": 6050
},
{
"epoch": 11.41035294117647,
"grad_norm": 1.6665176153182983,
"learning_rate": 1.2996277615081738e-05,
"loss": 1.1941,
"mean_token_accuracy": 0.5567255288362503,
"num_tokens": 1154738.0,
"step": 6060
},
{
"epoch": 11.429176470588235,
"grad_norm": 1.7246068716049194,
"learning_rate": 1.297626265675731e-05,
"loss": 1.19,
"mean_token_accuracy": 0.5438788242638111,
"num_tokens": 1168172.0,
"step": 6070
},
{
"epoch": 11.448,
"grad_norm": 0.8024677038192749,
"learning_rate": 1.2956234608204765e-05,
"loss": 1.2072,
"mean_token_accuracy": 0.5483981113880873,
"num_tokens": 1181363.0,
"step": 6080
},
{
"epoch": 11.466823529411764,
"grad_norm": 1.0496245622634888,
"learning_rate": 1.293619355751167e-05,
"loss": 1.1437,
"mean_token_accuracy": 0.5686488572508097,
"num_tokens": 1195397.0,
"step": 6090
},
{
"epoch": 11.48564705882353,
"grad_norm": 0.6598522067070007,
"learning_rate": 1.2916139592822776e-05,
"loss": 1.2051,
"mean_token_accuracy": 0.5419987261295318,
"num_tokens": 1208020.0,
"step": 6100
},
{
"epoch": 11.504470588235295,
"grad_norm": 1.8896702527999878,
"learning_rate": 1.2896072802339623e-05,
"loss": 1.1603,
"mean_token_accuracy": 0.571081367880106,
"num_tokens": 1222044.0,
"step": 6110
},
{
"epoch": 11.523294117647058,
"grad_norm": 0.8881447911262512,
"learning_rate": 1.2875993274320173e-05,
"loss": 1.1703,
"mean_token_accuracy": 0.5585772100836038,
"num_tokens": 1236218.0,
"step": 6120
},
{
"epoch": 11.542117647058824,
"grad_norm": 1.263449788093567,
"learning_rate": 1.2855901097078412e-05,
"loss": 1.1544,
"mean_token_accuracy": 0.5590313211083412,
"num_tokens": 1249412.0,
"step": 6130
},
{
"epoch": 11.560941176470589,
"grad_norm": 1.120104432106018,
"learning_rate": 1.2835796358983943e-05,
"loss": 1.2265,
"mean_token_accuracy": 0.5422938629984856,
"num_tokens": 1262521.0,
"step": 6140
},
{
"epoch": 11.579764705882353,
"grad_norm": 1.0734158754348755,
"learning_rate": 1.2815679148461636e-05,
"loss": 1.2199,
"mean_token_accuracy": 0.5645121570676566,
"num_tokens": 1276508.0,
"step": 6150
},
{
"epoch": 11.598588235294118,
"grad_norm": 0.7284833192825317,
"learning_rate": 1.2795549553991202e-05,
"loss": 1.1995,
"mean_token_accuracy": 0.55781021527946,
"num_tokens": 1289814.0,
"step": 6160
},
{
"epoch": 11.617411764705881,
"grad_norm": 0.9633259773254395,
"learning_rate": 1.2775407664106825e-05,
"loss": 1.1882,
"mean_token_accuracy": 0.555243044346571,
"num_tokens": 1303074.0,
"step": 6170
},
{
"epoch": 11.636235294117647,
"grad_norm": 0.6576571464538574,
"learning_rate": 1.2755253567396766e-05,
"loss": 1.208,
"mean_token_accuracy": 0.5453934874385595,
"num_tokens": 1316357.0,
"step": 6180
},
{
"epoch": 11.655058823529412,
"grad_norm": 2.550999879837036,
"learning_rate": 1.2735087352502977e-05,
"loss": 1.2179,
"mean_token_accuracy": 0.5549823541194201,
"num_tokens": 1329683.0,
"step": 6190
},
{
"epoch": 11.673882352941176,
"grad_norm": 1.061109185218811,
"learning_rate": 1.2714909108120698e-05,
"loss": 1.1632,
"mean_token_accuracy": 0.5776005409657955,
"num_tokens": 1342884.0,
"step": 6200
},
{
"epoch": 11.694588235294118,
"grad_norm": 2.3584885597229004,
"learning_rate": 1.2694718922998097e-05,
"loss": 1.1887,
"mean_token_accuracy": 0.5592446334660053,
"num_tokens": 13595.0,
"step": 6210
},
{
"epoch": 11.713411764705882,
"grad_norm": 0.8795002698898315,
"learning_rate": 1.2674516885935835e-05,
"loss": 1.1999,
"mean_token_accuracy": 0.5442189387977123,
"num_tokens": 26488.0,
"step": 6220
},
{
"epoch": 11.732235294117647,
"grad_norm": 0.8854806423187256,
"learning_rate": 1.2654303085786723e-05,
"loss": 1.1295,
"mean_token_accuracy": 0.5750894896686077,
"num_tokens": 40270.0,
"step": 6230
},
{
"epoch": 11.751058823529412,
"grad_norm": 0.8246656656265259,
"learning_rate": 1.2634077611455294e-05,
"loss": 1.1946,
"mean_token_accuracy": 0.5488846648484469,
"num_tokens": 54551.0,
"step": 6240
},
{
"epoch": 11.769882352941176,
"grad_norm": 1.1240957975387573,
"learning_rate": 1.2613840551897428e-05,
"loss": 1.2098,
"mean_token_accuracy": 0.5464079327881336,
"num_tokens": 68016.0,
"step": 6250
},
{
"epoch": 11.788705882352941,
"grad_norm": 0.672888994216919,
"learning_rate": 1.2593591996119965e-05,
"loss": 1.2383,
"mean_token_accuracy": 0.5364337969571352,
"num_tokens": 82740.0,
"step": 6260
},
{
"epoch": 11.807529411764706,
"grad_norm": 0.9078545570373535,
"learning_rate": 1.257333203318031e-05,
"loss": 1.1736,
"mean_token_accuracy": 0.5599259410053492,
"num_tokens": 95371.0,
"step": 6270
},
{
"epoch": 11.82635294117647,
"grad_norm": 1.5212457180023193,
"learning_rate": 1.2553060752186024e-05,
"loss": 1.1655,
"mean_token_accuracy": 0.5664675917476416,
"num_tokens": 108447.0,
"step": 6280
},
{
"epoch": 11.845176470588235,
"grad_norm": 0.6960899829864502,
"learning_rate": 1.2532778242294467e-05,
"loss": 1.184,
"mean_token_accuracy": 0.559162225574255,
"num_tokens": 121975.0,
"step": 6290
},
{
"epoch": 11.864,
"grad_norm": 1.1340759992599487,
"learning_rate": 1.2512484592712373e-05,
"loss": 1.1874,
"mean_token_accuracy": 0.5441106397658586,
"num_tokens": 135823.0,
"step": 6300
},
{
"epoch": 11.882823529411764,
"grad_norm": 0.7761991024017334,
"learning_rate": 1.2492179892695473e-05,
"loss": 1.1632,
"mean_token_accuracy": 0.5639519464224577,
"num_tokens": 147887.0,
"step": 6310
},
{
"epoch": 11.90164705882353,
"grad_norm": 1.0354701280593872,
"learning_rate": 1.24718642315481e-05,
"loss": 1.1432,
"mean_token_accuracy": 0.5634768087416887,
"num_tokens": 161408.0,
"step": 6320
},
{
"epoch": 11.920470588235293,
"grad_norm": 0.9502993822097778,
"learning_rate": 1.2451537698622799e-05,
"loss": 1.134,
"mean_token_accuracy": 0.5664606466889381,
"num_tokens": 173813.0,
"step": 6330
},
{
"epoch": 11.939294117647059,
"grad_norm": 0.7904968857765198,
"learning_rate": 1.2431200383319931e-05,
"loss": 1.1782,
"mean_token_accuracy": 0.5518010076135397,
"num_tokens": 186645.0,
"step": 6340
},
{
"epoch": 11.958117647058824,
"grad_norm": 0.6425819993019104,
"learning_rate": 1.2410852375087279e-05,
"loss": 1.1928,
"mean_token_accuracy": 0.5603426963090896,
"num_tokens": 201324.0,
"step": 6350
},
{
"epoch": 11.976941176470588,
"grad_norm": 1.029788851737976,
"learning_rate": 1.2390493763419654e-05,
"loss": 1.2296,
"mean_token_accuracy": 0.530813605338335,
"num_tokens": 213844.0,
"step": 6360
},
{
"epoch": 11.995764705882353,
"grad_norm": 1.0189554691314697,
"learning_rate": 1.2370124637858508e-05,
"loss": 1.125,
"mean_token_accuracy": 0.5650646161288023,
"num_tokens": 227343.0,
"step": 6370
},
{
"epoch": 12.015058823529412,
"grad_norm": 1.3754558563232422,
"learning_rate": 1.2349745087991529e-05,
"loss": 1.2719,
"mean_token_accuracy": 0.5659251939959642,
"num_tokens": 241245.0,
"step": 6380
},
{
"epoch": 12.033882352941177,
"grad_norm": 1.338383436203003,
"learning_rate": 1.2329355203452258e-05,
"loss": 1.2056,
"mean_token_accuracy": 0.5403179809451103,
"num_tokens": 254997.0,
"step": 6390
},
{
"epoch": 12.05270588235294,
"grad_norm": 0.7401157021522522,
"learning_rate": 1.2308955073919688e-05,
"loss": 1.2027,
"mean_token_accuracy": 0.5495895497500897,
"num_tokens": 268506.0,
"step": 6400
},
{
"epoch": 12.071529411764706,
"grad_norm": 1.382067084312439,
"learning_rate": 1.2288544789117876e-05,
"loss": 1.1167,
"mean_token_accuracy": 0.5725257787853479,
"num_tokens": 281845.0,
"step": 6410
},
{
"epoch": 12.090352941176471,
"grad_norm": 1.1322115659713745,
"learning_rate": 1.2268124438815531e-05,
"loss": 1.1649,
"mean_token_accuracy": 0.5655132979154587,
"num_tokens": 294981.0,
"step": 6420
},
{
"epoch": 12.109176470588235,
"grad_norm": 2.3306519985198975,
"learning_rate": 1.2247694112825654e-05,
"loss": 1.1841,
"mean_token_accuracy": 0.5553363788872957,
"num_tokens": 308385.0,
"step": 6430
},
{
"epoch": 12.128,
"grad_norm": 1.036372423171997,
"learning_rate": 1.2227253901005101e-05,
"loss": 1.1893,
"mean_token_accuracy": 0.550970122590661,
"num_tokens": 321685.0,
"step": 6440
},
{
"epoch": 12.146823529411765,
"grad_norm": 1.155049443244934,
"learning_rate": 1.2206803893254215e-05,
"loss": 1.1504,
"mean_token_accuracy": 0.5654265254735946,
"num_tokens": 334803.0,
"step": 6450
},
{
"epoch": 12.165647058823529,
"grad_norm": 0.8562523126602173,
"learning_rate": 1.2186344179516425e-05,
"loss": 1.1566,
"mean_token_accuracy": 0.5620875429362059,
"num_tokens": 347350.0,
"step": 6460
},
{
"epoch": 12.184470588235294,
"grad_norm": 1.5091642141342163,
"learning_rate": 1.2165874849777853e-05,
"loss": 1.1878,
"mean_token_accuracy": 0.5486861743032933,
"num_tokens": 361251.0,
"step": 6470
},
{
"epoch": 12.203294117647058,
"grad_norm": 1.0562283992767334,
"learning_rate": 1.21453959940669e-05,
"loss": 1.2455,
"mean_token_accuracy": 0.5419348709285259,
"num_tokens": 374607.0,
"step": 6480
},
{
"epoch": 12.222117647058823,
"grad_norm": 2.187586545944214,
"learning_rate": 1.2124907702453883e-05,
"loss": 1.1733,
"mean_token_accuracy": 0.5536637313663959,
"num_tokens": 388714.0,
"step": 6490
},
{
"epoch": 12.240941176470589,
"grad_norm": 1.4512325525283813,
"learning_rate": 1.2104410065050605e-05,
"loss": 1.2252,
"mean_token_accuracy": 0.5447334434837103,
"num_tokens": 402108.0,
"step": 6500
},
{
"epoch": 12.259764705882352,
"grad_norm": 1.3915634155273438,
"learning_rate": 1.208390317200998e-05,
"loss": 1.1606,
"mean_token_accuracy": 0.5588117640465498,
"num_tokens": 417457.0,
"step": 6510
},
{
"epoch": 12.278588235294118,
"grad_norm": 1.3058298826217651,
"learning_rate": 1.2063387113525635e-05,
"loss": 1.1624,
"mean_token_accuracy": 0.567823113501072,
"num_tokens": 429732.0,
"step": 6520
},
{
"epoch": 12.297411764705883,
"grad_norm": 1.4169563055038452,
"learning_rate": 1.2042861979831496e-05,
"loss": 1.1758,
"mean_token_accuracy": 0.5708753641694784,
"num_tokens": 444198.0,
"step": 6530
},
{
"epoch": 12.316235294117647,
"grad_norm": 0.916401207447052,
"learning_rate": 1.202232786120141e-05,
"loss": 1.1952,
"mean_token_accuracy": 0.5533534411340952,
"num_tokens": 457447.0,
"step": 6540
},
{
"epoch": 12.335058823529412,
"grad_norm": 1.6477797031402588,
"learning_rate": 1.200178484794875e-05,
"loss": 1.1731,
"mean_token_accuracy": 0.5523608162999153,
"num_tokens": 471188.0,
"step": 6550
},
{
"epoch": 12.353882352941177,
"grad_norm": 0.7916552424430847,
"learning_rate": 1.1981233030425996e-05,
"loss": 1.1525,
"mean_token_accuracy": 0.5627595514059067,
"num_tokens": 483973.0,
"step": 6560
},
{
"epoch": 12.37270588235294,
"grad_norm": 1.6025060415267944,
"learning_rate": 1.1960672499024359e-05,
"loss": 1.1149,
"mean_token_accuracy": 0.5760308355093002,
"num_tokens": 497279.0,
"step": 6570
},
{
"epoch": 12.391529411764706,
"grad_norm": 1.0256032943725586,
"learning_rate": 1.1940103344173375e-05,
"loss": 1.1646,
"mean_token_accuracy": 0.5561530087143183,
"num_tokens": 511615.0,
"step": 6580
},
{
"epoch": 12.41035294117647,
"grad_norm": 1.3151596784591675,
"learning_rate": 1.1919525656340503e-05,
"loss": 1.1609,
"mean_token_accuracy": 0.5567661169916391,
"num_tokens": 524047.0,
"step": 6590
},
{
"epoch": 12.429176470588235,
"grad_norm": 0.9940578937530518,
"learning_rate": 1.1898939526030732e-05,
"loss": 1.199,
"mean_token_accuracy": 0.5474131718277931,
"num_tokens": 537761.0,
"step": 6600
},
{
"epoch": 12.448,
"grad_norm": 0.7173454165458679,
"learning_rate": 1.1878345043786195e-05,
"loss": 1.1698,
"mean_token_accuracy": 0.5660860728472471,
"num_tokens": 551598.0,
"step": 6610
},
{
"epoch": 12.466823529411764,
"grad_norm": 1.1414166688919067,
"learning_rate": 1.1857742300185739e-05,
"loss": 1.2336,
"mean_token_accuracy": 0.5513837717473506,
"num_tokens": 566797.0,
"step": 6620
},
{
"epoch": 12.48564705882353,
"grad_norm": 0.6648653745651245,
"learning_rate": 1.1837131385844567e-05,
"loss": 1.1919,
"mean_token_accuracy": 0.5441902942955494,
"num_tokens": 580339.0,
"step": 6630
},
{
"epoch": 12.504470588235295,
"grad_norm": 1.2969242334365845,
"learning_rate": 1.1816512391413798e-05,
"loss": 1.1584,
"mean_token_accuracy": 0.5687514644116163,
"num_tokens": 592369.0,
"step": 6640
},
{
"epoch": 12.523294117647058,
"grad_norm": 0.6251775622367859,
"learning_rate": 1.179588540758011e-05,
"loss": 1.2068,
"mean_token_accuracy": 0.5384650267660618,
"num_tokens": 605961.0,
"step": 6650
},
{
"epoch": 12.542117647058824,
"grad_norm": 1.3755369186401367,
"learning_rate": 1.1775250525065297e-05,
"loss": 1.1859,
"mean_token_accuracy": 0.5518178451806307,
"num_tokens": 618337.0,
"step": 6660
},
{
"epoch": 12.560941176470589,
"grad_norm": 1.2308052778244019,
"learning_rate": 1.1754607834625915e-05,
"loss": 1.2075,
"mean_token_accuracy": 0.5420106790959835,
"num_tokens": 632237.0,
"step": 6670
},
{
"epoch": 12.579764705882353,
"grad_norm": 0.7645729184150696,
"learning_rate": 1.1733957427052842e-05,
"loss": 1.1931,
"mean_token_accuracy": 0.562155156955123,
"num_tokens": 645494.0,
"step": 6680
},
{
"epoch": 12.598588235294118,
"grad_norm": 0.6689856052398682,
"learning_rate": 1.1713299393170916e-05,
"loss": 1.1567,
"mean_token_accuracy": 0.5580319032073021,
"num_tokens": 658861.0,
"step": 6690
},
{
"epoch": 12.617411764705881,
"grad_norm": 1.2952977418899536,
"learning_rate": 1.1692633823838503e-05,
"loss": 1.1983,
"mean_token_accuracy": 0.5488288260996341,
"num_tokens": 671873.0,
"step": 6700
},
{
"epoch": 12.636235294117647,
"grad_norm": 0.988854169845581,
"learning_rate": 1.1671960809947116e-05,
"loss": 1.2001,
"mean_token_accuracy": 0.5581530544906854,
"num_tokens": 684288.0,
"step": 6710
},
{
"epoch": 12.655058823529412,
"grad_norm": 0.9140803813934326,
"learning_rate": 1.165128044242101e-05,
"loss": 1.1754,
"mean_token_accuracy": 0.5484160725027323,
"num_tokens": 696428.0,
"step": 6720
},
{
"epoch": 12.673882352941176,
"grad_norm": 1.194382905960083,
"learning_rate": 1.163059281221679e-05,
"loss": 1.16,
"mean_token_accuracy": 0.5650255784392357,
"num_tokens": 709841.0,
"step": 6730
},
{
"epoch": 12.692705882352941,
"grad_norm": 0.7279021143913269,
"learning_rate": 1.1609898010322989e-05,
"loss": 1.1799,
"mean_token_accuracy": 0.5441335134208203,
"num_tokens": 724299.0,
"step": 6740
},
{
"epoch": 12.711529411764706,
"grad_norm": 0.7829269766807556,
"learning_rate": 1.1589196127759697e-05,
"loss": 1.1982,
"mean_token_accuracy": 0.5436731087043881,
"num_tokens": 737467.0,
"step": 6750
},
{
"epoch": 12.73035294117647,
"grad_norm": 0.908854603767395,
"learning_rate": 1.1568487255578135e-05,
"loss": 1.1589,
"mean_token_accuracy": 0.5564702823758125,
"num_tokens": 751035.0,
"step": 6760
},
{
"epoch": 12.749176470588235,
"grad_norm": 0.8606781363487244,
"learning_rate": 1.1547771484860282e-05,
"loss": 1.1811,
"mean_token_accuracy": 0.5530305828899145,
"num_tokens": 764012.0,
"step": 6770
},
{
"epoch": 12.768,
"grad_norm": 0.8715227246284485,
"learning_rate": 1.1527048906718434e-05,
"loss": 1.1731,
"mean_token_accuracy": 0.5534448944032192,
"num_tokens": 777823.0,
"step": 6780
},
{
"epoch": 12.786823529411764,
"grad_norm": 1.383436918258667,
"learning_rate": 1.1506319612294855e-05,
"loss": 1.2038,
"mean_token_accuracy": 0.5430160872638226,
"num_tokens": 791112.0,
"step": 6790
},
{
"epoch": 12.80564705882353,
"grad_norm": 0.6807175278663635,
"learning_rate": 1.148558369276132e-05,
"loss": 1.1325,
"mean_token_accuracy": 0.5736443504691124,
"num_tokens": 804227.0,
"step": 6800
},
{
"epoch": 12.824470588235293,
"grad_norm": 1.107948660850525,
"learning_rate": 1.1464841239318764e-05,
"loss": 1.1518,
"mean_token_accuracy": 0.5673416070640087,
"num_tokens": 817620.0,
"step": 6810
},
{
"epoch": 12.843294117647059,
"grad_norm": 0.7133264541625977,
"learning_rate": 1.1444092343196855e-05,
"loss": 1.1768,
"mean_token_accuracy": 0.5533497478812933,
"num_tokens": 831699.0,
"step": 6820
},
{
"epoch": 12.862117647058824,
"grad_norm": 0.7470325231552124,
"learning_rate": 1.1423337095653595e-05,
"loss": 1.1794,
"mean_token_accuracy": 0.556913785263896,
"num_tokens": 845041.0,
"step": 6830
},
{
"epoch": 12.880941176470587,
"grad_norm": 0.7599585056304932,
"learning_rate": 1.1402575587974915e-05,
"loss": 1.1831,
"mean_token_accuracy": 0.5495749611407519,
"num_tokens": 858034.0,
"step": 6840
},
{
"epoch": 12.899764705882353,
"grad_norm": 0.9152631163597107,
"learning_rate": 1.1381807911474291e-05,
"loss": 1.1693,
"mean_token_accuracy": 0.5672723963856697,
"num_tokens": 871960.0,
"step": 6850
},
{
"epoch": 12.918588235294118,
"grad_norm": 1.0719937086105347,
"learning_rate": 1.1361034157492324e-05,
"loss": 1.2041,
"mean_token_accuracy": 0.5518028371036052,
"num_tokens": 884661.0,
"step": 6860
},
{
"epoch": 12.937411764705882,
"grad_norm": 1.084991455078125,
"learning_rate": 1.1340254417396343e-05,
"loss": 1.2011,
"mean_token_accuracy": 0.5481019847095012,
"num_tokens": 897816.0,
"step": 6870
},
{
"epoch": 12.956235294117647,
"grad_norm": 1.3787931203842163,
"learning_rate": 1.131946878258001e-05,
"loss": 1.1315,
"mean_token_accuracy": 0.5697043187916279,
"num_tokens": 910552.0,
"step": 6880
},
{
"epoch": 12.975058823529412,
"grad_norm": 1.2762988805770874,
"learning_rate": 1.1298677344462914e-05,
"loss": 1.1643,
"mean_token_accuracy": 0.5592548452317715,
"num_tokens": 924705.0,
"step": 6890
},
{
"epoch": 12.993882352941176,
"grad_norm": 0.8996446132659912,
"learning_rate": 1.127788019449016e-05,
"loss": 1.2202,
"mean_token_accuracy": 0.5417749028652906,
"num_tokens": 938010.0,
"step": 6900
},
{
"epoch": 13.01129411764706,
"grad_norm": 1.195081114768982,
"learning_rate": 1.1257077424131985e-05,
"loss": 1.1559,
"mean_token_accuracy": 0.5514025462640298,
"num_tokens": 951050.0,
"step": 6910
},
{
"epoch": 13.030117647058823,
"grad_norm": 1.7555843591690063,
"learning_rate": 1.1236269124883339e-05,
"loss": 1.1524,
"mean_token_accuracy": 0.5562022086232901,
"num_tokens": 964411.0,
"step": 6920
},
{
"epoch": 13.048941176470588,
"grad_norm": 1.3604152202606201,
"learning_rate": 1.1215455388263496e-05,
"loss": 1.1602,
"mean_token_accuracy": 0.5531352117657662,
"num_tokens": 976752.0,
"step": 6930
},
{
"epoch": 13.067764705882352,
"grad_norm": 1.0296913385391235,
"learning_rate": 1.1194636305815635e-05,
"loss": 1.1608,
"mean_token_accuracy": 0.5703556634485721,
"num_tokens": 989410.0,
"step": 6940
},
{
"epoch": 13.086588235294117,
"grad_norm": 1.0703682899475098,
"learning_rate": 1.1173811969106451e-05,
"loss": 1.1665,
"mean_token_accuracy": 0.5600442342460156,
"num_tokens": 1003038.0,
"step": 6950
},
{
"epoch": 13.105411764705883,
"grad_norm": 0.9015535712242126,
"learning_rate": 1.1152982469725755e-05,
"loss": 1.1816,
"mean_token_accuracy": 0.5589367963373661,
"num_tokens": 1017162.0,
"step": 6960
},
{
"epoch": 13.124235294117646,
"grad_norm": 0.7695736885070801,
"learning_rate": 1.1132147899286054e-05,
"loss": 1.2044,
"mean_token_accuracy": 0.5554168112576008,
"num_tokens": 1030524.0,
"step": 6970
},
{
"epoch": 13.143058823529412,
"grad_norm": 0.6659622192382812,
"learning_rate": 1.1111308349422165e-05,
"loss": 1.1464,
"mean_token_accuracy": 0.5591688379645348,
"num_tokens": 1043594.0,
"step": 6980
},
{
"epoch": 13.161882352941177,
"grad_norm": 0.7469462156295776,
"learning_rate": 1.1090463911790807e-05,
"loss": 1.1107,
"mean_token_accuracy": 0.5700595445930958,
"num_tokens": 1057281.0,
"step": 6990
},
{
"epoch": 13.18070588235294,
"grad_norm": 0.7391088008880615,
"learning_rate": 1.1069614678070193e-05,
"loss": 1.1526,
"mean_token_accuracy": 0.5636973019689322,
"num_tokens": 1071037.0,
"step": 7000
},
{
"epoch": 13.199529411764706,
"grad_norm": 1.2832854986190796,
"learning_rate": 1.1048760739959628e-05,
"loss": 1.1978,
"mean_token_accuracy": 0.5526088491082192,
"num_tokens": 1084747.0,
"step": 7010
},
{
"epoch": 13.218352941176471,
"grad_norm": 0.7462761402130127,
"learning_rate": 1.1027902189179107e-05,
"loss": 1.1735,
"mean_token_accuracy": 0.5575515639036894,
"num_tokens": 1098389.0,
"step": 7020
},
{
"epoch": 13.237176470588235,
"grad_norm": 0.6917766332626343,
"learning_rate": 1.1007039117468928e-05,
"loss": 1.1831,
"mean_token_accuracy": 0.5607794526964426,
"num_tokens": 1111600.0,
"step": 7030
},
{
"epoch": 13.256,
"grad_norm": 0.8664299845695496,
"learning_rate": 1.0986171616589247e-05,
"loss": 1.1687,
"mean_token_accuracy": 0.5606127306818962,
"num_tokens": 1125368.0,
"step": 7040
},
{
"epoch": 13.274823529411766,
"grad_norm": 0.7263596653938293,
"learning_rate": 1.0965299778319728e-05,
"loss": 1.2441,
"mean_token_accuracy": 0.5351598154753446,
"num_tokens": 1139128.0,
"step": 7050
},
{
"epoch": 13.29364705882353,
"grad_norm": 1.76611328125,
"learning_rate": 1.0944423694459087e-05,
"loss": 1.1559,
"mean_token_accuracy": 0.567909749224782,
"num_tokens": 1152782.0,
"step": 7060
},
{
"epoch": 13.312470588235294,
"grad_norm": 1.1992835998535156,
"learning_rate": 1.0923543456824737e-05,
"loss": 1.1802,
"mean_token_accuracy": 0.5539811603724957,
"num_tokens": 1166674.0,
"step": 7070
},
{
"epoch": 13.331294117647058,
"grad_norm": 1.3675031661987305,
"learning_rate": 1.0902659157252333e-05,
"loss": 1.1545,
"mean_token_accuracy": 0.5604531057178974,
"num_tokens": 1180331.0,
"step": 7080
},
{
"epoch": 13.350117647058823,
"grad_norm": 0.7396109700202942,
"learning_rate": 1.088177088759542e-05,
"loss": 1.158,
"mean_token_accuracy": 0.5598292458802462,
"num_tokens": 1193986.0,
"step": 7090
},
{
"epoch": 13.368941176470589,
"grad_norm": 0.7911150455474854,
"learning_rate": 1.0860878739724989e-05,
"loss": 1.1662,
"mean_token_accuracy": 0.5609062645584345,
"num_tokens": 1207593.0,
"step": 7100
},
{
"epoch": 13.387764705882352,
"grad_norm": 0.7450502514839172,
"learning_rate": 1.0839982805529097e-05,
"loss": 1.1734,
"mean_token_accuracy": 0.5542735267430544,
"num_tokens": 1221421.0,
"step": 7110
},
{
"epoch": 13.406588235294118,
"grad_norm": 0.695760190486908,
"learning_rate": 1.0819083176912446e-05,
"loss": 1.203,
"mean_token_accuracy": 0.5460193280130625,
"num_tokens": 1235153.0,
"step": 7120
},
{
"epoch": 13.425411764705883,
"grad_norm": 0.8482743501663208,
"learning_rate": 1.0798179945795996e-05,
"loss": 1.2084,
"mean_token_accuracy": 0.5523406885564327,
"num_tokens": 1248129.0,
"step": 7130
},
{
"epoch": 13.444235294117647,
"grad_norm": 0.7389087677001953,
"learning_rate": 1.0777273204116541e-05,
"loss": 1.1817,
"mean_token_accuracy": 0.5641430784016848,
"num_tokens": 1261697.0,
"step": 7140
},
{
"epoch": 13.463058823529412,
"grad_norm": 0.7677028179168701,
"learning_rate": 1.0756363043826328e-05,
"loss": 1.2031,
"mean_token_accuracy": 0.5505106158554554,
"num_tokens": 1274933.0,
"step": 7150
},
{
"epoch": 13.481882352941177,
"grad_norm": 0.8022538423538208,
"learning_rate": 1.0735449556892622e-05,
"loss": 1.1603,
"mean_token_accuracy": 0.5579750452190637,
"num_tokens": 1288707.0,
"step": 7160
},
{
"epoch": 13.500705882352941,
"grad_norm": 0.6393579840660095,
"learning_rate": 1.0714532835297344e-05,
"loss": 1.1945,
"mean_token_accuracy": 0.556298240274191,
"num_tokens": 1302937.0,
"step": 7170
},
{
"epoch": 13.519529411764706,
"grad_norm": 0.8472998142242432,
"learning_rate": 1.0693612971036616e-05,
"loss": 1.2097,
"mean_token_accuracy": 0.5355463117361069,
"num_tokens": 1316118.0,
"step": 7180
},
{
"epoch": 13.53835294117647,
"grad_norm": 0.5993279218673706,
"learning_rate": 1.0672690056120398e-05,
"loss": 1.1842,
"mean_token_accuracy": 0.5584869582206011,
"num_tokens": 1329672.0,
"step": 7190
},
{
"epoch": 13.557176470588235,
"grad_norm": 1.1063508987426758,
"learning_rate": 1.0651764182572063e-05,
"loss": 1.1652,
"mean_token_accuracy": 0.5537949342280626,
"num_tokens": 1342869.0,
"step": 7200
},
{
"epoch": 13.576,
"grad_norm": 0.9842997789382935,
"learning_rate": 1.0630835442428001e-05,
"loss": 1.2162,
"mean_token_accuracy": 0.5468976002186536,
"num_tokens": 1356452.0,
"step": 7210
},
{
"epoch": 13.594823529411764,
"grad_norm": 0.7008135914802551,
"learning_rate": 1.0609903927737196e-05,
"loss": 1.174,
"mean_token_accuracy": 0.5521068956702948,
"num_tokens": 1370184.0,
"step": 7220
},
{
"epoch": 13.61364705882353,
"grad_norm": 1.510910987854004,
"learning_rate": 1.0588969730560852e-05,
"loss": 1.2074,
"mean_token_accuracy": 0.5425486758351326,
"num_tokens": 1383485.0,
"step": 7230
},
{
"epoch": 13.632470588235295,
"grad_norm": 0.8757747411727905,
"learning_rate": 1.0568032942971962e-05,
"loss": 1.209,
"mean_token_accuracy": 0.5415149603039027,
"num_tokens": 1398031.0,
"step": 7240
},
{
"epoch": 13.651294117647058,
"grad_norm": 1.3053663969039917,
"learning_rate": 1.0547093657054914e-05,
"loss": 1.1542,
"mean_token_accuracy": 0.5658162008970976,
"num_tokens": 1410853.0,
"step": 7250
},
{
"epoch": 13.670117647058824,
"grad_norm": 1.395501971244812,
"learning_rate": 1.0526151964905085e-05,
"loss": 1.1775,
"mean_token_accuracy": 0.5551408220082521,
"num_tokens": 1423509.0,
"step": 7260
},
{
"epoch": 13.688941176470589,
"grad_norm": 1.4833544492721558,
"learning_rate": 1.0505207958628438e-05,
"loss": 1.0948,
"mean_token_accuracy": 0.5793862946331501,
"num_tokens": 1437175.0,
"step": 7270
},
{
"epoch": 13.707764705882353,
"grad_norm": 0.8894456624984741,
"learning_rate": 1.0484261730341101e-05,
"loss": 1.1577,
"mean_token_accuracy": 0.5599946200847625,
"num_tokens": 1449621.0,
"step": 7280
},
{
"epoch": 13.726588235294118,
"grad_norm": 0.9780417680740356,
"learning_rate": 1.0463313372168993e-05,
"loss": 1.2138,
"mean_token_accuracy": 0.5368953734636307,
"num_tokens": 1463044.0,
"step": 7290
},
{
"epoch": 13.745411764705882,
"grad_norm": 1.0154612064361572,
"learning_rate": 1.0442362976247384e-05,
"loss": 1.2187,
"mean_token_accuracy": 0.5392611972987652,
"num_tokens": 1476500.0,
"step": 7300
},
{
"epoch": 13.764235294117647,
"grad_norm": 0.8700633645057678,
"learning_rate": 1.0421410634720523e-05,
"loss": 1.1487,
"mean_token_accuracy": 0.5718079563230276,
"num_tokens": 1489072.0,
"step": 7310
},
{
"epoch": 13.783058823529412,
"grad_norm": 1.255520224571228,
"learning_rate": 1.0400456439741203e-05,
"loss": 1.1511,
"mean_token_accuracy": 0.5543713182210922,
"num_tokens": 1502885.0,
"step": 7320
},
{
"epoch": 13.801882352941176,
"grad_norm": 0.9495011568069458,
"learning_rate": 1.0379500483470373e-05,
"loss": 1.1583,
"mean_token_accuracy": 0.5580469910055399,
"num_tokens": 1515640.0,
"step": 7330
},
{
"epoch": 13.820705882352941,
"grad_norm": 0.7675338387489319,
"learning_rate": 1.035854285807673e-05,
"loss": 1.1739,
"mean_token_accuracy": 0.5525693718343974,
"num_tokens": 1529604.0,
"step": 7340
},
{
"epoch": 13.839529411764707,
"grad_norm": 1.0846037864685059,
"learning_rate": 1.0337583655736312e-05,
"loss": 1.2115,
"mean_token_accuracy": 0.5463377732783556,
"num_tokens": 1543205.0,
"step": 7350
},
{
"epoch": 13.85835294117647,
"grad_norm": 0.711812436580658,
"learning_rate": 1.0316622968632088e-05,
"loss": 1.2121,
"mean_token_accuracy": 0.5422242067754268,
"num_tokens": 1556017.0,
"step": 7360
},
{
"epoch": 13.877176470588235,
"grad_norm": 1.1142287254333496,
"learning_rate": 1.029566088895357e-05,
"loss": 1.1561,
"mean_token_accuracy": 0.5748393829911947,
"num_tokens": 1570116.0,
"step": 7370
},
{
"epoch": 13.896,
"grad_norm": 0.8056155443191528,
"learning_rate": 1.0274697508896372e-05,
"loss": 1.1411,
"mean_token_accuracy": 0.5808346830308437,
"num_tokens": 1582496.0,
"step": 7380
},
{
"epoch": 13.914823529411764,
"grad_norm": 1.1729984283447266,
"learning_rate": 1.0253732920661856e-05,
"loss": 1.1881,
"mean_token_accuracy": 0.5547745041549206,
"num_tokens": 1596198.0,
"step": 7390
},
{
"epoch": 13.93364705882353,
"grad_norm": 1.339120626449585,
"learning_rate": 1.0232767216456672e-05,
"loss": 1.1534,
"mean_token_accuracy": 0.5604519348591566,
"num_tokens": 1609177.0,
"step": 7400
},
{
"epoch": 13.952470588235293,
"grad_norm": 0.6790438294410706,
"learning_rate": 1.0211800488492401e-05,
"loss": 1.1662,
"mean_token_accuracy": 0.5518874824047089,
"num_tokens": 1622444.0,
"step": 7410
},
{
"epoch": 13.971294117647059,
"grad_norm": 1.8246535062789917,
"learning_rate": 1.01908328289851e-05,
"loss": 1.1406,
"mean_token_accuracy": 0.5649749383330345,
"num_tokens": 1635759.0,
"step": 7420
},
{
"epoch": 13.990117647058824,
"grad_norm": 1.3896183967590332,
"learning_rate": 1.0169864330154951e-05,
"loss": 1.1608,
"mean_token_accuracy": 0.5683747977018356,
"num_tokens": 1649592.0,
"step": 7430
},
{
"epoch": 14.007529411764706,
"grad_norm": 0.9073938131332397,
"learning_rate": 1.0148895084225807e-05,
"loss": 1.1792,
"mean_token_accuracy": 0.5546492849652832,
"num_tokens": 1661557.0,
"step": 7440
},
{
"epoch": 14.026352941176471,
"grad_norm": 0.770523190498352,
"learning_rate": 1.012792518342482e-05,
"loss": 1.1737,
"mean_token_accuracy": 0.5552055723965168,
"num_tokens": 1675410.0,
"step": 7450
},
{
"epoch": 14.045176470588235,
"grad_norm": 1.5216394662857056,
"learning_rate": 1.0106954719982014e-05,
"loss": 1.1718,
"mean_token_accuracy": 0.5549504559487104,
"num_tokens": 1689244.0,
"step": 7460
},
{
"epoch": 14.064,
"grad_norm": 0.9542890787124634,
"learning_rate": 1.0085983786129894e-05,
"loss": 1.1549,
"mean_token_accuracy": 0.5636588338762522,
"num_tokens": 1702377.0,
"step": 7470
},
{
"epoch": 14.082823529411765,
"grad_norm": 0.9031827449798584,
"learning_rate": 1.0065012474103027e-05,
"loss": 1.195,
"mean_token_accuracy": 0.5450264655053616,
"num_tokens": 1714838.0,
"step": 7480
},
{
"epoch": 14.101647058823529,
"grad_norm": 1.3046774864196777,
"learning_rate": 1.0044040876137647e-05,
"loss": 1.1586,
"mean_token_accuracy": 0.5547593403607607,
"num_tokens": 1728323.0,
"step": 7490
},
{
"epoch": 14.120470588235294,
"grad_norm": 0.7626868486404419,
"learning_rate": 1.0023069084471244e-05,
"loss": 1.193,
"mean_token_accuracy": 0.539740389585495,
"num_tokens": 1741948.0,
"step": 7500
},
{
"epoch": 14.13929411764706,
"grad_norm": 0.7623440027236938,
"learning_rate": 1.0002097191342167e-05,
"loss": 1.2001,
"mean_token_accuracy": 0.5466340240091085,
"num_tokens": 1754976.0,
"step": 7510
},
{
"epoch": 14.158117647058823,
"grad_norm": 0.5901451706886292,
"learning_rate": 9.981125288989197e-06,
"loss": 1.1834,
"mean_token_accuracy": 0.5578321043401957,
"num_tokens": 1769202.0,
"step": 7520
},
{
"epoch": 14.176941176470589,
"grad_norm": 1.7071183919906616,
"learning_rate": 9.960153469651173e-06,
"loss": 1.1479,
"mean_token_accuracy": 0.5668721627444029,
"num_tokens": 1782731.0,
"step": 7530
},
{
"epoch": 14.195764705882352,
"grad_norm": 0.84836345911026,
"learning_rate": 9.939181825566555e-06,
"loss": 1.1664,
"mean_token_accuracy": 0.5470958970487118,
"num_tokens": 1795962.0,
"step": 7540
},
{
"epoch": 14.214588235294118,
"grad_norm": 0.6625200510025024,
"learning_rate": 9.918210448973041e-06,
"loss": 1.167,
"mean_token_accuracy": 0.5696456581354141,
"num_tokens": 1809352.0,
"step": 7550
},
{
"epoch": 14.233411764705883,
"grad_norm": 1.065662145614624,
"learning_rate": 9.897239432107144e-06,
"loss": 1.2344,
"mean_token_accuracy": 0.5390154391527175,
"num_tokens": 1823306.0,
"step": 7560
},
{
"epoch": 14.252235294117646,
"grad_norm": 1.492067813873291,
"learning_rate": 9.876268867203803e-06,
"loss": 1.1505,
"mean_token_accuracy": 0.5786493707448244,
"num_tokens": 1836619.0,
"step": 7570
},
{
"epoch": 14.271058823529412,
"grad_norm": 0.8435829281806946,
"learning_rate": 9.855298846495964e-06,
"loss": 1.1985,
"mean_token_accuracy": 0.5462658539414406,
"num_tokens": 1850730.0,
"step": 7580
},
{
"epoch": 14.289882352941177,
"grad_norm": 0.8233511447906494,
"learning_rate": 9.834329462214186e-06,
"loss": 1.205,
"mean_token_accuracy": 0.5534395117312669,
"num_tokens": 1864080.0,
"step": 7590
},
{
"epoch": 14.30870588235294,
"grad_norm": 0.9905659556388855,
"learning_rate": 9.813360806586218e-06,
"loss": 1.223,
"mean_token_accuracy": 0.5520945586264133,
"num_tokens": 1877561.0,
"step": 7600
},
{
"epoch": 14.331294117647058,
"grad_norm": 0.8363655805587769,
"learning_rate": 9.792392971836614e-06,
"loss": 1.2066,
"mean_token_accuracy": 0.5393288742750884,
"num_tokens": 14227.0,
"step": 7610
},
{
"epoch": 14.350117647058823,
"grad_norm": 1.2111597061157227,
"learning_rate": 9.77142605018631e-06,
"loss": 1.1475,
"mean_token_accuracy": 0.5616458028554916,
"num_tokens": 26809.0,
"step": 7620
},
{
"epoch": 14.368941176470589,
"grad_norm": 1.3134557008743286,
"learning_rate": 9.750460133852234e-06,
"loss": 1.1937,
"mean_token_accuracy": 0.5489944957196713,
"num_tokens": 40285.0,
"step": 7630
},
{
"epoch": 14.387764705882352,
"grad_norm": 1.0377771854400635,
"learning_rate": 9.729495315046886e-06,
"loss": 1.1611,
"mean_token_accuracy": 0.5607926532626152,
"num_tokens": 53708.0,
"step": 7640
},
{
"epoch": 14.406588235294118,
"grad_norm": 0.5833343267440796,
"learning_rate": 9.708531685977945e-06,
"loss": 1.1491,
"mean_token_accuracy": 0.5668897565454245,
"num_tokens": 66990.0,
"step": 7650
},
{
"epoch": 14.425411764705883,
"grad_norm": 0.6283827424049377,
"learning_rate": 9.687569338847848e-06,
"loss": 1.1669,
"mean_token_accuracy": 0.5548735350370407,
"num_tokens": 80285.0,
"step": 7660
},
{
"epoch": 14.444235294117647,
"grad_norm": 1.9695147275924683,
"learning_rate": 9.666608365853405e-06,
"loss": 1.0941,
"mean_token_accuracy": 0.586016109958291,
"num_tokens": 94238.0,
"step": 7670
},
{
"epoch": 14.463058823529412,
"grad_norm": 0.762913703918457,
"learning_rate": 9.645648859185372e-06,
"loss": 1.241,
"mean_token_accuracy": 0.5349636357277632,
"num_tokens": 109038.0,
"step": 7680
},
{
"epoch": 14.481882352941177,
"grad_norm": 1.1660629510879517,
"learning_rate": 9.624690911028062e-06,
"loss": 1.2031,
"mean_token_accuracy": 0.5475806038826704,
"num_tokens": 122828.0,
"step": 7690
},
{
"epoch": 14.500705882352941,
"grad_norm": 0.7873107194900513,
"learning_rate": 9.603734613558933e-06,
"loss": 1.1112,
"mean_token_accuracy": 0.5692310906946659,
"num_tokens": 136009.0,
"step": 7700
},
{
"epoch": 14.519529411764706,
"grad_norm": 0.7429578900337219,
"learning_rate": 9.582780058948182e-06,
"loss": 1.2031,
"mean_token_accuracy": 0.5352565940469504,
"num_tokens": 148439.0,
"step": 7710
},
{
"epoch": 14.53835294117647,
"grad_norm": 0.9231197834014893,
"learning_rate": 9.56182733935834e-06,
"loss": 1.1313,
"mean_token_accuracy": 0.5766253888607025,
"num_tokens": 162003.0,
"step": 7720
},
{
"epoch": 14.557176470588235,
"grad_norm": 0.9091134071350098,
"learning_rate": 9.540876546943863e-06,
"loss": 1.1224,
"mean_token_accuracy": 0.565272556990385,
"num_tokens": 174711.0,
"step": 7730
},
{
"epoch": 14.576,
"grad_norm": 0.7547168731689453,
"learning_rate": 9.51992777385074e-06,
"loss": 1.1618,
"mean_token_accuracy": 0.5629419464617967,
"num_tokens": 188030.0,
"step": 7740
},
{
"epoch": 14.594823529411764,
"grad_norm": 0.7168717980384827,
"learning_rate": 9.498981112216073e-06,
"loss": 1.1665,
"mean_token_accuracy": 0.562981392070651,
"num_tokens": 201691.0,
"step": 7750
},
{
"epoch": 14.61364705882353,
"grad_norm": 1.3017321825027466,
"learning_rate": 9.478036654167673e-06,
"loss": 1.1908,
"mean_token_accuracy": 0.5513414010405541,
"num_tokens": 215908.0,
"step": 7760
},
{
"epoch": 14.632470588235295,
"grad_norm": 0.7235598564147949,
"learning_rate": 9.457094491823674e-06,
"loss": 1.1668,
"mean_token_accuracy": 0.5548354998230934,
"num_tokens": 229890.0,
"step": 7770
},
{
"epoch": 14.651294117647058,
"grad_norm": 1.014742136001587,
"learning_rate": 9.436154717292095e-06,
"loss": 1.1381,
"mean_token_accuracy": 0.5685541749000549,
"num_tokens": 242083.0,
"step": 7780
},
{
"epoch": 14.670117647058824,
"grad_norm": 1.7762614488601685,
"learning_rate": 9.415217422670465e-06,
"loss": 1.2049,
"mean_token_accuracy": 0.5436010017991066,
"num_tokens": 255157.0,
"step": 7790
},
{
"epoch": 14.688941176470589,
"grad_norm": 1.2956479787826538,
"learning_rate": 9.3942827000454e-06,
"loss": 1.1581,
"mean_token_accuracy": 0.5617427326738834,
"num_tokens": 268163.0,
"step": 7800
},
{
"epoch": 14.707764705882353,
"grad_norm": 1.3141613006591797,
"learning_rate": 9.37335064149221e-06,
"loss": 1.2012,
"mean_token_accuracy": 0.5483662486076355,
"num_tokens": 281261.0,
"step": 7810
},
{
"epoch": 14.726588235294118,
"grad_norm": 0.9441389441490173,
"learning_rate": 9.352421339074481e-06,
"loss": 1.1539,
"mean_token_accuracy": 0.5657226879149675,
"num_tokens": 295835.0,
"step": 7820
},
{
"epoch": 14.745411764705882,
"grad_norm": 0.6852191686630249,
"learning_rate": 9.331494884843682e-06,
"loss": 1.1109,
"mean_token_accuracy": 0.5754560541361571,
"num_tokens": 308216.0,
"step": 7830
},
{
"epoch": 14.764235294117647,
"grad_norm": 1.1635503768920898,
"learning_rate": 9.310571370838747e-06,
"loss": 1.1794,
"mean_token_accuracy": 0.5661456611007452,
"num_tokens": 321205.0,
"step": 7840
},
{
"epoch": 14.783058823529412,
"grad_norm": 0.6046936511993408,
"learning_rate": 9.28965088908569e-06,
"loss": 1.1862,
"mean_token_accuracy": 0.5497476685792207,
"num_tokens": 336086.0,
"step": 7850
},
{
"epoch": 14.801882352941176,
"grad_norm": 1.6876798868179321,
"learning_rate": 9.268733531597185e-06,
"loss": 1.149,
"mean_token_accuracy": 0.5605622876435519,
"num_tokens": 348960.0,
"step": 7860
},
{
"epoch": 14.820705882352941,
"grad_norm": 0.6555068492889404,
"learning_rate": 9.24781939037215e-06,
"loss": 1.1488,
"mean_token_accuracy": 0.562832348421216,
"num_tokens": 361287.0,
"step": 7870
},
{
"epoch": 14.839529411764707,
"grad_norm": 1.4029412269592285,
"learning_rate": 9.226908557395384e-06,
"loss": 1.1503,
"mean_token_accuracy": 0.5643013104796409,
"num_tokens": 373901.0,
"step": 7880
},
{
"epoch": 14.85835294117647,
"grad_norm": 0.6774556636810303,
"learning_rate": 9.206001124637113e-06,
"loss": 1.118,
"mean_token_accuracy": 0.5655334409326315,
"num_tokens": 388185.0,
"step": 7890
},
{
"epoch": 14.877176470588235,
"grad_norm": 1.3459422588348389,
"learning_rate": 9.185097184052615e-06,
"loss": 1.2017,
"mean_token_accuracy": 0.547975680232048,
"num_tokens": 402096.0,
"step": 7900
},
{
"epoch": 14.896,
"grad_norm": 1.6914349794387817,
"learning_rate": 9.164196827581817e-06,
"loss": 1.1513,
"mean_token_accuracy": 0.5657749876379967,
"num_tokens": 415109.0,
"step": 7910
},
{
"epoch": 14.914823529411764,
"grad_norm": 0.5575766563415527,
"learning_rate": 9.143300147148869e-06,
"loss": 1.1707,
"mean_token_accuracy": 0.5562089808285237,
"num_tokens": 428081.0,
"step": 7920
},
{
"epoch": 14.93364705882353,
"grad_norm": 1.4414509534835815,
"learning_rate": 9.122407234661764e-06,
"loss": 1.225,
"mean_token_accuracy": 0.5390350338071584,
"num_tokens": 441704.0,
"step": 7930
},
{
"epoch": 14.952470588235293,
"grad_norm": 1.122767448425293,
"learning_rate": 9.101518182011914e-06,
"loss": 1.1717,
"mean_token_accuracy": 0.5577954012900591,
"num_tokens": 454263.0,
"step": 7940
},
{
"epoch": 14.971294117647059,
"grad_norm": 0.7421872615814209,
"learning_rate": 9.080633081073763e-06,
"loss": 1.1535,
"mean_token_accuracy": 0.5636604465544224,
"num_tokens": 467572.0,
"step": 7950
},
{
"epoch": 14.990117647058824,
"grad_norm": 0.7315053939819336,
"learning_rate": 9.059752023704367e-06,
"loss": 1.1306,
"mean_token_accuracy": 0.5692378722131253,
"num_tokens": 481113.0,
"step": 7960
},
{
"epoch": 15.009411764705883,
"grad_norm": 0.8332544565200806,
"learning_rate": 9.038875101743003e-06,
"loss": 1.2939,
"mean_token_accuracy": 0.5470707878106977,
"num_tokens": 494860.0,
"step": 7970
},
{
"epoch": 15.028235294117646,
"grad_norm": 0.7542789578437805,
"learning_rate": 9.018002407010755e-06,
"loss": 1.2465,
"mean_token_accuracy": 0.5324202172458172,
"num_tokens": 509027.0,
"step": 7980
},
{
"epoch": 15.047058823529412,
"grad_norm": 0.6684849858283997,
"learning_rate": 8.997134031310123e-06,
"loss": 1.1783,
"mean_token_accuracy": 0.5569613084197045,
"num_tokens": 522094.0,
"step": 7990
},
{
"epoch": 15.065882352941177,
"grad_norm": 0.860490620136261,
"learning_rate": 8.976270066424602e-06,
"loss": 1.1533,
"mean_token_accuracy": 0.5639694180339575,
"num_tokens": 535686.0,
"step": 8000
},
{
"epoch": 15.08470588235294,
"grad_norm": 1.2201600074768066,
"learning_rate": 8.955410604118287e-06,
"loss": 1.1514,
"mean_token_accuracy": 0.5624887187033891,
"num_tokens": 547438.0,
"step": 8010
},
{
"epoch": 15.103529411764706,
"grad_norm": 0.7185003757476807,
"learning_rate": 8.934555736135475e-06,
"loss": 1.1916,
"mean_token_accuracy": 0.5531281687319278,
"num_tokens": 562299.0,
"step": 8020
},
{
"epoch": 15.122352941176471,
"grad_norm": 0.9599730372428894,
"learning_rate": 8.913705554200257e-06,
"loss": 1.2061,
"mean_token_accuracy": 0.5452544983476401,
"num_tokens": 575630.0,
"step": 8030
},
{
"epoch": 15.141176470588235,
"grad_norm": 1.0057822465896606,
"learning_rate": 8.892860150016108e-06,
"loss": 1.1375,
"mean_token_accuracy": 0.5605553191155195,
"num_tokens": 588679.0,
"step": 8040
},
{
"epoch": 15.16,
"grad_norm": 0.7954553961753845,
"learning_rate": 8.872019615265494e-06,
"loss": 1.164,
"mean_token_accuracy": 0.5492696654051542,
"num_tokens": 601254.0,
"step": 8050
},
{
"epoch": 15.178823529411765,
"grad_norm": 0.6151052713394165,
"learning_rate": 8.851184041609464e-06,
"loss": 1.2193,
"mean_token_accuracy": 0.5489672936499119,
"num_tokens": 614369.0,
"step": 8060
},
{
"epoch": 15.197647058823529,
"grad_norm": 0.8546686768531799,
"learning_rate": 8.830353520687245e-06,
"loss": 1.1938,
"mean_token_accuracy": 0.5609996184706688,
"num_tokens": 628665.0,
"step": 8070
},
{
"epoch": 15.216470588235294,
"grad_norm": 0.6473791003227234,
"learning_rate": 8.809528144115842e-06,
"loss": 1.1396,
"mean_token_accuracy": 0.5705398332327605,
"num_tokens": 643134.0,
"step": 8080
},
{
"epoch": 15.235294117647058,
"grad_norm": 1.5007941722869873,
"learning_rate": 8.788708003489636e-06,
"loss": 1.1456,
"mean_token_accuracy": 0.5757338788360358,
"num_tokens": 655759.0,
"step": 8090
},
{
"epoch": 15.254117647058823,
"grad_norm": 0.6832001209259033,
"learning_rate": 8.767893190379974e-06,
"loss": 1.1718,
"mean_token_accuracy": 0.5538515329360962,
"num_tokens": 669100.0,
"step": 8100
},
{
"epoch": 15.272941176470589,
"grad_norm": 0.7107806205749512,
"learning_rate": 8.747083796334776e-06,
"loss": 1.1645,
"mean_token_accuracy": 0.5559101283550263,
"num_tokens": 683005.0,
"step": 8110
},
{
"epoch": 15.291764705882352,
"grad_norm": 0.6784150004386902,
"learning_rate": 8.726279912878126e-06,
"loss": 1.1888,
"mean_token_accuracy": 0.5452313166111707,
"num_tokens": 696921.0,
"step": 8120
},
{
"epoch": 15.310588235294118,
"grad_norm": 1.1530226469039917,
"learning_rate": 8.705481631509876e-06,
"loss": 1.1809,
"mean_token_accuracy": 0.561325515806675,
"num_tokens": 710780.0,
"step": 8130
},
{
"epoch": 15.329411764705883,
"grad_norm": 0.6158255934715271,
"learning_rate": 8.684689043705231e-06,
"loss": 1.1597,
"mean_token_accuracy": 0.5495276678353548,
"num_tokens": 724228.0,
"step": 8140
},
{
"epoch": 15.348235294117647,
"grad_norm": 0.6823238730430603,
"learning_rate": 8.663902240914357e-06,
"loss": 1.1347,
"mean_token_accuracy": 0.568655128031969,
"num_tokens": 737696.0,
"step": 8150
},
{
"epoch": 15.367058823529412,
"grad_norm": 0.6831355690956116,
"learning_rate": 8.643121314561976e-06,
"loss": 1.1547,
"mean_token_accuracy": 0.5554495759308338,
"num_tokens": 751387.0,
"step": 8160
},
{
"epoch": 15.385882352941177,
"grad_norm": 0.7401413321495056,
"learning_rate": 8.622346356046972e-06,
"loss": 1.1248,
"mean_token_accuracy": 0.5787703268229961,
"num_tokens": 764116.0,
"step": 8170
},
{
"epoch": 15.40470588235294,
"grad_norm": 1.320250391960144,
"learning_rate": 8.601577456741967e-06,
"loss": 1.1582,
"mean_token_accuracy": 0.5558317702263593,
"num_tokens": 777306.0,
"step": 8180
},
{
"epoch": 15.423529411764706,
"grad_norm": 1.5581915378570557,
"learning_rate": 8.580814707992949e-06,
"loss": 1.1475,
"mean_token_accuracy": 0.5548811592161655,
"num_tokens": 790684.0,
"step": 8190
},
{
"epoch": 15.44235294117647,
"grad_norm": 0.6925750970840454,
"learning_rate": 8.560058201118842e-06,
"loss": 1.19,
"mean_token_accuracy": 0.5531508523970843,
"num_tokens": 804244.0,
"step": 8200
},
{
"epoch": 15.461176470588235,
"grad_norm": 1.275739073753357,
"learning_rate": 8.539308027411123e-06,
"loss": 1.1916,
"mean_token_accuracy": 0.5411147933453322,
"num_tokens": 818070.0,
"step": 8210
},
{
"epoch": 15.48,
"grad_norm": 1.308816909790039,
"learning_rate": 8.51856427813341e-06,
"loss": 1.1389,
"mean_token_accuracy": 0.5609162572771311,
"num_tokens": 831998.0,
"step": 8220
},
{
"epoch": 15.498823529411764,
"grad_norm": 1.4086875915527344,
"learning_rate": 8.497827044521074e-06,
"loss": 1.1806,
"mean_token_accuracy": 0.5614006619900465,
"num_tokens": 846346.0,
"step": 8230
},
{
"epoch": 15.51764705882353,
"grad_norm": 1.5172885656356812,
"learning_rate": 8.477096417780818e-06,
"loss": 1.1423,
"mean_token_accuracy": 0.5598192039877177,
"num_tokens": 859606.0,
"step": 8240
},
{
"epoch": 15.536470588235295,
"grad_norm": 0.635094404220581,
"learning_rate": 8.456372489090294e-06,
"loss": 1.195,
"mean_token_accuracy": 0.5515242625027895,
"num_tokens": 872352.0,
"step": 8250
},
{
"epoch": 15.555294117647058,
"grad_norm": 1.049091100692749,
"learning_rate": 8.43565534959769e-06,
"loss": 1.1708,
"mean_token_accuracy": 0.5665956649929285,
"num_tokens": 886434.0,
"step": 8260
},
{
"epoch": 15.574117647058824,
"grad_norm": 0.6422486901283264,
"learning_rate": 8.414945090421337e-06,
"loss": 1.1199,
"mean_token_accuracy": 0.5681115534156561,
"num_tokens": 899435.0,
"step": 8270
},
{
"epoch": 15.592941176470589,
"grad_norm": 1.1809626817703247,
"learning_rate": 8.394241802649307e-06,
"loss": 1.1553,
"mean_token_accuracy": 0.5631851524114608,
"num_tokens": 912350.0,
"step": 8280
},
{
"epoch": 15.611764705882353,
"grad_norm": 0.8651266694068909,
"learning_rate": 8.373545577339002e-06,
"loss": 1.1419,
"mean_token_accuracy": 0.5651818908751011,
"num_tokens": 925480.0,
"step": 8290
},
{
"epoch": 15.630588235294118,
"grad_norm": 0.7373852729797363,
"learning_rate": 8.352856505516765e-06,
"loss": 1.1959,
"mean_token_accuracy": 0.5542501173913479,
"num_tokens": 938863.0,
"step": 8300
},
{
"epoch": 15.649411764705881,
"grad_norm": 1.3229117393493652,
"learning_rate": 8.33217467817748e-06,
"loss": 1.1445,
"mean_token_accuracy": 0.5683686885982752,
"num_tokens": 952271.0,
"step": 8310
},
{
"epoch": 15.668235294117647,
"grad_norm": 0.9950488805770874,
"learning_rate": 8.311500186284166e-06,
"loss": 1.1469,
"mean_token_accuracy": 0.5671338357031346,
"num_tokens": 966154.0,
"step": 8320
},
{
"epoch": 15.687058823529412,
"grad_norm": 1.0574768781661987,
"learning_rate": 8.290833120767585e-06,
"loss": 1.1745,
"mean_token_accuracy": 0.5554843176156282,
"num_tokens": 978470.0,
"step": 8330
},
{
"epoch": 15.705882352941176,
"grad_norm": 0.9360321760177612,
"learning_rate": 8.270173572525824e-06,
"loss": 1.1932,
"mean_token_accuracy": 0.5552597276866436,
"num_tokens": 992540.0,
"step": 8340
},
{
"epoch": 15.724705882352941,
"grad_norm": 1.4928091764450073,
"learning_rate": 8.249521632423918e-06,
"loss": 1.1648,
"mean_token_accuracy": 0.5653361968696118,
"num_tokens": 1006353.0,
"step": 8350
},
{
"epoch": 15.743529411764706,
"grad_norm": 0.7848607897758484,
"learning_rate": 8.228877391293432e-06,
"loss": 1.1971,
"mean_token_accuracy": 0.5555432129651308,
"num_tokens": 1020378.0,
"step": 8360
},
{
"epoch": 15.76235294117647,
"grad_norm": 0.813613772392273,
"learning_rate": 8.20824093993208e-06,
"loss": 1.1464,
"mean_token_accuracy": 0.5570184625685215,
"num_tokens": 1032073.0,
"step": 8370
},
{
"epoch": 15.781176470588235,
"grad_norm": 0.6320846080780029,
"learning_rate": 8.1876123691033e-06,
"loss": 1.1486,
"mean_token_accuracy": 0.5596757929772138,
"num_tokens": 1045292.0,
"step": 8380
},
{
"epoch": 15.8,
"grad_norm": 1.1251544952392578,
"learning_rate": 8.166991769535886e-06,
"loss": 1.1581,
"mean_token_accuracy": 0.5615175377577544,
"num_tokens": 1058620.0,
"step": 8390
},
{
"epoch": 15.818823529411764,
"grad_norm": 1.5427863597869873,
"learning_rate": 8.146379231923558e-06,
"loss": 1.2204,
"mean_token_accuracy": 0.5381950225681067,
"num_tokens": 1072099.0,
"step": 8400
},
{
"epoch": 15.83764705882353,
"grad_norm": 0.9844633936882019,
"learning_rate": 8.12577484692459e-06,
"loss": 1.1673,
"mean_token_accuracy": 0.554484510794282,
"num_tokens": 1085409.0,
"step": 8410
},
{
"epoch": 15.856470588235293,
"grad_norm": 1.1419299840927124,
"learning_rate": 8.105178705161395e-06,
"loss": 1.1713,
"mean_token_accuracy": 0.5534321576356888,
"num_tokens": 1098803.0,
"step": 8420
},
{
"epoch": 15.875294117647059,
"grad_norm": 0.8007948994636536,
"learning_rate": 8.084590897220122e-06,
"loss": 1.1394,
"mean_token_accuracy": 0.562013290822506,
"num_tokens": 1111518.0,
"step": 8430
},
{
"epoch": 15.894117647058824,
"grad_norm": 0.7455958724021912,
"learning_rate": 8.064011513650276e-06,
"loss": 1.1577,
"mean_token_accuracy": 0.5672158092260361,
"num_tokens": 1126619.0,
"step": 8440
},
{
"epoch": 15.912941176470587,
"grad_norm": 1.5687180757522583,
"learning_rate": 8.04344064496431e-06,
"loss": 1.1668,
"mean_token_accuracy": 0.5503279969096184,
"num_tokens": 1139657.0,
"step": 8450
},
{
"epoch": 15.931764705882353,
"grad_norm": 0.9860045909881592,
"learning_rate": 8.022878381637219e-06,
"loss": 1.1937,
"mean_token_accuracy": 0.54759371727705,
"num_tokens": 1153370.0,
"step": 8460
},
{
"epoch": 15.950588235294118,
"grad_norm": 1.7666656970977783,
"learning_rate": 8.002324814106161e-06,
"loss": 1.2289,
"mean_token_accuracy": 0.5447251949459314,
"num_tokens": 1166420.0,
"step": 8470
},
{
"epoch": 15.969411764705882,
"grad_norm": 1.6037918329238892,
"learning_rate": 7.981780032770035e-06,
"loss": 1.1054,
"mean_token_accuracy": 0.5815329641103745,
"num_tokens": 1179026.0,
"step": 8480
},
{
"epoch": 15.988235294117647,
"grad_norm": 1.0090439319610596,
"learning_rate": 7.961244127989112e-06,
"loss": 1.181,
"mean_token_accuracy": 0.5504204016178846,
"num_tokens": 1192954.0,
"step": 8490
},
{
"epoch": 16.00564705882353,
"grad_norm": 1.271785855293274,
"learning_rate": 7.940717190084603e-06,
"loss": 1.2075,
"mean_token_accuracy": 0.5598280276801135,
"num_tokens": 1206256.0,
"step": 8500
},
{
"epoch": 16.024470588235292,
"grad_norm": 1.0442641973495483,
"learning_rate": 7.9201993093383e-06,
"loss": 1.1703,
"mean_token_accuracy": 0.5635019179433585,
"num_tokens": 1219916.0,
"step": 8510
},
{
"epoch": 16.043294117647058,
"grad_norm": 0.7755882143974304,
"learning_rate": 7.899690575992144e-06,
"loss": 1.2,
"mean_token_accuracy": 0.5463937662541867,
"num_tokens": 1233382.0,
"step": 8520
},
{
"epoch": 16.062117647058823,
"grad_norm": 0.8577190041542053,
"learning_rate": 7.879191080247857e-06,
"loss": 1.1861,
"mean_token_accuracy": 0.5470962207764387,
"num_tokens": 1248208.0,
"step": 8530
},
{
"epoch": 16.08094117647059,
"grad_norm": 0.7220326662063599,
"learning_rate": 7.85870091226652e-06,
"loss": 1.2134,
"mean_token_accuracy": 0.5384833466261625,
"num_tokens": 1262112.0,
"step": 8540
},
{
"epoch": 16.099764705882354,
"grad_norm": 0.8119881749153137,
"learning_rate": 7.838220162168199e-06,
"loss": 1.1882,
"mean_token_accuracy": 0.5573807552456855,
"num_tokens": 1274841.0,
"step": 8550
},
{
"epoch": 16.11858823529412,
"grad_norm": 0.7537893056869507,
"learning_rate": 7.817748920031533e-06,
"loss": 1.1632,
"mean_token_accuracy": 0.5611206289380789,
"num_tokens": 1289035.0,
"step": 8560
},
{
"epoch": 16.13741176470588,
"grad_norm": 1.9209939241409302,
"learning_rate": 7.797287275893339e-06,
"loss": 1.1986,
"mean_token_accuracy": 0.5489944905042649,
"num_tokens": 1302422.0,
"step": 8570
},
{
"epoch": 16.156235294117646,
"grad_norm": 0.7403332591056824,
"learning_rate": 7.776835319748226e-06,
"loss": 1.1926,
"mean_token_accuracy": 0.544366030395031,
"num_tokens": 1316218.0,
"step": 8580
},
{
"epoch": 16.17505882352941,
"grad_norm": 0.7108265161514282,
"learning_rate": 7.756393141548196e-06,
"loss": 1.188,
"mean_token_accuracy": 0.5483727026730776,
"num_tokens": 1329307.0,
"step": 8590
},
{
"epoch": 16.193882352941177,
"grad_norm": 0.9194144606590271,
"learning_rate": 7.735960831202233e-06,
"loss": 1.1304,
"mean_token_accuracy": 0.5695245500653983,
"num_tokens": 1341639.0,
"step": 8600
},
{
"epoch": 16.212705882352942,
"grad_norm": 0.9176170825958252,
"learning_rate": 7.715538478575938e-06,
"loss": 1.1746,
"mean_token_accuracy": 0.5415426712483168,
"num_tokens": 1355385.0,
"step": 8610
},
{
"epoch": 16.231529411764704,
"grad_norm": 0.9545559883117676,
"learning_rate": 7.695126173491096e-06,
"loss": 1.1516,
"mean_token_accuracy": 0.5630953580141067,
"num_tokens": 1369295.0,
"step": 8620
},
{
"epoch": 16.25035294117647,
"grad_norm": 0.5547646284103394,
"learning_rate": 7.67472400572532e-06,
"loss": 1.1848,
"mean_token_accuracy": 0.5485550325363875,
"num_tokens": 1382440.0,
"step": 8630
},
{
"epoch": 16.269176470588235,
"grad_norm": 1.0427489280700684,
"learning_rate": 7.65433206501162e-06,
"loss": 1.1646,
"mean_token_accuracy": 0.5695793781429529,
"num_tokens": 1396411.0,
"step": 8640
},
{
"epoch": 16.288,
"grad_norm": 1.417345643043518,
"learning_rate": 7.633950441038041e-06,
"loss": 1.1358,
"mean_token_accuracy": 0.5730958338826895,
"num_tokens": 1409619.0,
"step": 8650
},
{
"epoch": 16.306823529411766,
"grad_norm": 0.6368844509124756,
"learning_rate": 7.613579223447238e-06,
"loss": 1.1379,
"mean_token_accuracy": 0.5624699790030718,
"num_tokens": 1422290.0,
"step": 8660
},
{
"epoch": 16.32564705882353,
"grad_norm": 0.7046752572059631,
"learning_rate": 7.593218501836108e-06,
"loss": 1.1817,
"mean_token_accuracy": 0.5441745646297932,
"num_tokens": 1435116.0,
"step": 8670
},
{
"epoch": 16.344470588235293,
"grad_norm": 0.9974550604820251,
"learning_rate": 7.572868365755377e-06,
"loss": 1.1776,
"mean_token_accuracy": 0.5540152471512556,
"num_tokens": 1449169.0,
"step": 8680
},
{
"epoch": 16.363294117647058,
"grad_norm": 0.9207789897918701,
"learning_rate": 7.552528904709224e-06,
"loss": 1.1139,
"mean_token_accuracy": 0.5721325032413006,
"num_tokens": 1461392.0,
"step": 8690
},
{
"epoch": 16.382117647058823,
"grad_norm": 0.7715643644332886,
"learning_rate": 7.532200208154856e-06,
"loss": 1.1484,
"mean_token_accuracy": 0.5646240394562483,
"num_tokens": 1475085.0,
"step": 8700
},
{
"epoch": 16.40094117647059,
"grad_norm": 0.6167107224464417,
"learning_rate": 7.511882365502161e-06,
"loss": 1.113,
"mean_token_accuracy": 0.568938347697258,
"num_tokens": 1488403.0,
"step": 8710
},
{
"epoch": 16.419764705882354,
"grad_norm": 1.0627272129058838,
"learning_rate": 7.491575466113269e-06,
"loss": 1.1889,
"mean_token_accuracy": 0.5542673517018557,
"num_tokens": 1501007.0,
"step": 8720
},
{
"epoch": 16.438588235294116,
"grad_norm": 0.8951665759086609,
"learning_rate": 7.4712795993021936e-06,
"loss": 1.1568,
"mean_token_accuracy": 0.5628596622496843,
"num_tokens": 1513501.0,
"step": 8730
},
{
"epoch": 16.45741176470588,
"grad_norm": 1.0367953777313232,
"learning_rate": 7.450994854334414e-06,
"loss": 1.1873,
"mean_token_accuracy": 0.5458543870598078,
"num_tokens": 1526907.0,
"step": 8740
},
{
"epoch": 16.476235294117647,
"grad_norm": 0.9993324875831604,
"learning_rate": 7.430721320426502e-06,
"loss": 1.1875,
"mean_token_accuracy": 0.5544692728668451,
"num_tokens": 1540310.0,
"step": 8750
},
{
"epoch": 16.495058823529412,
"grad_norm": 0.7184901237487793,
"learning_rate": 7.410459086745715e-06,
"loss": 1.1688,
"mean_token_accuracy": 0.5600904107093811,
"num_tokens": 1554564.0,
"step": 8760
},
{
"epoch": 16.513882352941177,
"grad_norm": 1.4242663383483887,
"learning_rate": 7.390208242409611e-06,
"loss": 1.1422,
"mean_token_accuracy": 0.5547851927578449,
"num_tokens": 1568019.0,
"step": 8770
},
{
"epoch": 16.532705882352943,
"grad_norm": 0.606593906879425,
"learning_rate": 7.3699688764856556e-06,
"loss": 1.1774,
"mean_token_accuracy": 0.5609697885811329,
"num_tokens": 1581881.0,
"step": 8780
},
{
"epoch": 16.551529411764704,
"grad_norm": 0.9548640847206116,
"learning_rate": 7.349741077990833e-06,
"loss": 1.1215,
"mean_token_accuracy": 0.5721657130867243,
"num_tokens": 1594281.0,
"step": 8790
},
{
"epoch": 16.57035294117647,
"grad_norm": 1.281101942062378,
"learning_rate": 7.3295249358912415e-06,
"loss": 1.1452,
"mean_token_accuracy": 0.5627951502799988,
"num_tokens": 1607907.0,
"step": 8800
},
{
"epoch": 16.591058823529412,
"grad_norm": 1.2261985540390015,
"learning_rate": 7.3093205391017275e-06,
"loss": 1.1948,
"mean_token_accuracy": 0.5499283254146576,
"num_tokens": 13007.0,
"step": 8810
},
{
"epoch": 16.609882352941177,
"grad_norm": 0.959309458732605,
"learning_rate": 7.289127976485462e-06,
"loss": 1.1569,
"mean_token_accuracy": 0.5632215116173029,
"num_tokens": 27121.0,
"step": 8820
},
{
"epoch": 16.628705882352943,
"grad_norm": 0.8404517769813538,
"learning_rate": 7.268947336853588e-06,
"loss": 1.2085,
"mean_token_accuracy": 0.5531386416405439,
"num_tokens": 40179.0,
"step": 8830
},
{
"epoch": 16.647529411764705,
"grad_norm": 1.2052935361862183,
"learning_rate": 7.248778708964781e-06,
"loss": 1.1325,
"mean_token_accuracy": 0.5616716485470533,
"num_tokens": 52387.0,
"step": 8840
},
{
"epoch": 16.66635294117647,
"grad_norm": 1.1935890913009644,
"learning_rate": 7.228622181524909e-06,
"loss": 1.1662,
"mean_token_accuracy": 0.5652685184031725,
"num_tokens": 66527.0,
"step": 8850
},
{
"epoch": 16.685176470588235,
"grad_norm": 1.4688079357147217,
"learning_rate": 7.20847784318661e-06,
"loss": 1.1733,
"mean_token_accuracy": 0.5583682101219892,
"num_tokens": 79512.0,
"step": 8860
},
{
"epoch": 16.704,
"grad_norm": 0.9977661967277527,
"learning_rate": 7.188345782548918e-06,
"loss": 1.1196,
"mean_token_accuracy": 0.5758439347147941,
"num_tokens": 92443.0,
"step": 8870
},
{
"epoch": 16.722823529411766,
"grad_norm": 1.6382378339767456,
"learning_rate": 7.168226088156858e-06,
"loss": 1.1558,
"mean_token_accuracy": 0.563961322978139,
"num_tokens": 107011.0,
"step": 8880
},
{
"epoch": 16.741647058823528,
"grad_norm": 0.7158175110816956,
"learning_rate": 7.148118848501073e-06,
"loss": 1.2003,
"mean_token_accuracy": 0.5421418201178312,
"num_tokens": 120340.0,
"step": 8890
},
{
"epoch": 16.760470588235293,
"grad_norm": 0.7682539224624634,
"learning_rate": 7.128024152017426e-06,
"loss": 1.1337,
"mean_token_accuracy": 0.5682530965656042,
"num_tokens": 133870.0,
"step": 8900
},
{
"epoch": 16.77929411764706,
"grad_norm": 1.2490408420562744,
"learning_rate": 7.10794208708661e-06,
"loss": 1.1464,
"mean_token_accuracy": 0.5654782570898533,
"num_tokens": 147737.0,
"step": 8910
},
{
"epoch": 16.798117647058824,
"grad_norm": 1.0072635412216187,
"learning_rate": 7.087872742033761e-06,
"loss": 1.1675,
"mean_token_accuracy": 0.5675601534545421,
"num_tokens": 160861.0,
"step": 8920
},
{
"epoch": 16.81694117647059,
"grad_norm": 0.9989560842514038,
"learning_rate": 7.0678162051280796e-06,
"loss": 1.1504,
"mean_token_accuracy": 0.5777845904231071,
"num_tokens": 173818.0,
"step": 8930
},
{
"epoch": 16.835764705882355,
"grad_norm": 0.7746507525444031,
"learning_rate": 7.04777256458242e-06,
"loss": 1.2331,
"mean_token_accuracy": 0.5357637394219636,
"num_tokens": 187606.0,
"step": 8940
},
{
"epoch": 16.854588235294116,
"grad_norm": 0.5496880412101746,
"learning_rate": 7.0277419085529275e-06,
"loss": 1.1534,
"mean_token_accuracy": 0.5625104811042547,
"num_tokens": 200788.0,
"step": 8950
},
{
"epoch": 16.87341176470588,
"grad_norm": 0.7524011731147766,
"learning_rate": 7.007724325138626e-06,
"loss": 1.1731,
"mean_token_accuracy": 0.5571359943598508,
"num_tokens": 214193.0,
"step": 8960
},
{
"epoch": 16.892235294117647,
"grad_norm": 1.898985743522644,
"learning_rate": 6.987719902381063e-06,
"loss": 1.1823,
"mean_token_accuracy": 0.546281049400568,
"num_tokens": 227004.0,
"step": 8970
},
{
"epoch": 16.911058823529412,
"grad_norm": 1.2188752889633179,
"learning_rate": 6.967728728263875e-06,
"loss": 1.2082,
"mean_token_accuracy": 0.5488316975533962,
"num_tokens": 240725.0,
"step": 8980
},
{
"epoch": 16.929882352941178,
"grad_norm": 1.4341834783554077,
"learning_rate": 6.947750890712452e-06,
"loss": 1.1383,
"mean_token_accuracy": 0.566087681055069,
"num_tokens": 255280.0,
"step": 8990
},
{
"epoch": 16.94870588235294,
"grad_norm": 1.4695709943771362,
"learning_rate": 6.927786477593517e-06,
"loss": 1.1297,
"mean_token_accuracy": 0.571322912350297,
"num_tokens": 268707.0,
"step": 9000
},
{
"epoch": 16.967529411764705,
"grad_norm": 1.0631098747253418,
"learning_rate": 6.907835576714752e-06,
"loss": 1.1401,
"mean_token_accuracy": 0.5591850385069848,
"num_tokens": 282374.0,
"step": 9010
},
{
"epoch": 16.98635294117647,
"grad_norm": 0.7683926820755005,
"learning_rate": 6.887898275824405e-06,
"loss": 1.1538,
"mean_token_accuracy": 0.5545760612934828,
"num_tokens": 295895.0,
"step": 9020
},
{
"epoch": 17.00564705882353,
"grad_norm": 0.5843003392219543,
"learning_rate": 6.8679746626109165e-06,
"loss": 1.304,
"mean_token_accuracy": 0.5468519330024719,
"num_tokens": 309096.0,
"step": 9030
},
{
"epoch": 17.024470588235292,
"grad_norm": 0.5110841393470764,
"learning_rate": 6.848064824702518e-06,
"loss": 1.1689,
"mean_token_accuracy": 0.5539047036319971,
"num_tokens": 322843.0,
"step": 9040
},
{
"epoch": 17.043294117647058,
"grad_norm": 0.681012749671936,
"learning_rate": 6.828168849666859e-06,
"loss": 1.1473,
"mean_token_accuracy": 0.5699756104499102,
"num_tokens": 335834.0,
"step": 9050
},
{
"epoch": 17.062117647058823,
"grad_norm": 0.6035940647125244,
"learning_rate": 6.808286825010611e-06,
"loss": 1.1957,
"mean_token_accuracy": 0.5480252616107464,
"num_tokens": 349415.0,
"step": 9060
},
{
"epoch": 17.08094117647059,
"grad_norm": 0.793001651763916,
"learning_rate": 6.788418838179101e-06,
"loss": 1.1495,
"mean_token_accuracy": 0.5695446979254484,
"num_tokens": 362782.0,
"step": 9070
},
{
"epoch": 17.099764705882354,
"grad_norm": 0.5845211148262024,
"learning_rate": 6.768564976555898e-06,
"loss": 1.2018,
"mean_token_accuracy": 0.5484800077974796,
"num_tokens": 375606.0,
"step": 9080
},
{
"epoch": 17.11858823529412,
"grad_norm": 0.7158066630363464,
"learning_rate": 6.748725327462462e-06,
"loss": 1.1601,
"mean_token_accuracy": 0.5678265064954757,
"num_tokens": 388427.0,
"step": 9090
},
{
"epoch": 17.13741176470588,
"grad_norm": 1.2140324115753174,
"learning_rate": 6.728899978157729e-06,
"loss": 1.2314,
"mean_token_accuracy": 0.5344064626842737,
"num_tokens": 402111.0,
"step": 9100
},
{
"epoch": 17.156235294117646,
"grad_norm": 1.0139904022216797,
"learning_rate": 6.709089015837758e-06,
"loss": 1.1831,
"mean_token_accuracy": 0.5626831982284785,
"num_tokens": 416419.0,
"step": 9110
},
{
"epoch": 17.17505882352941,
"grad_norm": 0.5783360600471497,
"learning_rate": 6.68929252763531e-06,
"loss": 1.1888,
"mean_token_accuracy": 0.5566362496465445,
"num_tokens": 430433.0,
"step": 9120
},
{
"epoch": 17.193882352941177,
"grad_norm": 1.0979998111724854,
"learning_rate": 6.669510600619502e-06,
"loss": 1.1366,
"mean_token_accuracy": 0.572005919739604,
"num_tokens": 442507.0,
"step": 9130
},
{
"epoch": 17.212705882352942,
"grad_norm": 1.239842414855957,
"learning_rate": 6.649743321795401e-06,
"loss": 1.1488,
"mean_token_accuracy": 0.5650555603206158,
"num_tokens": 455301.0,
"step": 9140
},
{
"epoch": 17.231529411764704,
"grad_norm": 0.9120736718177795,
"learning_rate": 6.629990778103652e-06,
"loss": 1.1347,
"mean_token_accuracy": 0.5705232992768288,
"num_tokens": 468128.0,
"step": 9150
},
{
"epoch": 17.25035294117647,
"grad_norm": 1.5878956317901611,
"learning_rate": 6.6102530564200885e-06,
"loss": 1.1428,
"mean_token_accuracy": 0.5688801523298025,
"num_tokens": 482382.0,
"step": 9160
},
{
"epoch": 17.269176470588235,
"grad_norm": 1.3523510694503784,
"learning_rate": 6.5905302435553575e-06,
"loss": 1.1501,
"mean_token_accuracy": 0.5716863550245762,
"num_tokens": 495560.0,
"step": 9170
},
{
"epoch": 17.288,
"grad_norm": 1.0103275775909424,
"learning_rate": 6.570822426254526e-06,
"loss": 1.1479,
"mean_token_accuracy": 0.5623312875628471,
"num_tokens": 509660.0,
"step": 9180
},
{
"epoch": 17.306823529411766,
"grad_norm": 0.7961730360984802,
"learning_rate": 6.55112969119672e-06,
"loss": 1.177,
"mean_token_accuracy": 0.5561908625066281,
"num_tokens": 523652.0,
"step": 9190
},
{
"epoch": 17.32564705882353,
"grad_norm": 1.049294114112854,
"learning_rate": 6.531452124994716e-06,
"loss": 1.1729,
"mean_token_accuracy": 0.553871612995863,
"num_tokens": 536343.0,
"step": 9200
},
{
"epoch": 17.344470588235293,
"grad_norm": 0.9252281188964844,
"learning_rate": 6.511789814194588e-06,
"loss": 1.1302,
"mean_token_accuracy": 0.567984351888299,
"num_tokens": 549308.0,
"step": 9210
},
{
"epoch": 17.363294117647058,
"grad_norm": 0.8509281277656555,
"learning_rate": 6.492142845275302e-06,
"loss": 1.1896,
"mean_token_accuracy": 0.5457706928253174,
"num_tokens": 562695.0,
"step": 9220
},
{
"epoch": 17.382117647058823,
"grad_norm": 0.8771809935569763,
"learning_rate": 6.472511304648359e-06,
"loss": 1.1732,
"mean_token_accuracy": 0.5531365133821964,
"num_tokens": 575073.0,
"step": 9230
},
{
"epoch": 17.40094117647059,
"grad_norm": 1.814473032951355,
"learning_rate": 6.4528952786573904e-06,
"loss": 1.1541,
"mean_token_accuracy": 0.5633249927312136,
"num_tokens": 588911.0,
"step": 9240
},
{
"epoch": 17.419764705882354,
"grad_norm": 0.7689526081085205,
"learning_rate": 6.4332948535778075e-06,
"loss": 1.2086,
"mean_token_accuracy": 0.5450298830866813,
"num_tokens": 603178.0,
"step": 9250
},
{
"epoch": 17.438588235294116,
"grad_norm": 0.8878546357154846,
"learning_rate": 6.413710115616383e-06,
"loss": 1.1557,
"mean_token_accuracy": 0.5638493042439222,
"num_tokens": 616690.0,
"step": 9260
},
{
"epoch": 17.45741176470588,
"grad_norm": 0.610453188419342,
"learning_rate": 6.394141150910913e-06,
"loss": 1.1544,
"mean_token_accuracy": 0.5664511952549219,
"num_tokens": 629868.0,
"step": 9270
},
{
"epoch": 17.476235294117647,
"grad_norm": 0.7785117626190186,
"learning_rate": 6.37458804552981e-06,
"loss": 1.1758,
"mean_token_accuracy": 0.5591502383351326,
"num_tokens": 643658.0,
"step": 9280
},
{
"epoch": 17.495058823529412,
"grad_norm": 0.694078803062439,
"learning_rate": 6.355050885471743e-06,
"loss": 1.1698,
"mean_token_accuracy": 0.5497753735631704,
"num_tokens": 656038.0,
"step": 9290
},
{
"epoch": 17.513882352941177,
"grad_norm": 0.9329729676246643,
"learning_rate": 6.33552975666524e-06,
"loss": 1.1292,
"mean_token_accuracy": 0.5673054289072752,
"num_tokens": 670286.0,
"step": 9300
},
{
"epoch": 17.532705882352943,
"grad_norm": 1.1342458724975586,
"learning_rate": 6.316024744968327e-06,
"loss": 1.2161,
"mean_token_accuracy": 0.5357775934040546,
"num_tokens": 683493.0,
"step": 9310
},
{
"epoch": 17.551529411764704,
"grad_norm": 0.8364800810813904,
"learning_rate": 6.296535936168137e-06,
"loss": 1.1663,
"mean_token_accuracy": 0.5568131286650896,
"num_tokens": 697575.0,
"step": 9320
},
{
"epoch": 17.57035294117647,
"grad_norm": 1.625592589378357,
"learning_rate": 6.277063415980549e-06,
"loss": 1.1174,
"mean_token_accuracy": 0.5752797372639179,
"num_tokens": 710207.0,
"step": 9330
},
{
"epoch": 17.589176470588235,
"grad_norm": 1.3862090110778809,
"learning_rate": 6.257607270049791e-06,
"loss": 1.142,
"mean_token_accuracy": 0.5722228426486253,
"num_tokens": 724438.0,
"step": 9340
},
{
"epoch": 17.608,
"grad_norm": 1.26033616065979,
"learning_rate": 6.238167583948082e-06,
"loss": 1.1907,
"mean_token_accuracy": 0.5385926622897387,
"num_tokens": 739319.0,
"step": 9350
},
{
"epoch": 17.626823529411766,
"grad_norm": 1.0322513580322266,
"learning_rate": 6.218744443175237e-06,
"loss": 1.1304,
"mean_token_accuracy": 0.5683851022273302,
"num_tokens": 751914.0,
"step": 9360
},
{
"epoch": 17.645647058823528,
"grad_norm": 0.7326356172561646,
"learning_rate": 6.199337933158316e-06,
"loss": 1.1813,
"mean_token_accuracy": 0.5477977491915226,
"num_tokens": 766447.0,
"step": 9370
},
{
"epoch": 17.664470588235293,
"grad_norm": 0.9041365385055542,
"learning_rate": 6.179948139251218e-06,
"loss": 1.1652,
"mean_token_accuracy": 0.55347336307168,
"num_tokens": 779625.0,
"step": 9380
},
{
"epoch": 17.68329411764706,
"grad_norm": 1.576574683189392,
"learning_rate": 6.160575146734338e-06,
"loss": 1.1529,
"mean_token_accuracy": 0.5669393539428711,
"num_tokens": 793737.0,
"step": 9390
},
{
"epoch": 17.702117647058824,
"grad_norm": 1.3531404733657837,
"learning_rate": 6.1412190408141505e-06,
"loss": 1.1197,
"mean_token_accuracy": 0.5701812230050564,
"num_tokens": 807112.0,
"step": 9400
},
{
"epoch": 17.72094117647059,
"grad_norm": 1.6633743047714233,
"learning_rate": 6.121879906622883e-06,
"loss": 1.1761,
"mean_token_accuracy": 0.560976068302989,
"num_tokens": 820492.0,
"step": 9410
},
{
"epoch": 17.739764705882354,
"grad_norm": 1.1381210088729858,
"learning_rate": 6.102557829218105e-06,
"loss": 1.1562,
"mean_token_accuracy": 0.5558978658169508,
"num_tokens": 834186.0,
"step": 9420
},
{
"epoch": 17.758588235294116,
"grad_norm": 1.8115606307983398,
"learning_rate": 6.083252893582374e-06,
"loss": 1.1872,
"mean_token_accuracy": 0.554209940135479,
"num_tokens": 847318.0,
"step": 9430
},
{
"epoch": 17.77741176470588,
"grad_norm": 1.288480520248413,
"learning_rate": 6.063965184622845e-06,
"loss": 1.1726,
"mean_token_accuracy": 0.5530060395598412,
"num_tokens": 860095.0,
"step": 9440
},
{
"epoch": 17.796235294117647,
"grad_norm": 1.2023969888687134,
"learning_rate": 6.0446947871709174e-06,
"loss": 1.1904,
"mean_token_accuracy": 0.5426197368651628,
"num_tokens": 873256.0,
"step": 9450
},
{
"epoch": 17.815058823529412,
"grad_norm": 1.5823273658752441,
"learning_rate": 6.025441785981843e-06,
"loss": 1.1334,
"mean_token_accuracy": 0.5691535335034132,
"num_tokens": 886435.0,
"step": 9460
},
{
"epoch": 17.833882352941178,
"grad_norm": 0.8472403883934021,
"learning_rate": 6.006206265734364e-06,
"loss": 1.1382,
"mean_token_accuracy": 0.5657643742859364,
"num_tokens": 899127.0,
"step": 9470
},
{
"epoch": 17.852705882352943,
"grad_norm": 0.931440532207489,
"learning_rate": 5.9869883110303366e-06,
"loss": 1.1718,
"mean_token_accuracy": 0.5716207943856716,
"num_tokens": 913094.0,
"step": 9480
},
{
"epoch": 17.871529411764705,
"grad_norm": 0.7743551135063171,
"learning_rate": 5.967788006394364e-06,
"loss": 1.1778,
"mean_token_accuracy": 0.5500955499708653,
"num_tokens": 927506.0,
"step": 9490
},
{
"epoch": 17.89035294117647,
"grad_norm": 3.367818593978882,
"learning_rate": 5.948605436273411e-06,
"loss": 1.1036,
"mean_token_accuracy": 0.5776654280722141,
"num_tokens": 940411.0,
"step": 9500
},
{
"epoch": 17.909176470588235,
"grad_norm": 0.621562659740448,
"learning_rate": 5.9294406850364584e-06,
"loss": 1.2119,
"mean_token_accuracy": 0.5432645879685879,
"num_tokens": 954352.0,
"step": 9510
},
{
"epoch": 17.928,
"grad_norm": 0.5852854251861572,
"learning_rate": 5.910293836974099e-06,
"loss": 1.1967,
"mean_token_accuracy": 0.5400953222066164,
"num_tokens": 967263.0,
"step": 9520
},
{
"epoch": 17.946823529411766,
"grad_norm": 0.6211656332015991,
"learning_rate": 5.891164976298198e-06,
"loss": 1.1627,
"mean_token_accuracy": 0.5562442850321532,
"num_tokens": 980662.0,
"step": 9530
},
{
"epoch": 17.965647058823528,
"grad_norm": 0.6909055709838867,
"learning_rate": 5.872054187141492e-06,
"loss": 1.1726,
"mean_token_accuracy": 0.5591957967728376,
"num_tokens": 993499.0,
"step": 9540
},
{
"epoch": 17.984470588235293,
"grad_norm": 1.064255714416504,
"learning_rate": 5.852961553557251e-06,
"loss": 1.188,
"mean_token_accuracy": 0.5609757989645004,
"num_tokens": 1007775.0,
"step": 9550
},
{
"epoch": 18.001882352941177,
"grad_norm": 1.33067786693573,
"learning_rate": 5.833887159518882e-06,
"loss": 1.1521,
"mean_token_accuracy": 0.5590104452661566,
"num_tokens": 1019534.0,
"step": 9560
},
{
"epoch": 18.020705882352942,
"grad_norm": 1.1214513778686523,
"learning_rate": 5.8148310889195795e-06,
"loss": 1.1953,
"mean_token_accuracy": 0.5516563657671213,
"num_tokens": 1032963.0,
"step": 9570
},
{
"epoch": 18.039529411764708,
"grad_norm": 1.491575837135315,
"learning_rate": 5.795793425571943e-06,
"loss": 1.1595,
"mean_token_accuracy": 0.5553607545793057,
"num_tokens": 1045960.0,
"step": 9580
},
{
"epoch": 18.05835294117647,
"grad_norm": 0.6637427806854248,
"learning_rate": 5.776774253207607e-06,
"loss": 1.1874,
"mean_token_accuracy": 0.5493495386093855,
"num_tokens": 1060210.0,
"step": 9590
},
{
"epoch": 18.077176470588235,
"grad_norm": 0.5911340117454529,
"learning_rate": 5.757773655476895e-06,
"loss": 1.1127,
"mean_token_accuracy": 0.57906415425241,
"num_tokens": 1072162.0,
"step": 9600
},
{
"epoch": 18.097882352941177,
"grad_norm": 1.7098009586334229,
"learning_rate": 5.738791715948421e-06,
"loss": 1.1165,
"mean_token_accuracy": 0.5782070815563202,
"num_tokens": 13939.0,
"step": 9610
},
{
"epoch": 18.116705882352942,
"grad_norm": 0.6690590977668762,
"learning_rate": 5.7198285181087406e-06,
"loss": 1.1392,
"mean_token_accuracy": 0.5584286205470562,
"num_tokens": 28443.0,
"step": 9620
},
{
"epoch": 18.135529411764704,
"grad_norm": 1.1516035795211792,
"learning_rate": 5.700884145361976e-06,
"loss": 1.202,
"mean_token_accuracy": 0.543058916553855,
"num_tokens": 43005.0,
"step": 9630
},
{
"epoch": 18.15435294117647,
"grad_norm": 0.6750898957252502,
"learning_rate": 5.6819586810294635e-06,
"loss": 1.0982,
"mean_token_accuracy": 0.5858326137065888,
"num_tokens": 55756.0,
"step": 9640
},
{
"epoch": 18.173176470588235,
"grad_norm": 1.525680661201477,
"learning_rate": 5.663052208349367e-06,
"loss": 1.0754,
"mean_token_accuracy": 0.5846003469079732,
"num_tokens": 68605.0,
"step": 9650
},
{
"epoch": 18.192,
"grad_norm": 1.5765116214752197,
"learning_rate": 5.6441648104763215e-06,
"loss": 1.1771,
"mean_token_accuracy": 0.5558013815432787,
"num_tokens": 82077.0,
"step": 9660
},
{
"epoch": 18.210823529411766,
"grad_norm": 0.8816690444946289,
"learning_rate": 5.625296570481069e-06,
"loss": 1.1803,
"mean_token_accuracy": 0.5606073562055827,
"num_tokens": 94758.0,
"step": 9670
},
{
"epoch": 18.22964705882353,
"grad_norm": 1.345479130744934,
"learning_rate": 5.606447571350093e-06,
"loss": 1.2028,
"mean_token_accuracy": 0.5379578843712807,
"num_tokens": 109010.0,
"step": 9680
},
{
"epoch": 18.248470588235293,
"grad_norm": 1.374245524406433,
"learning_rate": 5.587617895985247e-06,
"loss": 1.196,
"mean_token_accuracy": 0.5491275552660226,
"num_tokens": 122939.0,
"step": 9690
},
{
"epoch": 18.267294117647058,
"grad_norm": 0.7323598265647888,
"learning_rate": 5.568807627203399e-06,
"loss": 1.1414,
"mean_token_accuracy": 0.5601202577352524,
"num_tokens": 137029.0,
"step": 9700
},
{
"epoch": 18.286117647058823,
"grad_norm": 1.2711374759674072,
"learning_rate": 5.550016847736055e-06,
"loss": 1.1124,
"mean_token_accuracy": 0.5777692060917616,
"num_tokens": 149183.0,
"step": 9710
},
{
"epoch": 18.30494117647059,
"grad_norm": 0.8101398944854736,
"learning_rate": 5.5312456402290174e-06,
"loss": 1.1478,
"mean_token_accuracy": 0.5615578092634678,
"num_tokens": 163147.0,
"step": 9720
},
{
"epoch": 18.323764705882354,
"grad_norm": 1.201725721359253,
"learning_rate": 5.512494087241995e-06,
"loss": 1.1889,
"mean_token_accuracy": 0.5410687677562237,
"num_tokens": 176934.0,
"step": 9730
},
{
"epoch": 18.342588235294116,
"grad_norm": 1.2040985822677612,
"learning_rate": 5.493762271248255e-06,
"loss": 1.0963,
"mean_token_accuracy": 0.5771806977689267,
"num_tokens": 189676.0,
"step": 9740
},
{
"epoch": 18.36141176470588,
"grad_norm": 1.194765567779541,
"learning_rate": 5.475050274634255e-06,
"loss": 1.2101,
"mean_token_accuracy": 0.5370388999581337,
"num_tokens": 202706.0,
"step": 9750
},
{
"epoch": 18.380235294117647,
"grad_norm": 1.3350589275360107,
"learning_rate": 5.456358179699289e-06,
"loss": 1.17,
"mean_token_accuracy": 0.5458086933940649,
"num_tokens": 216179.0,
"step": 9760
},
{
"epoch": 18.399058823529412,
"grad_norm": 1.164698839187622,
"learning_rate": 5.437686068655115e-06,
"loss": 1.2626,
"mean_token_accuracy": 0.534201942011714,
"num_tokens": 229633.0,
"step": 9770
},
{
"epoch": 18.417882352941177,
"grad_norm": 0.6664723753929138,
"learning_rate": 5.419034023625597e-06,
"loss": 1.1409,
"mean_token_accuracy": 0.5639401733875274,
"num_tokens": 242540.0,
"step": 9780
},
{
"epoch": 18.436705882352943,
"grad_norm": 0.5169602036476135,
"learning_rate": 5.4004021266463415e-06,
"loss": 1.1817,
"mean_token_accuracy": 0.5450482603162528,
"num_tokens": 254975.0,
"step": 9790
},
{
"epoch": 18.455529411764704,
"grad_norm": 0.5377786755561829,
"learning_rate": 5.381790459664355e-06,
"loss": 1.1443,
"mean_token_accuracy": 0.5668651383370161,
"num_tokens": 269635.0,
"step": 9800
},
{
"epoch": 18.47435294117647,
"grad_norm": 1.7214852571487427,
"learning_rate": 5.363199104537649e-06,
"loss": 1.1384,
"mean_token_accuracy": 0.5759254258126021,
"num_tokens": 282552.0,
"step": 9810
},
{
"epoch": 18.493176470588235,
"grad_norm": 1.1212029457092285,
"learning_rate": 5.344628143034904e-06,
"loss": 1.1671,
"mean_token_accuracy": 0.5748541194945573,
"num_tokens": 296715.0,
"step": 9820
},
{
"epoch": 18.512,
"grad_norm": 0.8291766047477722,
"learning_rate": 5.32607765683511e-06,
"loss": 1.171,
"mean_token_accuracy": 0.5621029295027256,
"num_tokens": 310741.0,
"step": 9830
},
{
"epoch": 18.530823529411766,
"grad_norm": 1.666212558746338,
"learning_rate": 5.307547727527207e-06,
"loss": 1.1493,
"mean_token_accuracy": 0.5714134465903044,
"num_tokens": 324377.0,
"step": 9840
},
{
"epoch": 18.54964705882353,
"grad_norm": 0.6212410926818848,
"learning_rate": 5.28903843660971e-06,
"loss": 1.182,
"mean_token_accuracy": 0.5523442510515452,
"num_tokens": 336585.0,
"step": 9850
},
{
"epoch": 18.568470588235293,
"grad_norm": 0.763521134853363,
"learning_rate": 5.2705498654903666e-06,
"loss": 1.2182,
"mean_token_accuracy": 0.5365333639085292,
"num_tokens": 351773.0,
"step": 9860
},
{
"epoch": 18.58729411764706,
"grad_norm": 1.7621654272079468,
"learning_rate": 5.252082095485793e-06,
"loss": 1.134,
"mean_token_accuracy": 0.5687922302633523,
"num_tokens": 364135.0,
"step": 9870
},
{
"epoch": 18.606117647058824,
"grad_norm": 0.8109177350997925,
"learning_rate": 5.233635207821126e-06,
"loss": 1.1571,
"mean_token_accuracy": 0.5557177890092134,
"num_tokens": 377314.0,
"step": 9880
},
{
"epoch": 18.62494117647059,
"grad_norm": 1.2423245906829834,
"learning_rate": 5.215209283629647e-06,
"loss": 1.1754,
"mean_token_accuracy": 0.5624301459640264,
"num_tokens": 391043.0,
"step": 9890
},
{
"epoch": 18.643764705882354,
"grad_norm": 1.4668940305709839,
"learning_rate": 5.19680440395244e-06,
"loss": 1.1433,
"mean_token_accuracy": 0.5692973904311657,
"num_tokens": 404155.0,
"step": 9900
},
{
"epoch": 18.662588235294116,
"grad_norm": 0.6444953083992004,
"learning_rate": 5.1784206497380275e-06,
"loss": 1.1656,
"mean_token_accuracy": 0.5535128690302372,
"num_tokens": 417083.0,
"step": 9910
},
{
"epoch": 18.68141176470588,
"grad_norm": 0.9833362102508545,
"learning_rate": 5.160058101842025e-06,
"loss": 1.1301,
"mean_token_accuracy": 0.5652093205600976,
"num_tokens": 430739.0,
"step": 9920
},
{
"epoch": 18.700235294117647,
"grad_norm": 1.3160921335220337,
"learning_rate": 5.141716841026774e-06,
"loss": 1.2158,
"mean_token_accuracy": 0.5528298642486333,
"num_tokens": 444108.0,
"step": 9930
},
{
"epoch": 18.719058823529412,
"grad_norm": 0.6429352164268494,
"learning_rate": 5.123396947960993e-06,
"loss": 1.1509,
"mean_token_accuracy": 0.5627094566822052,
"num_tokens": 457365.0,
"step": 9940
},
{
"epoch": 18.737882352941178,
"grad_norm": 0.57741379737854,
"learning_rate": 5.105098503219408e-06,
"loss": 1.1677,
"mean_token_accuracy": 0.5417445503175259,
"num_tokens": 470389.0,
"step": 9950
},
{
"epoch": 18.756705882352943,
"grad_norm": 1.4769562482833862,
"learning_rate": 5.08682158728243e-06,
"loss": 1.1693,
"mean_token_accuracy": 0.5543987430632115,
"num_tokens": 483516.0,
"step": 9960
},
{
"epoch": 18.775529411764705,
"grad_norm": 1.122862696647644,
"learning_rate": 5.068566280535772e-06,
"loss": 1.1676,
"mean_token_accuracy": 0.5597089301794768,
"num_tokens": 497211.0,
"step": 9970
},
{
"epoch": 18.79435294117647,
"grad_norm": 1.3088434934616089,
"learning_rate": 5.050332663270105e-06,
"loss": 1.1703,
"mean_token_accuracy": 0.5574114482849837,
"num_tokens": 511246.0,
"step": 9980
},
{
"epoch": 18.813176470588235,
"grad_norm": 1.146748661994934,
"learning_rate": 5.032120815680703e-06,
"loss": 1.1415,
"mean_token_accuracy": 0.5681348893791437,
"num_tokens": 524961.0,
"step": 9990
},
{
"epoch": 18.832,
"grad_norm": 0.6369304060935974,
"learning_rate": 5.013930817867103e-06,
"loss": 1.1355,
"mean_token_accuracy": 0.5745254665613174,
"num_tokens": 537543.0,
"step": 10000
},
{
"epoch": 18.850823529411766,
"grad_norm": 0.804693341255188,
"learning_rate": 4.995762749832731e-06,
"loss": 1.1858,
"mean_token_accuracy": 0.5416501805186271,
"num_tokens": 550396.0,
"step": 10010
},
{
"epoch": 18.869647058823528,
"grad_norm": 1.5857802629470825,
"learning_rate": 4.977616691484567e-06,
"loss": 1.1571,
"mean_token_accuracy": 0.5618045397102833,
"num_tokens": 563953.0,
"step": 10020
},
{
"epoch": 18.888470588235293,
"grad_norm": 1.1062195301055908,
"learning_rate": 4.9594927226327795e-06,
"loss": 1.2112,
"mean_token_accuracy": 0.5421402599662543,
"num_tokens": 577786.0,
"step": 10030
},
{
"epoch": 18.90729411764706,
"grad_norm": 0.7114964127540588,
"learning_rate": 4.941390922990398e-06,
"loss": 1.1818,
"mean_token_accuracy": 0.5595896728336811,
"num_tokens": 592052.0,
"step": 10040
},
{
"epoch": 18.926117647058824,
"grad_norm": 1.7100679874420166,
"learning_rate": 4.923311372172935e-06,
"loss": 1.1518,
"mean_token_accuracy": 0.5808280512690545,
"num_tokens": 605121.0,
"step": 10050
},
{
"epoch": 18.94494117647059,
"grad_norm": 1.4883623123168945,
"learning_rate": 4.905254149698049e-06,
"loss": 1.1205,
"mean_token_accuracy": 0.564001039788127,
"num_tokens": 618937.0,
"step": 10060
},
{
"epoch": 18.963764705882355,
"grad_norm": 0.5710100531578064,
"learning_rate": 4.8872193349852e-06,
"loss": 1.193,
"mean_token_accuracy": 0.5475729245692491,
"num_tokens": 631403.0,
"step": 10070
},
{
"epoch": 18.982588235294116,
"grad_norm": 0.82412189245224,
"learning_rate": 4.869207007355286e-06,
"loss": 1.1769,
"mean_token_accuracy": 0.549387214705348,
"num_tokens": 644809.0,
"step": 10080
},
{
"epoch": 19.001882352941177,
"grad_norm": 4.334903717041016,
"learning_rate": 4.851217246030307e-06,
"loss": 1.2232,
"mean_token_accuracy": 0.5745812316493291,
"num_tokens": 658742.0,
"step": 10090
},
{
"epoch": 19.020705882352942,
"grad_norm": 0.7227234244346619,
"learning_rate": 4.833250130133014e-06,
"loss": 1.1446,
"mean_token_accuracy": 0.5578329466283322,
"num_tokens": 672376.0,
"step": 10100
},
{
"epoch": 19.039529411764708,
"grad_norm": 1.063941240310669,
"learning_rate": 4.815305738686548e-06,
"loss": 1.1323,
"mean_token_accuracy": 0.5669731423258781,
"num_tokens": 684972.0,
"step": 10110
},
{
"epoch": 19.05835294117647,
"grad_norm": 1.1839574575424194,
"learning_rate": 4.7973841506141195e-06,
"loss": 1.178,
"mean_token_accuracy": 0.5547245424240828,
"num_tokens": 699346.0,
"step": 10120
},
{
"epoch": 19.077176470588235,
"grad_norm": 0.8287972807884216,
"learning_rate": 4.779485444738632e-06,
"loss": 1.1305,
"mean_token_accuracy": 0.5627703540027141,
"num_tokens": 711760.0,
"step": 10130
},
{
"epoch": 19.096,
"grad_norm": 1.2000936269760132,
"learning_rate": 4.761609699782351e-06,
"loss": 1.1206,
"mean_token_accuracy": 0.5821688748896122,
"num_tokens": 724959.0,
"step": 10140
},
{
"epoch": 19.114823529411765,
"grad_norm": 1.1382743120193481,
"learning_rate": 4.743756994366555e-06,
"loss": 1.199,
"mean_token_accuracy": 0.5427775271236897,
"num_tokens": 738570.0,
"step": 10150
},
{
"epoch": 19.13364705882353,
"grad_norm": 0.6352145075798035,
"learning_rate": 4.7259274070111986e-06,
"loss": 1.1679,
"mean_token_accuracy": 0.5518446248024702,
"num_tokens": 751688.0,
"step": 10160
},
{
"epoch": 19.152470588235293,
"grad_norm": 1.043312668800354,
"learning_rate": 4.708121016134545e-06,
"loss": 1.1412,
"mean_token_accuracy": 0.56727832891047,
"num_tokens": 765261.0,
"step": 10170
},
{
"epoch": 19.171294117647058,
"grad_norm": 2.1281962394714355,
"learning_rate": 4.69033790005284e-06,
"loss": 1.1316,
"mean_token_accuracy": 0.5734411317855119,
"num_tokens": 777972.0,
"step": 10180
},
{
"epoch": 19.190117647058823,
"grad_norm": 1.2191262245178223,
"learning_rate": 4.672578136979961e-06,
"loss": 1.2033,
"mean_token_accuracy": 0.5502295974642039,
"num_tokens": 792540.0,
"step": 10190
},
{
"epoch": 19.20894117647059,
"grad_norm": 2.241875648498535,
"learning_rate": 4.65484180502708e-06,
"loss": 1.2114,
"mean_token_accuracy": 0.5526003040373325,
"num_tokens": 805628.0,
"step": 10200
},
{
"epoch": 19.227764705882354,
"grad_norm": 1.0254322290420532,
"learning_rate": 4.637128982202308e-06,
"loss": 1.1448,
"mean_token_accuracy": 0.566441947594285,
"num_tokens": 818605.0,
"step": 10210
},
{
"epoch": 19.24658823529412,
"grad_norm": 0.5974338054656982,
"learning_rate": 4.619439746410361e-06,
"loss": 1.1663,
"mean_token_accuracy": 0.5573429156094789,
"num_tokens": 831744.0,
"step": 10220
},
{
"epoch": 19.26541176470588,
"grad_norm": 1.1514984369277954,
"learning_rate": 4.601774175452203e-06,
"loss": 1.1816,
"mean_token_accuracy": 0.5479875948280096,
"num_tokens": 844511.0,
"step": 10230
},
{
"epoch": 19.284235294117646,
"grad_norm": 1.9380877017974854,
"learning_rate": 4.584132347024732e-06,
"loss": 1.1513,
"mean_token_accuracy": 0.5600051417946815,
"num_tokens": 857034.0,
"step": 10240
},
{
"epoch": 19.303058823529412,
"grad_norm": 1.8556410074234009,
"learning_rate": 4.566514338720414e-06,
"loss": 1.2121,
"mean_token_accuracy": 0.5544085066765547,
"num_tokens": 870895.0,
"step": 10250
},
{
"epoch": 19.321882352941177,
"grad_norm": 0.7411885857582092,
"learning_rate": 4.5489202280269465e-06,
"loss": 1.1471,
"mean_token_accuracy": 0.5642319560050965,
"num_tokens": 883794.0,
"step": 10260
},
{
"epoch": 19.340705882352943,
"grad_norm": 0.6563217043876648,
"learning_rate": 4.53135009232692e-06,
"loss": 1.119,
"mean_token_accuracy": 0.571989681199193,
"num_tokens": 896094.0,
"step": 10270
},
{
"epoch": 19.359529411764704,
"grad_norm": 0.717928946018219,
"learning_rate": 4.513804008897487e-06,
"loss": 1.1896,
"mean_token_accuracy": 0.5477908588945866,
"num_tokens": 909067.0,
"step": 10280
},
{
"epoch": 19.37835294117647,
"grad_norm": 1.7051725387573242,
"learning_rate": 4.496282054910006e-06,
"loss": 1.2038,
"mean_token_accuracy": 0.5528531819581985,
"num_tokens": 922861.0,
"step": 10290
},
{
"epoch": 19.397176470588235,
"grad_norm": 0.5672712922096252,
"learning_rate": 4.478784307429707e-06,
"loss": 1.1883,
"mean_token_accuracy": 0.5424028813838959,
"num_tokens": 935977.0,
"step": 10300
},
{
"epoch": 19.416,
"grad_norm": 1.4658303260803223,
"learning_rate": 4.461310843415354e-06,
"loss": 1.1138,
"mean_token_accuracy": 0.5855666678398848,
"num_tokens": 950190.0,
"step": 10310
},
{
"epoch": 19.434823529411766,
"grad_norm": 0.8565905690193176,
"learning_rate": 4.4438617397189185e-06,
"loss": 1.1637,
"mean_token_accuracy": 0.5575710866600275,
"num_tokens": 964649.0,
"step": 10320
},
{
"epoch": 19.45364705882353,
"grad_norm": 1.2235567569732666,
"learning_rate": 4.42643707308522e-06,
"loss": 1.1326,
"mean_token_accuracy": 0.5670348349958658,
"num_tokens": 978559.0,
"step": 10330
},
{
"epoch": 19.472470588235293,
"grad_norm": 0.79115229845047,
"learning_rate": 4.4090369201516e-06,
"loss": 1.1965,
"mean_token_accuracy": 0.5441572275012732,
"num_tokens": 992276.0,
"step": 10340
},
{
"epoch": 19.491294117647058,
"grad_norm": 1.4369901418685913,
"learning_rate": 4.391661357447585e-06,
"loss": 1.1701,
"mean_token_accuracy": 0.5550823096185923,
"num_tokens": 1006379.0,
"step": 10350
},
{
"epoch": 19.510117647058824,
"grad_norm": 1.175675868988037,
"learning_rate": 4.374310461394548e-06,
"loss": 1.1332,
"mean_token_accuracy": 0.5736968379467726,
"num_tokens": 1019668.0,
"step": 10360
},
{
"epoch": 19.52894117647059,
"grad_norm": 0.5965217351913452,
"learning_rate": 4.356984308305374e-06,
"loss": 1.1563,
"mean_token_accuracy": 0.5608095470815897,
"num_tokens": 1032049.0,
"step": 10370
},
{
"epoch": 19.547764705882354,
"grad_norm": 1.406221628189087,
"learning_rate": 4.3396829743841205e-06,
"loss": 1.1749,
"mean_token_accuracy": 0.5496669236570597,
"num_tokens": 1045211.0,
"step": 10380
},
{
"epoch": 19.566588235294116,
"grad_norm": 1.215728759765625,
"learning_rate": 4.322406535725686e-06,
"loss": 1.1748,
"mean_token_accuracy": 0.5576162055134773,
"num_tokens": 1058179.0,
"step": 10390
},
{
"epoch": 19.58541176470588,
"grad_norm": 1.223363995552063,
"learning_rate": 4.305155068315481e-06,
"loss": 1.1467,
"mean_token_accuracy": 0.5632787074893713,
"num_tokens": 1071797.0,
"step": 10400
},
{
"epoch": 19.604235294117647,
"grad_norm": 1.32563054561615,
"learning_rate": 4.2879286480290784e-06,
"loss": 1.1665,
"mean_token_accuracy": 0.543903386592865,
"num_tokens": 1085172.0,
"step": 10410
},
{
"epoch": 19.623058823529412,
"grad_norm": 1.0964701175689697,
"learning_rate": 4.270727350631892e-06,
"loss": 1.1368,
"mean_token_accuracy": 0.5769836001098156,
"num_tokens": 1098617.0,
"step": 10420
},
{
"epoch": 19.641882352941177,
"grad_norm": 0.8849780559539795,
"learning_rate": 4.253551251778835e-06,
"loss": 1.238,
"mean_token_accuracy": 0.5346022747457028,
"num_tokens": 1111860.0,
"step": 10430
},
{
"epoch": 19.660705882352943,
"grad_norm": 1.4776290655136108,
"learning_rate": 4.236400427014005e-06,
"loss": 1.2089,
"mean_token_accuracy": 0.5553506713360548,
"num_tokens": 1125874.0,
"step": 10440
},
{
"epoch": 19.679529411764705,
"grad_norm": 0.6762340068817139,
"learning_rate": 4.2192749517703255e-06,
"loss": 1.1319,
"mean_token_accuracy": 0.572966867312789,
"num_tokens": 1139009.0,
"step": 10450
},
{
"epoch": 19.69835294117647,
"grad_norm": 0.9607488512992859,
"learning_rate": 4.202174901369236e-06,
"loss": 1.1342,
"mean_token_accuracy": 0.5701036512851715,
"num_tokens": 1151825.0,
"step": 10460
},
{
"epoch": 19.717176470588235,
"grad_norm": 0.7690389156341553,
"learning_rate": 4.1851003510203416e-06,
"loss": 1.1599,
"mean_token_accuracy": 0.554409109801054,
"num_tokens": 1165331.0,
"step": 10470
},
{
"epoch": 19.736,
"grad_norm": 0.6242837905883789,
"learning_rate": 4.168051375821108e-06,
"loss": 1.1945,
"mean_token_accuracy": 0.5393414959311486,
"num_tokens": 1178787.0,
"step": 10480
},
{
"epoch": 19.754823529411766,
"grad_norm": 0.8885065317153931,
"learning_rate": 4.151028050756507e-06,
"loss": 1.1456,
"mean_token_accuracy": 0.5653862472623586,
"num_tokens": 1191650.0,
"step": 10490
},
{
"epoch": 19.773647058823528,
"grad_norm": 0.7802302837371826,
"learning_rate": 4.134030450698697e-06,
"loss": 1.1645,
"mean_token_accuracy": 0.5542376168072224,
"num_tokens": 1205371.0,
"step": 10500
},
{
"epoch": 19.792470588235293,
"grad_norm": 0.8476331233978271,
"learning_rate": 4.117058650406683e-06,
"loss": 1.1996,
"mean_token_accuracy": 0.5366521954536438,
"num_tokens": 1218885.0,
"step": 10510
},
{
"epoch": 19.81129411764706,
"grad_norm": 0.7439809441566467,
"learning_rate": 4.1001127245260175e-06,
"loss": 1.162,
"mean_token_accuracy": 0.5500502925366163,
"num_tokens": 1232409.0,
"step": 10520
},
{
"epoch": 19.830117647058824,
"grad_norm": 0.8466014266014099,
"learning_rate": 4.083192747588436e-06,
"loss": 1.2165,
"mean_token_accuracy": 0.547482916712761,
"num_tokens": 1245876.0,
"step": 10530
},
{
"epoch": 19.84894117647059,
"grad_norm": 0.9549068808555603,
"learning_rate": 4.066298794011551e-06,
"loss": 1.1552,
"mean_token_accuracy": 0.567984651774168,
"num_tokens": 1260603.0,
"step": 10540
},
{
"epoch": 19.867764705882355,
"grad_norm": 0.8882144689559937,
"learning_rate": 4.049430938098513e-06,
"loss": 1.1424,
"mean_token_accuracy": 0.566171682626009,
"num_tokens": 1274404.0,
"step": 10550
},
{
"epoch": 19.886588235294116,
"grad_norm": 1.1163575649261475,
"learning_rate": 4.0325892540377035e-06,
"loss": 1.1986,
"mean_token_accuracy": 0.54889883287251,
"num_tokens": 1288135.0,
"step": 10560
},
{
"epoch": 19.90541176470588,
"grad_norm": 0.5996796488761902,
"learning_rate": 4.01577381590238e-06,
"loss": 1.1317,
"mean_token_accuracy": 0.5590782940387726,
"num_tokens": 1301565.0,
"step": 10570
},
{
"epoch": 19.924235294117647,
"grad_norm": 0.5613903999328613,
"learning_rate": 3.998984697650369e-06,
"loss": 1.144,
"mean_token_accuracy": 0.5581843961030245,
"num_tokens": 1315363.0,
"step": 10580
},
{
"epoch": 19.943058823529412,
"grad_norm": 1.517250895500183,
"learning_rate": 3.982221973123738e-06,
"loss": 1.1585,
"mean_token_accuracy": 0.5547402266412973,
"num_tokens": 1328940.0,
"step": 10590
},
{
"epoch": 19.961882352941178,
"grad_norm": 1.6663577556610107,
"learning_rate": 3.965485716048473e-06,
"loss": 1.1706,
"mean_token_accuracy": 0.5520875003188849,
"num_tokens": 1342451.0,
"step": 10600
},
{
"epoch": 19.98070588235294,
"grad_norm": 1.2554893493652344,
"learning_rate": 3.948776000034144e-06,
"loss": 1.1016,
"mean_token_accuracy": 0.5831372920423746,
"num_tokens": 1355912.0,
"step": 10610
},
{
"epoch": 19.999529411764705,
"grad_norm": 1.3978781700134277,
"learning_rate": 3.932092898573593e-06,
"loss": 1.0993,
"mean_token_accuracy": 0.5891566134989261,
"num_tokens": 1370301.0,
"step": 10620
},
{
"epoch": 20.01694117647059,
"grad_norm": 1.0220972299575806,
"learning_rate": 3.915436485042602e-06,
"loss": 1.1514,
"mean_token_accuracy": 0.5695925415367693,
"num_tokens": 1382512.0,
"step": 10630
},
{
"epoch": 20.035764705882354,
"grad_norm": 0.9992367625236511,
"learning_rate": 3.898806832699574e-06,
"loss": 1.2,
"mean_token_accuracy": 0.5552287392318249,
"num_tokens": 1395877.0,
"step": 10640
},
{
"epoch": 20.05458823529412,
"grad_norm": 0.8924506306648254,
"learning_rate": 3.882204014685213e-06,
"loss": 1.1325,
"mean_token_accuracy": 0.562846252322197,
"num_tokens": 1408762.0,
"step": 10650
},
{
"epoch": 20.07341176470588,
"grad_norm": 1.2033953666687012,
"learning_rate": 3.8656281040221975e-06,
"loss": 1.1699,
"mean_token_accuracy": 0.5635105472058057,
"num_tokens": 1421599.0,
"step": 10660
},
{
"epoch": 20.092235294117646,
"grad_norm": 0.999156653881073,
"learning_rate": 3.849079173614863e-06,
"loss": 1.1869,
"mean_token_accuracy": 0.5541719019412994,
"num_tokens": 1434720.0,
"step": 10670
},
{
"epoch": 20.11105882352941,
"grad_norm": 1.1332379579544067,
"learning_rate": 3.832557296248883e-06,
"loss": 1.1769,
"mean_token_accuracy": 0.5474078699946403,
"num_tokens": 1448113.0,
"step": 10680
},
{
"epoch": 20.129882352941177,
"grad_norm": 1.046257734298706,
"learning_rate": 3.816062544590944e-06,
"loss": 1.119,
"mean_token_accuracy": 0.5713853165507317,
"num_tokens": 1462113.0,
"step": 10690
},
{
"epoch": 20.148705882352942,
"grad_norm": 1.346682071685791,
"learning_rate": 3.7995949911884235e-06,
"loss": 1.1687,
"mean_token_accuracy": 0.5563855923712253,
"num_tokens": 1474782.0,
"step": 10700
},
{
"epoch": 20.167529411764704,
"grad_norm": 1.4376060962677002,
"learning_rate": 3.783154708469079e-06,
"loss": 1.1921,
"mean_token_accuracy": 0.5529118336737155,
"num_tokens": 1488150.0,
"step": 10710
},
{
"epoch": 20.18635294117647,
"grad_norm": 1.298230767250061,
"learning_rate": 3.7667417687407305e-06,
"loss": 1.1748,
"mean_token_accuracy": 0.553018931671977,
"num_tokens": 1501896.0,
"step": 10720
},
{
"epoch": 20.205176470588235,
"grad_norm": 1.2680264711380005,
"learning_rate": 3.750356244190931e-06,
"loss": 1.1694,
"mean_token_accuracy": 0.5553711723536253,
"num_tokens": 1515310.0,
"step": 10730
},
{
"epoch": 20.224,
"grad_norm": 1.5268313884735107,
"learning_rate": 3.7339982068866586e-06,
"loss": 1.1437,
"mean_token_accuracy": 0.566285153850913,
"num_tokens": 1528249.0,
"step": 10740
},
{
"epoch": 20.242823529411766,
"grad_norm": 1.163407802581787,
"learning_rate": 3.717667728773995e-06,
"loss": 1.1316,
"mean_token_accuracy": 0.5630121100693941,
"num_tokens": 1541606.0,
"step": 10750
},
{
"epoch": 20.26164705882353,
"grad_norm": 0.8959663510322571,
"learning_rate": 3.701364881677809e-06,
"loss": 1.163,
"mean_token_accuracy": 0.5470004346221685,
"num_tokens": 1554546.0,
"step": 10760
},
{
"epoch": 20.280470588235293,
"grad_norm": 0.8293361067771912,
"learning_rate": 3.6850897373014514e-06,
"loss": 1.2158,
"mean_token_accuracy": 0.5398020602762699,
"num_tokens": 1567389.0,
"step": 10770
},
{
"epoch": 20.299294117647058,
"grad_norm": 0.7178218364715576,
"learning_rate": 3.668842367226427e-06,
"loss": 1.1527,
"mean_token_accuracy": 0.5576813716441393,
"num_tokens": 1580197.0,
"step": 10780
},
{
"epoch": 20.318117647058823,
"grad_norm": 0.8142568469047546,
"learning_rate": 3.652622842912068e-06,
"loss": 1.167,
"mean_token_accuracy": 0.5699529372155666,
"num_tokens": 1593797.0,
"step": 10790
},
{
"epoch": 20.33694117647059,
"grad_norm": 1.293581247329712,
"learning_rate": 3.6364312356952603e-06,
"loss": 1.1763,
"mean_token_accuracy": 0.5648769486695528,
"num_tokens": 1607332.0,
"step": 10800
},
{
"epoch": 20.35764705882353,
"grad_norm": 1.6430315971374512,
"learning_rate": 5.595460614152204e-06,
"loss": 1.1903,
"mean_token_accuracy": 0.5535745773464441,
"num_tokens": 14114.0,
"step": 10810
},
{
"epoch": 20.376470588235293,
"grad_norm": 0.9296208620071411,
"learning_rate": 5.582109479305742e-06,
"loss": 1.1451,
"mean_token_accuracy": 0.5736374389380217,
"num_tokens": 26719.0,
"step": 10820
},
{
"epoch": 20.395294117647058,
"grad_norm": 0.7295175790786743,
"learning_rate": 5.5687583444592795e-06,
"loss": 1.1719,
"mean_token_accuracy": 0.5615855868905782,
"num_tokens": 40788.0,
"step": 10830
},
{
"epoch": 20.414117647058823,
"grad_norm": 0.5272361636161804,
"learning_rate": 5.555407209612818e-06,
"loss": 1.1707,
"mean_token_accuracy": 0.5535146549344063,
"num_tokens": 55000.0,
"step": 10840
},
{
"epoch": 20.43294117647059,
"grad_norm": 1.6679742336273193,
"learning_rate": 5.542056074766355e-06,
"loss": 1.1919,
"mean_token_accuracy": 0.5483698755502701,
"num_tokens": 68277.0,
"step": 10850
},
{
"epoch": 20.451764705882354,
"grad_norm": 1.3171534538269043,
"learning_rate": 5.528704939919893e-06,
"loss": 1.1023,
"mean_token_accuracy": 0.5720774076879025,
"num_tokens": 81749.0,
"step": 10860
},
{
"epoch": 20.470588235294116,
"grad_norm": 0.6171587705612183,
"learning_rate": 5.515353805073432e-06,
"loss": 1.1495,
"mean_token_accuracy": 0.5558391027152538,
"num_tokens": 95748.0,
"step": 10870
},
{
"epoch": 20.48941176470588,
"grad_norm": 1.8184794187545776,
"learning_rate": 5.50200267022697e-06,
"loss": 1.1554,
"mean_token_accuracy": 0.5604365076869726,
"num_tokens": 109182.0,
"step": 10880
},
{
"epoch": 20.508235294117647,
"grad_norm": 0.6223208904266357,
"learning_rate": 5.488651535380508e-06,
"loss": 1.1084,
"mean_token_accuracy": 0.5673012807965279,
"num_tokens": 122093.0,
"step": 10890
},
{
"epoch": 20.527058823529412,
"grad_norm": 0.6511589288711548,
"learning_rate": 5.475300400534046e-06,
"loss": 1.1891,
"mean_token_accuracy": 0.5500082913786173,
"num_tokens": 135801.0,
"step": 10900
},
{
"epoch": 20.545882352941177,
"grad_norm": 1.1379867792129517,
"learning_rate": 5.461949265687584e-06,
"loss": 1.1577,
"mean_token_accuracy": 0.5615630965679884,
"num_tokens": 149528.0,
"step": 10910
},
{
"epoch": 20.564705882352943,
"grad_norm": 0.6646468043327332,
"learning_rate": 5.448598130841122e-06,
"loss": 1.1693,
"mean_token_accuracy": 0.5596156906336546,
"num_tokens": 162347.0,
"step": 10920
},
{
"epoch": 20.583529411764705,
"grad_norm": 0.6345205903053284,
"learning_rate": 5.435246995994659e-06,
"loss": 1.1393,
"mean_token_accuracy": 0.5636776462197304,
"num_tokens": 174228.0,
"step": 10930
},
{
"epoch": 20.60235294117647,
"grad_norm": 1.0836670398712158,
"learning_rate": 5.4218958611481976e-06,
"loss": 1.1253,
"mean_token_accuracy": 0.574345787242055,
"num_tokens": 188052.0,
"step": 10940
},
{
"epoch": 20.621176470588235,
"grad_norm": 0.7239655256271362,
"learning_rate": 5.408544726301737e-06,
"loss": 1.1691,
"mean_token_accuracy": 0.5582763768732548,
"num_tokens": 202984.0,
"step": 10950
},
{
"epoch": 20.64,
"grad_norm": 0.6753464937210083,
"learning_rate": 5.395193591455274e-06,
"loss": 1.1513,
"mean_token_accuracy": 0.5622932318598032,
"num_tokens": 216292.0,
"step": 10960
},
{
"epoch": 20.658823529411766,
"grad_norm": 0.5746181607246399,
"learning_rate": 5.381842456608812e-06,
"loss": 1.1928,
"mean_token_accuracy": 0.5508632536977529,
"num_tokens": 229204.0,
"step": 10970
},
{
"epoch": 20.677647058823528,
"grad_norm": 1.1452544927597046,
"learning_rate": 5.3684913217623505e-06,
"loss": 1.1549,
"mean_token_accuracy": 0.5678117204457521,
"num_tokens": 242540.0,
"step": 10980
},
{
"epoch": 20.696470588235293,
"grad_norm": 0.6321762800216675,
"learning_rate": 5.355140186915888e-06,
"loss": 1.1235,
"mean_token_accuracy": 0.575124978646636,
"num_tokens": 256825.0,
"step": 10990
},
{
"epoch": 20.71529411764706,
"grad_norm": 0.5946145057678223,
"learning_rate": 5.341789052069426e-06,
"loss": 1.1731,
"mean_token_accuracy": 0.5547100655734539,
"num_tokens": 270675.0,
"step": 11000
},
{
"epoch": 20.734117647058824,
"grad_norm": 1.3031941652297974,
"learning_rate": 5.3284379172229635e-06,
"loss": 1.1891,
"mean_token_accuracy": 0.5470674268901348,
"num_tokens": 282696.0,
"step": 11010
},
{
"epoch": 20.75294117647059,
"grad_norm": 0.5822432637214661,
"learning_rate": 5.315086782376503e-06,
"loss": 1.2153,
"mean_token_accuracy": 0.5374420482665301,
"num_tokens": 296789.0,
"step": 11020
},
{
"epoch": 20.771764705882354,
"grad_norm": 1.1034314632415771,
"learning_rate": 5.301735647530041e-06,
"loss": 1.1166,
"mean_token_accuracy": 0.5685049999505282,
"num_tokens": 310298.0,
"step": 11030
},
{
"epoch": 20.790588235294116,
"grad_norm": 1.798014521598816,
"learning_rate": 5.288384512683579e-06,
"loss": 1.1764,
"mean_token_accuracy": 0.5581833314150572,
"num_tokens": 323136.0,
"step": 11040
},
{
"epoch": 20.80941176470588,
"grad_norm": 1.2033790349960327,
"learning_rate": 5.2750333778371165e-06,
"loss": 1.1509,
"mean_token_accuracy": 0.5552062794566155,
"num_tokens": 336592.0,
"step": 11050
},
{
"epoch": 20.828235294117647,
"grad_norm": 0.9958351850509644,
"learning_rate": 5.261682242990655e-06,
"loss": 1.1421,
"mean_token_accuracy": 0.5717320717871189,
"num_tokens": 350031.0,
"step": 11060
},
{
"epoch": 20.847058823529412,
"grad_norm": 0.9751930832862854,
"learning_rate": 5.248331108144192e-06,
"loss": 1.2123,
"mean_token_accuracy": 0.5408148296177387,
"num_tokens": 364779.0,
"step": 11070
},
{
"epoch": 20.865882352941178,
"grad_norm": 0.6778987646102905,
"learning_rate": 5.23497997329773e-06,
"loss": 1.1929,
"mean_token_accuracy": 0.547919350117445,
"num_tokens": 379331.0,
"step": 11080
},
{
"epoch": 20.88470588235294,
"grad_norm": 0.85933518409729,
"learning_rate": 5.221628838451269e-06,
"loss": 1.119,
"mean_token_accuracy": 0.5630400247871876,
"num_tokens": 392787.0,
"step": 11090
},
{
"epoch": 20.903529411764705,
"grad_norm": 0.6913843750953674,
"learning_rate": 5.208277703604807e-06,
"loss": 1.1649,
"mean_token_accuracy": 0.5535217590630055,
"num_tokens": 406067.0,
"step": 11100
},
{
"epoch": 20.92235294117647,
"grad_norm": 2.0229623317718506,
"learning_rate": 5.194926568758345e-06,
"loss": 1.12,
"mean_token_accuracy": 0.5807195238769054,
"num_tokens": 420283.0,
"step": 11110
},
{
"epoch": 20.941176470588236,
"grad_norm": 1.6949794292449951,
"learning_rate": 5.181575433911883e-06,
"loss": 1.1508,
"mean_token_accuracy": 0.5568496011197567,
"num_tokens": 433084.0,
"step": 11120
},
{
"epoch": 20.96,
"grad_norm": 0.9725853204727173,
"learning_rate": 5.168224299065421e-06,
"loss": 1.2174,
"mean_token_accuracy": 0.5451920755207539,
"num_tokens": 447263.0,
"step": 11130
},
{
"epoch": 20.978823529411766,
"grad_norm": 1.1975111961364746,
"learning_rate": 5.154873164218959e-06,
"loss": 1.1483,
"mean_token_accuracy": 0.5572316914796829,
"num_tokens": 460142.0,
"step": 11140
},
{
"epoch": 20.997647058823528,
"grad_norm": 1.078298807144165,
"learning_rate": 5.141522029372496e-06,
"loss": 1.1615,
"mean_token_accuracy": 0.5542684197425842,
"num_tokens": 473192.0,
"step": 11150
},
{
"epoch": 21.01694117647059,
"grad_norm": 1.2463195323944092,
"learning_rate": 5.128170894526035e-06,
"loss": 1.2856,
"mean_token_accuracy": 0.5541327973691429,
"num_tokens": 486372.0,
"step": 11160
},
{
"epoch": 21.035764705882354,
"grad_norm": 1.2360081672668457,
"learning_rate": 5.114819759679574e-06,
"loss": 1.1388,
"mean_token_accuracy": 0.5694611296057701,
"num_tokens": 499191.0,
"step": 11170
},
{
"epoch": 21.05458823529412,
"grad_norm": 0.9578425288200378,
"learning_rate": 5.101468624833111e-06,
"loss": 1.1818,
"mean_token_accuracy": 0.548756854981184,
"num_tokens": 512720.0,
"step": 11180
},
{
"epoch": 21.07341176470588,
"grad_norm": 0.6617890000343323,
"learning_rate": 5.088117489986649e-06,
"loss": 1.1515,
"mean_token_accuracy": 0.5633066941052676,
"num_tokens": 525141.0,
"step": 11190
},
{
"epoch": 21.092235294117646,
"grad_norm": 0.5509127974510193,
"learning_rate": 5.0747663551401875e-06,
"loss": 1.1534,
"mean_token_accuracy": 0.5600371100008488,
"num_tokens": 539326.0,
"step": 11200
},
{
"epoch": 21.11105882352941,
"grad_norm": 0.7871003150939941,
"learning_rate": 5.061415220293725e-06,
"loss": 1.0991,
"mean_token_accuracy": 0.5748973291367292,
"num_tokens": 553010.0,
"step": 11210
},
{
"epoch": 21.129882352941177,
"grad_norm": 1.2488114833831787,
"learning_rate": 5.048064085447263e-06,
"loss": 1.1,
"mean_token_accuracy": 0.575805452466011,
"num_tokens": 566192.0,
"step": 11220
},
{
"epoch": 21.148705882352942,
"grad_norm": 0.6213930249214172,
"learning_rate": 5.034712950600802e-06,
"loss": 1.1342,
"mean_token_accuracy": 0.5743603181093931,
"num_tokens": 579897.0,
"step": 11230
},
{
"epoch": 21.167529411764704,
"grad_norm": 0.6450327634811401,
"learning_rate": 5.0213618157543396e-06,
"loss": 1.197,
"mean_token_accuracy": 0.5464676439762115,
"num_tokens": 594375.0,
"step": 11240
},
{
"epoch": 21.18635294117647,
"grad_norm": 1.1492058038711548,
"learning_rate": 5.008010680907878e-06,
"loss": 1.1822,
"mean_token_accuracy": 0.5506179232150317,
"num_tokens": 607333.0,
"step": 11250
},
{
"epoch": 21.205176470588235,
"grad_norm": 1.0695908069610596,
"learning_rate": 4.994659546061415e-06,
"loss": 1.1359,
"mean_token_accuracy": 0.5681717403233051,
"num_tokens": 620561.0,
"step": 11260
},
{
"epoch": 21.224,
"grad_norm": 0.608024001121521,
"learning_rate": 4.9813084112149534e-06,
"loss": 1.1696,
"mean_token_accuracy": 0.5604459267109633,
"num_tokens": 634382.0,
"step": 11270
},
{
"epoch": 21.242823529411766,
"grad_norm": 0.6441075801849365,
"learning_rate": 4.967957276368492e-06,
"loss": 1.144,
"mean_token_accuracy": 0.5593028951436281,
"num_tokens": 646437.0,
"step": 11280
},
{
"epoch": 21.26164705882353,
"grad_norm": 1.208881139755249,
"learning_rate": 4.95460614152203e-06,
"loss": 1.1171,
"mean_token_accuracy": 0.5644662406295538,
"num_tokens": 659651.0,
"step": 11290
},
{
"epoch": 21.280470588235293,
"grad_norm": 1.3741132020950317,
"learning_rate": 4.941255006675567e-06,
"loss": 1.1408,
"mean_token_accuracy": 0.5656964641064406,
"num_tokens": 673643.0,
"step": 11300
},
{
"epoch": 21.299294117647058,
"grad_norm": 1.710774302482605,
"learning_rate": 4.927903871829106e-06,
"loss": 1.1746,
"mean_token_accuracy": 0.5554309643805027,
"num_tokens": 686757.0,
"step": 11310
},
{
"epoch": 21.318117647058823,
"grad_norm": 0.5914443731307983,
"learning_rate": 4.914552736982644e-06,
"loss": 1.1724,
"mean_token_accuracy": 0.5584981873631477,
"num_tokens": 701156.0,
"step": 11320
},
{
"epoch": 21.33694117647059,
"grad_norm": 0.6047216653823853,
"learning_rate": 4.901201602136182e-06,
"loss": 1.1493,
"mean_token_accuracy": 0.570811814814806,
"num_tokens": 714373.0,
"step": 11330
},
{
"epoch": 21.355764705882354,
"grad_norm": 1.1371484994888306,
"learning_rate": 4.887850467289719e-06,
"loss": 1.1611,
"mean_token_accuracy": 0.5554365783929824,
"num_tokens": 727980.0,
"step": 11340
},
{
"epoch": 21.37458823529412,
"grad_norm": 0.6046891212463379,
"learning_rate": 4.8744993324432585e-06,
"loss": 1.1768,
"mean_token_accuracy": 0.5487030290067196,
"num_tokens": 741661.0,
"step": 11350
},
{
"epoch": 21.39341176470588,
"grad_norm": 1.0406830310821533,
"learning_rate": 4.861148197596796e-06,
"loss": 1.103,
"mean_token_accuracy": 0.5742270287126303,
"num_tokens": 754818.0,
"step": 11360
},
{
"epoch": 21.412235294117647,
"grad_norm": 1.8457794189453125,
"learning_rate": 4.847797062750334e-06,
"loss": 1.1727,
"mean_token_accuracy": 0.5553502965718508,
"num_tokens": 767996.0,
"step": 11370
},
{
"epoch": 21.431058823529412,
"grad_norm": 1.229186773300171,
"learning_rate": 4.834445927903872e-06,
"loss": 1.1148,
"mean_token_accuracy": 0.5711336594074965,
"num_tokens": 781314.0,
"step": 11380
},
{
"epoch": 21.449882352941177,
"grad_norm": 2.4556403160095215,
"learning_rate": 4.8210947930574106e-06,
"loss": 1.2005,
"mean_token_accuracy": 0.5530955422669649,
"num_tokens": 794540.0,
"step": 11390
},
{
"epoch": 21.468705882352943,
"grad_norm": 0.9994168281555176,
"learning_rate": 4.807743658210948e-06,
"loss": 1.1588,
"mean_token_accuracy": 0.5520365055650472,
"num_tokens": 807577.0,
"step": 11400
},
{
"epoch": 11.06690909090909,
"grad_norm": 2.4615001678466797,
"learning_rate": 2.7042869240445714e-06,
"loss": 0.9921,
"mean_token_accuracy": 0.6851526271551848,
"num_tokens": 10722.0,
"step": 11410
},
{
"epoch": 11.07660606060606,
"grad_norm": 2.1516900062561035,
"learning_rate": 2.689960187285652e-06,
"loss": 0.9928,
"mean_token_accuracy": 0.6662691086530685,
"num_tokens": 21293.0,
"step": 11420
},
{
"epoch": 11.08630303030303,
"grad_norm": 1.547285556793213,
"learning_rate": 2.675665601616777e-06,
"loss": 0.9335,
"mean_token_accuracy": 0.6725459590554237,
"num_tokens": 32220.0,
"step": 11430
},
{
"epoch": 11.096,
"grad_norm": 1.2127219438552856,
"learning_rate": 2.6614032299085324e-06,
"loss": 1.0357,
"mean_token_accuracy": 0.6421669337898492,
"num_tokens": 43405.0,
"step": 11440
},
{
"epoch": 11.10569696969697,
"grad_norm": 2.609590530395508,
"learning_rate": 2.647173134889831e-06,
"loss": 0.955,
"mean_token_accuracy": 0.6785120502114296,
"num_tokens": 53503.0,
"step": 11450
},
{
"epoch": 11.11539393939394,
"grad_norm": 2.0895426273345947,
"learning_rate": 2.6329753791476143e-06,
"loss": 0.8958,
"mean_token_accuracy": 0.6950253710150719,
"num_tokens": 62932.0,
"step": 11460
},
{
"epoch": 11.12509090909091,
"grad_norm": 1.0643393993377686,
"learning_rate": 2.6188100251265947e-06,
"loss": 0.965,
"mean_token_accuracy": 0.6781762517988682,
"num_tokens": 73725.0,
"step": 11470
},
{
"epoch": 11.13478787878788,
"grad_norm": 0.9910470843315125,
"learning_rate": 2.604677135128972e-06,
"loss": 0.9692,
"mean_token_accuracy": 0.6687733806669712,
"num_tokens": 84995.0,
"step": 11480
},
{
"epoch": 11.144484848484849,
"grad_norm": 1.3115955591201782,
"learning_rate": 2.590576771314166e-06,
"loss": 0.9748,
"mean_token_accuracy": 0.6800978854298592,
"num_tokens": 95231.0,
"step": 11490
},
{
"epoch": 11.154181818181819,
"grad_norm": 1.7872004508972168,
"learning_rate": 2.5765089956985357e-06,
"loss": 0.857,
"mean_token_accuracy": 0.7096298310905695,
"num_tokens": 104730.0,
"step": 11500
},
{
"epoch": 11.163878787878788,
"grad_norm": 1.7525864839553833,
"learning_rate": 2.56247387015511e-06,
"loss": 0.9162,
"mean_token_accuracy": 0.7024530675262213,
"num_tokens": 114691.0,
"step": 11510
},
{
"epoch": 11.173575757575758,
"grad_norm": 1.5869275331497192,
"learning_rate": 2.5484714564133237e-06,
"loss": 0.8835,
"mean_token_accuracy": 0.6953748039901256,
"num_tokens": 124358.0,
"step": 11520
},
{
"epoch": 11.183272727272728,
"grad_norm": 1.2352665662765503,
"learning_rate": 2.534501816058731e-06,
"loss": 0.9839,
"mean_token_accuracy": 0.6811668451875448,
"num_tokens": 135440.0,
"step": 11530
},
{
"epoch": 11.192969696969698,
"grad_norm": 0.9480632543563843,
"learning_rate": 2.5205650105327405e-06,
"loss": 0.9216,
"mean_token_accuracy": 0.6818428047001361,
"num_tokens": 145555.0,
"step": 11540
},
{
"epoch": 11.202666666666667,
"grad_norm": 1.0198525190353394,
"learning_rate": 2.5066611011323505e-06,
"loss": 0.9813,
"mean_token_accuracy": 0.6706477042287589,
"num_tokens": 157378.0,
"step": 11550
},
{
"epoch": 11.212363636363637,
"grad_norm": 0.9845815896987915,
"learning_rate": 2.4927901490098762e-06,
"loss": 0.9984,
"mean_token_accuracy": 0.6655347641557455,
"num_tokens": 168558.0,
"step": 11560
},
{
"epoch": 11.222060606060605,
"grad_norm": 1.9387283325195312,
"learning_rate": 2.4789522151726764e-06,
"loss": 0.9677,
"mean_token_accuracy": 0.6992917202413083,
"num_tokens": 179836.0,
"step": 11570
},
{
"epoch": 11.231757575757575,
"grad_norm": 1.2270708084106445,
"learning_rate": 2.4651473604828903e-06,
"loss": 0.9466,
"mean_token_accuracy": 0.675117377564311,
"num_tokens": 190256.0,
"step": 11580
},
{
"epoch": 11.241454545454545,
"grad_norm": 1.0312174558639526,
"learning_rate": 2.4513756456571667e-06,
"loss": 0.9776,
"mean_token_accuracy": 0.6721729058772326,
"num_tokens": 201487.0,
"step": 11590
},
{
"epoch": 11.251151515151514,
"grad_norm": 1.4611775875091553,
"learning_rate": 2.437637131266396e-06,
"loss": 0.9016,
"mean_token_accuracy": 0.6916485130786896,
"num_tokens": 210683.0,
"step": 11600
},
{
"epoch": 11.260848484848484,
"grad_norm": 1.251896858215332,
"learning_rate": 2.4239318777354593e-06,
"loss": 0.9099,
"mean_token_accuracy": 0.6964191533625126,
"num_tokens": 220616.0,
"step": 11610
},
{
"epoch": 11.270545454545454,
"grad_norm": 1.1485376358032227,
"learning_rate": 2.410259945342929e-06,
"loss": 0.9313,
"mean_token_accuracy": 0.6866546850651503,
"num_tokens": 231221.0,
"step": 11620
},
{
"epoch": 11.280242424242424,
"grad_norm": 1.1417745351791382,
"learning_rate": 2.3966213942208363e-06,
"loss": 0.8653,
"mean_token_accuracy": 0.7093327675014734,
"num_tokens": 241191.0,
"step": 11630
},
{
"epoch": 11.289939393939393,
"grad_norm": 1.0089123249053955,
"learning_rate": 2.383016284354397e-06,
"loss": 0.9424,
"mean_token_accuracy": 0.6817425429821015,
"num_tokens": 250917.0,
"step": 11640
},
{
"epoch": 11.299636363636363,
"grad_norm": 1.3917380571365356,
"learning_rate": 2.369444675581738e-06,
"loss": 0.9331,
"mean_token_accuracy": 0.6756290566176176,
"num_tokens": 261760.0,
"step": 11650
},
{
"epoch": 11.309333333333333,
"grad_norm": 1.3879166841506958,
"learning_rate": 2.355906627593647e-06,
"loss": 0.9807,
"mean_token_accuracy": 0.6554592750966549,
"num_tokens": 272414.0,
"step": 11660
},
{
"epoch": 11.319030303030303,
"grad_norm": 1.2711937427520752,
"learning_rate": 2.342402199933296e-06,
"loss": 0.8541,
"mean_token_accuracy": 0.7178003009408712,
"num_tokens": 282396.0,
"step": 11670
},
{
"epoch": 11.328727272727273,
"grad_norm": 0.8704134821891785,
"learning_rate": 2.3289314519960016e-06,
"loss": 0.9997,
"mean_token_accuracy": 0.6554147530347109,
"num_tokens": 292871.0,
"step": 11680
},
{
"epoch": 11.338424242424242,
"grad_norm": 1.9305732250213623,
"learning_rate": 2.315494443028937e-06,
"loss": 0.9644,
"mean_token_accuracy": 0.6782477792352438,
"num_tokens": 303864.0,
"step": 11690
},
{
"epoch": 11.348121212121212,
"grad_norm": 1.0496376752853394,
"learning_rate": 2.30209123213089e-06,
"loss": 0.9606,
"mean_token_accuracy": 0.6711665719747544,
"num_tokens": 314081.0,
"step": 11700
},
{
"epoch": 11.357818181818182,
"grad_norm": 1.2640137672424316,
"learning_rate": 2.288721878251996e-06,
"loss": 0.9216,
"mean_token_accuracy": 0.6923185490071774,
"num_tokens": 324585.0,
"step": 11710
},
{
"epoch": 11.367515151515152,
"grad_norm": 1.2028977870941162,
"learning_rate": 2.275386440193479e-06,
"loss": 0.8991,
"mean_token_accuracy": 0.70025773383677,
"num_tokens": 334337.0,
"step": 11720
},
{
"epoch": 11.377212121212121,
"grad_norm": 1.366431713104248,
"learning_rate": 2.2620849766073993e-06,
"loss": 0.9379,
"mean_token_accuracy": 0.6744892597198486,
"num_tokens": 344863.0,
"step": 11730
},
{
"epoch": 11.386909090909091,
"grad_norm": 1.1718578338623047,
"learning_rate": 2.248817545996387e-06,
"loss": 0.8827,
"mean_token_accuracy": 0.7153472680598497,
"num_tokens": 354780.0,
"step": 11740
},
{
"epoch": 11.39660606060606,
"grad_norm": 1.7113317251205444,
"learning_rate": 2.235584206713385e-06,
"loss": 0.9333,
"mean_token_accuracy": 0.6829107455909252,
"num_tokens": 365921.0,
"step": 11750
},
{
"epoch": 11.40630303030303,
"grad_norm": 0.9357189536094666,
"learning_rate": 2.2223850169613993e-06,
"loss": 0.9788,
"mean_token_accuracy": 0.6629223726689816,
"num_tokens": 376384.0,
"step": 11760
},
{
"epoch": 11.416,
"grad_norm": 1.9502002000808716,
"learning_rate": 2.209220034793237e-06,
"loss": 0.9126,
"mean_token_accuracy": 0.6891988046467304,
"num_tokens": 385616.0,
"step": 11770
},
{
"epoch": 11.42569696969697,
"grad_norm": 0.9912474751472473,
"learning_rate": 2.1960893181112553e-06,
"loss": 0.9927,
"mean_token_accuracy": 0.6538973189890385,
"num_tokens": 396111.0,
"step": 11780
},
{
"epoch": 11.43539393939394,
"grad_norm": 1.6034260988235474,
"learning_rate": 2.182992924667101e-06,
"loss": 0.9853,
"mean_token_accuracy": 0.658538245409727,
"num_tokens": 407225.0,
"step": 11790
},
{
"epoch": 11.44509090909091,
"grad_norm": 0.7665310502052307,
"learning_rate": 2.1699309120614663e-06,
"loss": 0.9821,
"mean_token_accuracy": 0.6684748906642198,
"num_tokens": 417932.0,
"step": 11800
},
{
"epoch": 11.45478787878788,
"grad_norm": 1.4279521703720093,
"learning_rate": 2.1569033377438243e-06,
"loss": 1.0331,
"mean_token_accuracy": 0.6504704430699348,
"num_tokens": 429943.0,
"step": 11810
},
{
"epoch": 11.46448484848485,
"grad_norm": 1.4924397468566895,
"learning_rate": 2.1439102590121807e-06,
"loss": 0.9594,
"mean_token_accuracy": 0.6776580080389977,
"num_tokens": 440423.0,
"step": 11820
},
{
"epoch": 11.474181818181819,
"grad_norm": 1.0187861919403076,
"learning_rate": 2.1309517330128217e-06,
"loss": 1.0211,
"mean_token_accuracy": 0.6710641365498304,
"num_tokens": 451528.0,
"step": 11830
},
{
"epoch": 11.483878787878789,
"grad_norm": 1.2168591022491455,
"learning_rate": 2.1180278167400726e-06,
"loss": 0.9256,
"mean_token_accuracy": 0.6835467047989369,
"num_tokens": 461950.0,
"step": 11840
},
{
"epoch": 11.493575757575758,
"grad_norm": 0.7139029502868652,
"learning_rate": 2.105138567036026e-06,
"loss": 0.984,
"mean_token_accuracy": 0.6716462299227715,
"num_tokens": 472203.0,
"step": 11850
},
{
"epoch": 11.503272727272726,
"grad_norm": 0.9237242341041565,
"learning_rate": 2.09228404059031e-06,
"loss": 1.0324,
"mean_token_accuracy": 0.653912478685379,
"num_tokens": 484302.0,
"step": 11860
},
{
"epoch": 11.512969696969696,
"grad_norm": 1.539085865020752,
"learning_rate": 2.0794642939398315e-06,
"loss": 0.9019,
"mean_token_accuracy": 0.6944774236530066,
"num_tokens": 494362.0,
"step": 11870
},
{
"epoch": 11.522666666666666,
"grad_norm": 0.7662031054496765,
"learning_rate": 2.066679383468524e-06,
"loss": 0.9687,
"mean_token_accuracy": 0.6761994324624538,
"num_tokens": 505137.0,
"step": 11880
},
{
"epoch": 11.532363636363636,
"grad_norm": 1.0061405897140503,
"learning_rate": 2.0539293654071167e-06,
"loss": 0.9671,
"mean_token_accuracy": 0.669762023538351,
"num_tokens": 515456.0,
"step": 11890
},
{
"epoch": 11.542060606060605,
"grad_norm": 1.5532357692718506,
"learning_rate": 2.0412142958328586e-06,
"loss": 0.8768,
"mean_token_accuracy": 0.7063564002513886,
"num_tokens": 525883.0,
"step": 11900
},
{
"epoch": 11.551757575757575,
"grad_norm": 0.8483320474624634,
"learning_rate": 2.028534230669296e-06,
"loss": 0.9502,
"mean_token_accuracy": 0.6627003367990255,
"num_tokens": 537384.0,
"step": 11910
},
{
"epoch": 11.561454545454545,
"grad_norm": 0.933795154094696,
"learning_rate": 2.015889225686022e-06,
"loss": 0.9788,
"mean_token_accuracy": 0.6608807422220707,
"num_tokens": 548906.0,
"step": 11920
},
{
"epoch": 11.571151515151515,
"grad_norm": 2.9793217182159424,
"learning_rate": 2.0032793364984225e-06,
"loss": 0.9872,
"mean_token_accuracy": 0.6697162009775639,
"num_tokens": 559505.0,
"step": 11930
},
{
"epoch": 11.580848484848485,
"grad_norm": 1.162315845489502,
"learning_rate": 1.9907046185674374e-06,
"loss": 0.8945,
"mean_token_accuracy": 0.7048578035086394,
"num_tokens": 569706.0,
"step": 11940
},
{
"epoch": 11.590545454545454,
"grad_norm": 0.6796969175338745,
"learning_rate": 1.978165127199313e-06,
"loss": 1.0175,
"mean_token_accuracy": 0.6569722048938275,
"num_tokens": 580920.0,
"step": 11950
},
{
"epoch": 11.600242424242424,
"grad_norm": 0.8585827946662903,
"learning_rate": 1.9656609175453724e-06,
"loss": 0.9786,
"mean_token_accuracy": 0.6617213696241379,
"num_tokens": 591801.0,
"step": 11960
},
{
"epoch": 11.609939393939394,
"grad_norm": 2.407949924468994,
"learning_rate": 1.9531920446017514e-06,
"loss": 0.8688,
"mean_token_accuracy": 0.7031644247472286,
"num_tokens": 601494.0,
"step": 11970
},
{
"epoch": 11.619636363636364,
"grad_norm": 1.2153403759002686,
"learning_rate": 1.940758563209172e-06,
"loss": 0.9819,
"mean_token_accuracy": 0.667534577473998,
"num_tokens": 612759.0,
"step": 11980
},
{
"epoch": 11.629333333333333,
"grad_norm": 1.6503994464874268,
"learning_rate": 1.928360528052695e-06,
"loss": 0.9575,
"mean_token_accuracy": 0.6706000864505768,
"num_tokens": 623931.0,
"step": 11990
},
{
"epoch": 11.639030303030303,
"grad_norm": 2.4140963554382324,
"learning_rate": 1.9159979936614813e-06,
"loss": 0.927,
"mean_token_accuracy": 0.6892816323786974,
"num_tokens": 634238.0,
"step": 12000
},
{
"epoch": 11.648727272727273,
"grad_norm": 1.6788582801818848,
"learning_rate": 1.9036710144085568e-06,
"loss": 0.9357,
"mean_token_accuracy": 0.697110791504383,
"num_tokens": 644919.0,
"step": 12010
},
{
"epoch": 11.658424242424243,
"grad_norm": 1.9367320537567139,
"learning_rate": 1.891379644510566e-06,
"loss": 1.038,
"mean_token_accuracy": 0.6408150486648083,
"num_tokens": 655529.0,
"step": 12020
},
{
"epoch": 11.668121212121212,
"grad_norm": 1.541839361190796,
"learning_rate": 1.8791239380275262e-06,
"loss": 0.9451,
"mean_token_accuracy": 0.6883293610066176,
"num_tokens": 665483.0,
"step": 12030
},
{
"epoch": 11.677818181818182,
"grad_norm": 0.8563993573188782,
"learning_rate": 1.8669039488626162e-06,
"loss": 0.928,
"mean_token_accuracy": 0.6791775230318308,
"num_tokens": 676255.0,
"step": 12040
},
{
"epoch": 11.687515151515152,
"grad_norm": 1.097931981086731,
"learning_rate": 1.8547197307619102e-06,
"loss": 1.0107,
"mean_token_accuracy": 0.645743177831173,
"num_tokens": 687475.0,
"step": 12050
},
{
"epoch": 11.697212121212122,
"grad_norm": 1.8921289443969727,
"learning_rate": 1.8425713373141597e-06,
"loss": 0.9458,
"mean_token_accuracy": 0.689141795784235,
"num_tokens": 697582.0,
"step": 12060
},
{
"epoch": 11.706909090909091,
"grad_norm": 1.5931812524795532,
"learning_rate": 1.830458821950546e-06,
"loss": 0.8759,
"mean_token_accuracy": 0.6943042069673538,
"num_tokens": 707522.0,
"step": 12070
},
{
"epoch": 11.716606060606061,
"grad_norm": 1.7966309785842896,
"learning_rate": 1.8183822379444604e-06,
"loss": 0.9538,
"mean_token_accuracy": 0.6829646300524473,
"num_tokens": 718288.0,
"step": 12080
},
{
"epoch": 11.726303030303031,
"grad_norm": 1.5532159805297852,
"learning_rate": 1.8063416384112532e-06,
"loss": 0.9123,
"mean_token_accuracy": 0.6941913302987814,
"num_tokens": 727672.0,
"step": 12090
},
{
"epoch": 11.736,
"grad_norm": 0.9339669942855835,
"learning_rate": 1.7943370763080093e-06,
"loss": 0.9778,
"mean_token_accuracy": 0.652100894600153,
"num_tokens": 738798.0,
"step": 12100
},
{
"epoch": 11.74569696969697,
"grad_norm": 1.5431864261627197,
"learning_rate": 1.7823686044333134e-06,
"loss": 0.9883,
"mean_token_accuracy": 0.6828689679503441,
"num_tokens": 749829.0,
"step": 12110
},
{
"epoch": 11.75539393939394,
"grad_norm": 1.6083624362945557,
"learning_rate": 1.7704362754270143e-06,
"loss": 0.8534,
"mean_token_accuracy": 0.7035974383354187,
"num_tokens": 759547.0,
"step": 12120
},
{
"epoch": 11.765090909090908,
"grad_norm": 1.150327444076538,
"learning_rate": 1.7585401417700076e-06,
"loss": 0.8699,
"mean_token_accuracy": 0.6958384934812785,
"num_tokens": 769350.0,
"step": 12130
},
{
"epoch": 11.77478787878788,
"grad_norm": 1.8415894508361816,
"learning_rate": 1.7466802557839834e-06,
"loss": 0.9496,
"mean_token_accuracy": 0.6662904676049948,
"num_tokens": 780015.0,
"step": 12140
},
{
"epoch": 11.784484848484848,
"grad_norm": 2.885213851928711,
"learning_rate": 1.7348566696312108e-06,
"loss": 0.9546,
"mean_token_accuracy": 0.6689011044800282,
"num_tokens": 790176.0,
"step": 12150
},
{
"epoch": 11.794181818181817,
"grad_norm": 1.567074179649353,
"learning_rate": 1.7230694353143041e-06,
"loss": 0.9247,
"mean_token_accuracy": 0.69071399345994,
"num_tokens": 801125.0,
"step": 12160
},
{
"epoch": 11.803878787878787,
"grad_norm": 0.8478710651397705,
"learning_rate": 1.7113186046759956e-06,
"loss": 0.9342,
"mean_token_accuracy": 0.6843322183936834,
"num_tokens": 811193.0,
"step": 12170
},
{
"epoch": 11.813575757575757,
"grad_norm": 1.2154415845870972,
"learning_rate": 1.6996042293989046e-06,
"loss": 0.8842,
"mean_token_accuracy": 0.7175424035638571,
"num_tokens": 822012.0,
"step": 12180
},
{
"epoch": 11.823272727272727,
"grad_norm": 1.4030102491378784,
"learning_rate": 1.6879263610053109e-06,
"loss": 0.9184,
"mean_token_accuracy": 0.6776260420680046,
"num_tokens": 832565.0,
"step": 12190
},
{
"epoch": 11.832969696969696,
"grad_norm": 0.6021126508712769,
"learning_rate": 1.6762850508569383e-06,
"loss": 0.9,
"mean_token_accuracy": 0.6897207599133253,
"num_tokens": 843002.0,
"step": 12200
},
{
"epoch": 11.842666666666666,
"grad_norm": 1.1585458517074585,
"learning_rate": 1.6646803501547104e-06,
"loss": 0.9101,
"mean_token_accuracy": 0.6929288487881422,
"num_tokens": 853533.0,
"step": 12210
},
{
"epoch": 11.852363636363636,
"grad_norm": 0.7529911398887634,
"learning_rate": 1.653112309938537e-06,
"loss": 0.9389,
"mean_token_accuracy": 0.6742060914635658,
"num_tokens": 864186.0,
"step": 12220
},
{
"epoch": 11.862060606060606,
"grad_norm": 1.9934836626052856,
"learning_rate": 1.6415809810870854e-06,
"loss": 0.9119,
"mean_token_accuracy": 0.6902973093092442,
"num_tokens": 875834.0,
"step": 12230
},
{
"epoch": 11.871757575757576,
"grad_norm": 1.0597401857376099,
"learning_rate": 1.6300864143175665e-06,
"loss": 0.9318,
"mean_token_accuracy": 0.6818343084305525,
"num_tokens": 886161.0,
"step": 12240
},
{
"epoch": 11.881454545454545,
"grad_norm": 1.6065829992294312,
"learning_rate": 1.6186286601854962e-06,
"loss": 0.9453,
"mean_token_accuracy": 0.6795123651623726,
"num_tokens": 895476.0,
"step": 12250
},
{
"epoch": 11.891151515151515,
"grad_norm": 1.7515671253204346,
"learning_rate": 1.6072077690844824e-06,
"loss": 0.9692,
"mean_token_accuracy": 0.6692648060619831,
"num_tokens": 906427.0,
"step": 12260
},
{
"epoch": 11.900848484848485,
"grad_norm": 1.2341034412384033,
"learning_rate": 1.5958237912460028e-06,
"loss": 0.9293,
"mean_token_accuracy": 0.6872673355042934,
"num_tokens": 916474.0,
"step": 12270
},
{
"epoch": 11.910545454545455,
"grad_norm": 1.444577693939209,
"learning_rate": 1.5844767767391799e-06,
"loss": 0.8847,
"mean_token_accuracy": 0.6981671530753374,
"num_tokens": 926428.0,
"step": 12280
},
{
"epoch": 11.920242424242424,
"grad_norm": 1.0555857419967651,
"learning_rate": 1.5731667754705716e-06,
"loss": 0.9736,
"mean_token_accuracy": 0.6644821926951409,
"num_tokens": 936011.0,
"step": 12290
},
{
"epoch": 11.929939393939394,
"grad_norm": 1.8039774894714355,
"learning_rate": 1.5618938371839366e-06,
"loss": 0.9717,
"mean_token_accuracy": 0.6748083829879761,
"num_tokens": 946502.0,
"step": 12300
},
{
"epoch": 11.939636363636364,
"grad_norm": 1.572402834892273,
"learning_rate": 1.550658011460019e-06,
"loss": 0.9184,
"mean_token_accuracy": 0.7099429033696651,
"num_tokens": 956754.0,
"step": 12310
},
{
"epoch": 11.949333333333334,
"grad_norm": 0.8282158374786377,
"learning_rate": 1.5394593477163456e-06,
"loss": 0.8845,
"mean_token_accuracy": 0.7014227926731109,
"num_tokens": 966536.0,
"step": 12320
},
{
"epoch": 11.959030303030303,
"grad_norm": 1.0083385705947876,
"learning_rate": 1.5282978952069904e-06,
"loss": 0.9763,
"mean_token_accuracy": 0.670442745834589,
"num_tokens": 976841.0,
"step": 12330
},
{
"epoch": 11.968727272727273,
"grad_norm": 2.277254581451416,
"learning_rate": 1.5171737030223632e-06,
"loss": 1.0049,
"mean_token_accuracy": 0.6729221884161234,
"num_tokens": 987549.0,
"step": 12340
},
{
"epoch": 11.978424242424243,
"grad_norm": 0.8625606894493103,
"learning_rate": 1.5060868200889955e-06,
"loss": 0.8779,
"mean_token_accuracy": 0.7104882929474116,
"num_tokens": 996977.0,
"step": 12350
},
{
"epoch": 11.988121212121213,
"grad_norm": 1.0558991432189941,
"learning_rate": 1.4950372951693316e-06,
"loss": 0.9398,
"mean_token_accuracy": 0.678413325548172,
"num_tokens": 1007009.0,
"step": 12360
},
{
"epoch": 11.997818181818182,
"grad_norm": 1.0509843826293945,
"learning_rate": 1.4840251768614987e-06,
"loss": 0.8752,
"mean_token_accuracy": 0.7061600238084793,
"num_tokens": 1016337.0,
"step": 12370
},
{
"epoch": 12.007757575757577,
"grad_norm": 1.150305986404419,
"learning_rate": 1.473050513599107e-06,
"loss": 1.0264,
"mean_token_accuracy": 0.6960650755137932,
"num_tokens": 1027200.0,
"step": 12380
},
{
"epoch": 12.017454545454546,
"grad_norm": 0.8161555528640747,
"learning_rate": 1.462113353651029e-06,
"loss": 0.9927,
"mean_token_accuracy": 0.6727097641676665,
"num_tokens": 1037572.0,
"step": 12390
},
{
"epoch": 12.027151515151516,
"grad_norm": 1.7541327476501465,
"learning_rate": 1.4512137451211884e-06,
"loss": 0.8799,
"mean_token_accuracy": 0.6972331315279007,
"num_tokens": 1046891.0,
"step": 12400
},
{
"epoch": 12.036848484848484,
"grad_norm": 0.9530600309371948,
"learning_rate": 1.4403517359483577e-06,
"loss": 0.9435,
"mean_token_accuracy": 0.68136284686625,
"num_tokens": 10434.0,
"step": 12410
},
{
"epoch": 12.046545454545454,
"grad_norm": 1.3567638397216797,
"learning_rate": 1.42952737390593e-06,
"loss": 0.9598,
"mean_token_accuracy": 0.676530422642827,
"num_tokens": 22038.0,
"step": 12420
},
{
"epoch": 12.056242424242424,
"grad_norm": 1.3663750886917114,
"learning_rate": 1.4187407066017245e-06,
"loss": 0.9701,
"mean_token_accuracy": 0.6606147531419992,
"num_tokens": 32741.0,
"step": 12430
},
{
"epoch": 12.065939393939393,
"grad_norm": 1.086794376373291,
"learning_rate": 1.4079917814777667e-06,
"loss": 0.8693,
"mean_token_accuracy": 0.7050681680440902,
"num_tokens": 41500.0,
"step": 12440
},
{
"epoch": 12.075636363636363,
"grad_norm": 0.9989749193191528,
"learning_rate": 1.3972806458100885e-06,
"loss": 0.8294,
"mean_token_accuracy": 0.7239202216267586,
"num_tokens": 50782.0,
"step": 12450
},
{
"epoch": 12.085333333333333,
"grad_norm": 1.2325557470321655,
"learning_rate": 1.3866073467085127e-06,
"loss": 0.8807,
"mean_token_accuracy": 0.6962772708386182,
"num_tokens": 60816.0,
"step": 12460
},
{
"epoch": 12.095030303030303,
"grad_norm": 1.5396286249160767,
"learning_rate": 1.3759719311164477e-06,
"loss": 0.995,
"mean_token_accuracy": 0.6646735660731793,
"num_tokens": 71812.0,
"step": 12470
},
{
"epoch": 12.104727272727272,
"grad_norm": 1.008445382118225,
"learning_rate": 1.3653744458106876e-06,
"loss": 0.9715,
"mean_token_accuracy": 0.6767258770763874,
"num_tokens": 83843.0,
"step": 12480
},
{
"epoch": 12.114424242424242,
"grad_norm": 1.6044663190841675,
"learning_rate": 1.3548149374011986e-06,
"loss": 0.9437,
"mean_token_accuracy": 0.701893288269639,
"num_tokens": 94582.0,
"step": 12490
},
{
"epoch": 12.124121212121212,
"grad_norm": 1.4867864847183228,
"learning_rate": 1.3442934523309137e-06,
"loss": 0.9598,
"mean_token_accuracy": 0.678819801285863,
"num_tokens": 106037.0,
"step": 12500
},
{
"epoch": 12.133818181818182,
"grad_norm": 1.6262177228927612,
"learning_rate": 1.3338100368755346e-06,
"loss": 0.9829,
"mean_token_accuracy": 0.6883869960904121,
"num_tokens": 117393.0,
"step": 12510
},
{
"epoch": 12.143515151515151,
"grad_norm": 2.58561635017395,
"learning_rate": 1.3233647371433222e-06,
"loss": 0.9191,
"mean_token_accuracy": 0.6864805597811937,
"num_tokens": 127326.0,
"step": 12520
},
{
"epoch": 12.153212121212121,
"grad_norm": 1.6916279792785645,
"learning_rate": 1.3129575990749e-06,
"loss": 0.9546,
"mean_token_accuracy": 0.6888086255639791,
"num_tokens": 137539.0,
"step": 12530
},
{
"epoch": 12.162909090909091,
"grad_norm": 1.5663442611694336,
"learning_rate": 1.3025886684430467e-06,
"loss": 0.9829,
"mean_token_accuracy": 0.6744012456387282,
"num_tokens": 148648.0,
"step": 12540
},
{
"epoch": 12.17260606060606,
"grad_norm": 1.4812220335006714,
"learning_rate": 1.2922579908524946e-06,
"loss": 0.9033,
"mean_token_accuracy": 0.7043029896914959,
"num_tokens": 158590.0,
"step": 12550
},
{
"epoch": 12.18230303030303,
"grad_norm": 1.7226941585540771,
"learning_rate": 1.2819656117397328e-06,
"loss": 0.9398,
"mean_token_accuracy": 0.6753247026354074,
"num_tokens": 170030.0,
"step": 12560
},
{
"epoch": 12.192,
"grad_norm": 0.7470999956130981,
"learning_rate": 1.2717115763728083e-06,
"loss": 0.9315,
"mean_token_accuracy": 0.6908956177532672,
"num_tokens": 179668.0,
"step": 12570
},
{
"epoch": 12.20169696969697,
"grad_norm": 1.0085124969482422,
"learning_rate": 1.2614959298511231e-06,
"loss": 0.9663,
"mean_token_accuracy": 0.6647142685949803,
"num_tokens": 190351.0,
"step": 12580
},
{
"epoch": 12.21139393939394,
"grad_norm": 0.801249623298645,
"learning_rate": 1.2513187171052288e-06,
"loss": 1.0055,
"mean_token_accuracy": 0.6591165266931057,
"num_tokens": 200784.0,
"step": 12590
},
{
"epoch": 12.22109090909091,
"grad_norm": 1.1452405452728271,
"learning_rate": 1.2411799828966497e-06,
"loss": 0.9374,
"mean_token_accuracy": 0.6782014291733504,
"num_tokens": 210672.0,
"step": 12600
},
{
"epoch": 12.23078787878788,
"grad_norm": 1.4320217370986938,
"learning_rate": 1.2310797718176658e-06,
"loss": 0.9016,
"mean_token_accuracy": 0.6874732073396445,
"num_tokens": 220175.0,
"step": 12610
},
{
"epoch": 12.240484848484849,
"grad_norm": 1.0549358129501343,
"learning_rate": 1.221018128291127e-06,
"loss": 0.9145,
"mean_token_accuracy": 0.6930529691278935,
"num_tokens": 230511.0,
"step": 12620
},
{
"epoch": 12.250181818181819,
"grad_norm": 0.7888785004615784,
"learning_rate": 1.2109950965702532e-06,
"loss": 0.962,
"mean_token_accuracy": 0.6716390445828437,
"num_tokens": 240893.0,
"step": 12630
},
{
"epoch": 12.259878787878788,
"grad_norm": 2.5039796829223633,
"learning_rate": 1.2010107207384437e-06,
"loss": 0.8335,
"mean_token_accuracy": 0.7178800087422132,
"num_tokens": 250554.0,
"step": 12640
},
{
"epoch": 12.269575757575758,
"grad_norm": 1.5427664518356323,
"learning_rate": 1.1910650447090798e-06,
"loss": 1.0129,
"mean_token_accuracy": 0.6641611870378256,
"num_tokens": 261026.0,
"step": 12650
},
{
"epoch": 12.279272727272728,
"grad_norm": 1.7952816486358643,
"learning_rate": 1.1811581122253335e-06,
"loss": 0.9801,
"mean_token_accuracy": 0.6731622900813818,
"num_tokens": 271855.0,
"step": 12660
},
{
"epoch": 12.288969696969698,
"grad_norm": 1.4959173202514648,
"learning_rate": 1.171289966859973e-06,
"loss": 0.9324,
"mean_token_accuracy": 0.6812974836677312,
"num_tokens": 281878.0,
"step": 12670
},
{
"epoch": 12.298666666666668,
"grad_norm": 0.7014359831809998,
"learning_rate": 1.1614606520151716e-06,
"loss": 0.9333,
"mean_token_accuracy": 0.6856089878827334,
"num_tokens": 292658.0,
"step": 12680
},
{
"epoch": 12.308363636363636,
"grad_norm": 0.6972899436950684,
"learning_rate": 1.1516702109223243e-06,
"loss": 0.8949,
"mean_token_accuracy": 0.6989801757037639,
"num_tokens": 302011.0,
"step": 12690
},
{
"epoch": 12.318060606060605,
"grad_norm": 1.2687288522720337,
"learning_rate": 1.1419186866418452e-06,
"loss": 0.9406,
"mean_token_accuracy": 0.6923393607139587,
"num_tokens": 312147.0,
"step": 12700
},
{
"epoch": 12.327757575757575,
"grad_norm": 1.3525540828704834,
"learning_rate": 1.1322061220629855e-06,
"loss": 0.8962,
"mean_token_accuracy": 0.684671938046813,
"num_tokens": 323474.0,
"step": 12710
},
{
"epoch": 12.337454545454545,
"grad_norm": 1.2294106483459473,
"learning_rate": 1.122532559903644e-06,
"loss": 1.0468,
"mean_token_accuracy": 0.628922751918435,
"num_tokens": 334923.0,
"step": 12720
},
{
"epoch": 12.347151515151515,
"grad_norm": 1.096246600151062,
"learning_rate": 1.1128980427101766e-06,
"loss": 0.9059,
"mean_token_accuracy": 0.683203124627471,
"num_tokens": 344506.0,
"step": 12730
},
{
"epoch": 12.356848484848484,
"grad_norm": 1.3699408769607544,
"learning_rate": 1.1033026128572156e-06,
"loss": 0.9996,
"mean_token_accuracy": 0.6612563081085682,
"num_tokens": 355007.0,
"step": 12740
},
{
"epoch": 12.366545454545454,
"grad_norm": 1.7355482578277588,
"learning_rate": 1.0937463125474724e-06,
"loss": 0.9761,
"mean_token_accuracy": 0.6721325762569904,
"num_tokens": 365829.0,
"step": 12750
},
{
"epoch": 12.376242424242424,
"grad_norm": 2.603883981704712,
"learning_rate": 1.084229183811566e-06,
"loss": 0.9383,
"mean_token_accuracy": 0.6898716945201159,
"num_tokens": 376436.0,
"step": 12760
},
{
"epoch": 12.385939393939394,
"grad_norm": 1.0586647987365723,
"learning_rate": 1.0747512685078264e-06,
"loss": 0.9564,
"mean_token_accuracy": 0.6626970659941435,
"num_tokens": 387389.0,
"step": 12770
},
{
"epoch": 12.395636363636363,
"grad_norm": 1.6182094812393188,
"learning_rate": 1.0653126083221143e-06,
"loss": 0.9003,
"mean_token_accuracy": 0.6970617674291134,
"num_tokens": 397693.0,
"step": 12780
},
{
"epoch": 12.405333333333333,
"grad_norm": 1.9159958362579346,
"learning_rate": 1.05591324476764e-06,
"loss": 0.9911,
"mean_token_accuracy": 0.6668812599033117,
"num_tokens": 408904.0,
"step": 12790
},
{
"epoch": 12.415030303030303,
"grad_norm": 1.2994496822357178,
"learning_rate": 1.046553219184776e-06,
"loss": 0.8753,
"mean_token_accuracy": 0.7094687633216381,
"num_tokens": 418860.0,
"step": 12800
},
{
"epoch": 12.424727272727273,
"grad_norm": 1.3715529441833496,
"learning_rate": 1.0372325727408838e-06,
"loss": 0.9217,
"mean_token_accuracy": 0.6802921980619431,
"num_tokens": 429236.0,
"step": 12810
},
{
"epoch": 12.434424242424242,
"grad_norm": 0.981478750705719,
"learning_rate": 1.0279513464301204e-06,
"loss": 0.9377,
"mean_token_accuracy": 0.6767802778631449,
"num_tokens": 439169.0,
"step": 12820
},
{
"epoch": 12.444121212121212,
"grad_norm": 0.9200496077537537,
"learning_rate": 1.0187095810732705e-06,
"loss": 1.0123,
"mean_token_accuracy": 0.6698127511888743,
"num_tokens": 450423.0,
"step": 12830
},
{
"epoch": 12.453818181818182,
"grad_norm": 1.1707184314727783,
"learning_rate": 1.0095073173175552e-06,
"loss": 0.9562,
"mean_token_accuracy": 0.6921768002212048,
"num_tokens": 461570.0,
"step": 12840
},
{
"epoch": 12.463515151515152,
"grad_norm": 0.8096593022346497,
"learning_rate": 1.0003445956364666e-06,
"loss": 0.9273,
"mean_token_accuracy": 0.6814159829169512,
"num_tokens": 471981.0,
"step": 12850
},
{
"epoch": 12.473212121212121,
"grad_norm": 1.0456788539886475,
"learning_rate": 9.912214563295787e-07,
"loss": 0.9224,
"mean_token_accuracy": 0.68552374728024,
"num_tokens": 482091.0,
"step": 12860
},
{
"epoch": 12.482909090909091,
"grad_norm": 1.2879787683486938,
"learning_rate": 9.821379395223684e-07,
"loss": 0.9833,
"mean_token_accuracy": 0.6662912800908088,
"num_tokens": 493252.0,
"step": 12870
},
{
"epoch": 12.492606060606061,
"grad_norm": 1.3891626596450806,
"learning_rate": 9.730940851660554e-07,
"loss": 0.9407,
"mean_token_accuracy": 0.7015823908150196,
"num_tokens": 504067.0,
"step": 12880
},
{
"epoch": 12.50230303030303,
"grad_norm": 1.663533329963684,
"learning_rate": 9.640899330374088e-07,
"loss": 0.8911,
"mean_token_accuracy": 0.6906427904963494,
"num_tokens": 514270.0,
"step": 12890
},
{
"epoch": 12.512,
"grad_norm": 1.9871175289154053,
"learning_rate": 9.55125522738579e-07,
"loss": 0.9259,
"mean_token_accuracy": 0.6906178455799818,
"num_tokens": 524617.0,
"step": 12900
},
{
"epoch": 12.52169696969697,
"grad_norm": 0.9362130165100098,
"learning_rate": 9.462008936969258e-07,
"loss": 0.9653,
"mean_token_accuracy": 0.6761426538228988,
"num_tokens": 536008.0,
"step": 12910
},
{
"epoch": 12.53139393939394,
"grad_norm": 1.086140513420105,
"learning_rate": 9.373160851648422e-07,
"loss": 0.8916,
"mean_token_accuracy": 0.6984883040189743,
"num_tokens": 545747.0,
"step": 12920
},
{
"epoch": 12.54109090909091,
"grad_norm": 1.05403470993042,
"learning_rate": 9.28471136219582e-07,
"loss": 0.9704,
"mean_token_accuracy": 0.6681053042411804,
"num_tokens": 556761.0,
"step": 12930
},
{
"epoch": 12.55078787878788,
"grad_norm": 0.9770132303237915,
"learning_rate": 9.196660857630857e-07,
"loss": 0.9625,
"mean_token_accuracy": 0.6729031853377819,
"num_tokens": 566793.0,
"step": 12940
},
{
"epoch": 12.56048484848485,
"grad_norm": 2.408095598220825,
"learning_rate": 9.109009725218165e-07,
"loss": 0.9268,
"mean_token_accuracy": 0.6797478631138801,
"num_tokens": 577467.0,
"step": 12950
},
{
"epoch": 12.570181818181819,
"grad_norm": 1.0821237564086914,
"learning_rate": 9.021758350465804e-07,
"loss": 1.0222,
"mean_token_accuracy": 0.6477519739419222,
"num_tokens": 588108.0,
"step": 12960
},
{
"epoch": 12.579878787878787,
"grad_norm": 0.7974284887313843,
"learning_rate": 8.93490711712367e-07,
"loss": 0.9717,
"mean_token_accuracy": 0.6615799587219954,
"num_tokens": 598348.0,
"step": 12970
},
{
"epoch": 12.589575757575757,
"grad_norm": 0.9920361638069153,
"learning_rate": 8.848456407181715e-07,
"loss": 0.9195,
"mean_token_accuracy": 0.6796383894979954,
"num_tokens": 607847.0,
"step": 12980
},
{
"epoch": 12.599272727272727,
"grad_norm": 1.929929494857788,
"learning_rate": 8.762406600868301e-07,
"loss": 0.9424,
"mean_token_accuracy": 0.6750466857105494,
"num_tokens": 618641.0,
"step": 12990
},
{
"epoch": 12.608969696969696,
"grad_norm": 0.9798093438148499,
"learning_rate": 8.676758076648562e-07,
"loss": 0.9802,
"mean_token_accuracy": 0.6532435789704323,
"num_tokens": 629445.0,
"step": 13000
},
{
"epoch": 12.618666666666666,
"grad_norm": 1.7001301050186157,
"learning_rate": 8.59151121122268e-07,
"loss": 0.9055,
"mean_token_accuracy": 0.6827256765216589,
"num_tokens": 640368.0,
"step": 13010
},
{
"epoch": 12.628363636363636,
"grad_norm": 1.0197906494140625,
"learning_rate": 8.506666379524275e-07,
"loss": 0.9016,
"mean_token_accuracy": 0.6827419150620699,
"num_tokens": 650484.0,
"step": 13020
},
{
"epoch": 12.638060606060606,
"grad_norm": 2.6649887561798096,
"learning_rate": 8.4222239547187e-07,
"loss": 0.892,
"mean_token_accuracy": 0.6925595041364432,
"num_tokens": 660702.0,
"step": 13030
},
{
"epoch": 12.647757575757575,
"grad_norm": 1.020989179611206,
"learning_rate": 8.338184308201535e-07,
"loss": 0.9017,
"mean_token_accuracy": 0.6905462071299553,
"num_tokens": 671682.0,
"step": 13040
},
{
"epoch": 12.657454545454545,
"grad_norm": 1.4303945302963257,
"learning_rate": 8.254547809596747e-07,
"loss": 0.9703,
"mean_token_accuracy": 0.6805687319487334,
"num_tokens": 682100.0,
"step": 13050
},
{
"epoch": 12.667151515151515,
"grad_norm": 1.8320350646972656,
"learning_rate": 8.171314826755228e-07,
"loss": 0.9739,
"mean_token_accuracy": 0.66879703104496,
"num_tokens": 692660.0,
"step": 13060
},
{
"epoch": 12.676848484848485,
"grad_norm": 0.9438029527664185,
"learning_rate": 8.088485725753114e-07,
"loss": 0.9212,
"mean_token_accuracy": 0.6848585486412049,
"num_tokens": 702875.0,
"step": 13070
},
{
"epoch": 12.686545454545454,
"grad_norm": 2.9450020790100098,
"learning_rate": 8.006060870890165e-07,
"loss": 0.876,
"mean_token_accuracy": 0.6980018597096205,
"num_tokens": 712292.0,
"step": 13080
},
{
"epoch": 12.696242424242424,
"grad_norm": 1.4857258796691895,
"learning_rate": 7.924040624688245e-07,
"loss": 0.8641,
"mean_token_accuracy": 0.7006300635635853,
"num_tokens": 722244.0,
"step": 13090
},
{
"epoch": 12.705939393939394,
"grad_norm": 1.02292799949646,
"learning_rate": 7.842425347889582e-07,
"loss": 0.9976,
"mean_token_accuracy": 0.6541789300739765,
"num_tokens": 733252.0,
"step": 13100
},
{
"epoch": 12.715636363636364,
"grad_norm": 1.1031875610351562,
"learning_rate": 7.761215399455324e-07,
"loss": 0.9232,
"mean_token_accuracy": 0.6899745035916567,
"num_tokens": 744027.0,
"step": 13110
},
{
"epoch": 12.725333333333333,
"grad_norm": 1.4371963739395142,
"learning_rate": 7.680411136563837e-07,
"loss": 0.9818,
"mean_token_accuracy": 0.6558696981519461,
"num_tokens": 754156.0,
"step": 13120
},
{
"epoch": 12.735030303030303,
"grad_norm": 1.3838204145431519,
"learning_rate": 7.600012914609301e-07,
"loss": 0.912,
"mean_token_accuracy": 0.7075278196483851,
"num_tokens": 763732.0,
"step": 13130
},
{
"epoch": 12.744727272727273,
"grad_norm": 0.7802479267120361,
"learning_rate": 7.520021087199925e-07,
"loss": 0.916,
"mean_token_accuracy": 0.6960792735219001,
"num_tokens": 773456.0,
"step": 13140
},
{
"epoch": 12.754424242424243,
"grad_norm": 1.3201979398727417,
"learning_rate": 7.440436006156559e-07,
"loss": 0.9347,
"mean_token_accuracy": 0.6893177561461925,
"num_tokens": 784266.0,
"step": 13150
},
{
"epoch": 12.764121212121212,
"grad_norm": 0.9860504269599915,
"learning_rate": 7.361258021511142e-07,
"loss": 0.9249,
"mean_token_accuracy": 0.6765072204172611,
"num_tokens": 794396.0,
"step": 13160
},
{
"epoch": 12.773818181818182,
"grad_norm": 1.6493189334869385,
"learning_rate": 7.282487481505041e-07,
"loss": 0.9379,
"mean_token_accuracy": 0.671536460146308,
"num_tokens": 804843.0,
"step": 13170
},
{
"epoch": 12.783515151515152,
"grad_norm": 0.8871903419494629,
"learning_rate": 7.204124732587659e-07,
"loss": 0.8677,
"mean_token_accuracy": 0.7121831141412258,
"num_tokens": 815821.0,
"step": 13180
},
{
"epoch": 12.793212121212122,
"grad_norm": 1.6710381507873535,
"learning_rate": 7.126170119414799e-07,
"loss": 0.9455,
"mean_token_accuracy": 0.6882101558148861,
"num_tokens": 826418.0,
"step": 13190
},
{
"epoch": 12.802909090909091,
"grad_norm": 1.0449455976486206,
"learning_rate": 7.048623984847203e-07,
"loss": 0.9237,
"mean_token_accuracy": 0.6743796251714229,
"num_tokens": 837180.0,
"step": 13200
},
{
"epoch": 12.812606060606061,
"grad_norm": 1.153255581855774,
"learning_rate": 6.971486669949102e-07,
"loss": 0.9745,
"mean_token_accuracy": 0.6699652068316937,
"num_tokens": 847602.0,
"step": 13210
},
{
"epoch": 12.822303030303031,
"grad_norm": 1.069661021232605,
"learning_rate": 6.894758513986566e-07,
"loss": 0.9217,
"mean_token_accuracy": 0.6804017089307308,
"num_tokens": 857486.0,
"step": 13220
},
{
"epoch": 12.832,
"grad_norm": 1.011649489402771,
"learning_rate": 6.818439854426151e-07,
"loss": 0.9386,
"mean_token_accuracy": 0.6823414113372565,
"num_tokens": 868972.0,
"step": 13230
},
{
"epoch": 12.84169696969697,
"grad_norm": 0.7872369885444641,
"learning_rate": 6.74253102693333e-07,
"loss": 0.9409,
"mean_token_accuracy": 0.6847406111657619,
"num_tokens": 879178.0,
"step": 13240
},
{
"epoch": 12.85139393939394,
"grad_norm": 1.3302205801010132,
"learning_rate": 6.667032365371095e-07,
"loss": 0.9514,
"mean_token_accuracy": 0.6746706318110227,
"num_tokens": 890112.0,
"step": 13250
},
{
"epoch": 12.861090909090908,
"grad_norm": 0.7299315333366394,
"learning_rate": 6.591944201798394e-07,
"loss": 0.8983,
"mean_token_accuracy": 0.6949192993342876,
"num_tokens": 900105.0,
"step": 13260
},
{
"epoch": 12.870787878787878,
"grad_norm": 0.9053242206573486,
"learning_rate": 6.517266866468741e-07,
"loss": 0.9662,
"mean_token_accuracy": 0.6785097420215607,
"num_tokens": 909781.0,
"step": 13270
},
{
"epoch": 12.880484848484848,
"grad_norm": 1.5465375185012817,
"learning_rate": 6.443000687828737e-07,
"loss": 0.9076,
"mean_token_accuracy": 0.6935414470732212,
"num_tokens": 920332.0,
"step": 13280
},
{
"epoch": 12.890181818181818,
"grad_norm": 0.9741002917289734,
"learning_rate": 6.369145992516635e-07,
"loss": 0.9533,
"mean_token_accuracy": 0.6718010984361171,
"num_tokens": 930800.0,
"step": 13290
},
{
"epoch": 12.899878787878787,
"grad_norm": 1.4398901462554932,
"learning_rate": 6.295703105360884e-07,
"loss": 0.9613,
"mean_token_accuracy": 0.6741296485066414,
"num_tokens": 942822.0,
"step": 13300
},
{
"epoch": 12.909575757575757,
"grad_norm": 0.8408631086349487,
"learning_rate": 6.222672349378711e-07,
"loss": 0.8839,
"mean_token_accuracy": 0.6957414381206035,
"num_tokens": 953151.0,
"step": 13310
},
{
"epoch": 12.919272727272727,
"grad_norm": 1.185342788696289,
"learning_rate": 6.150054045774745e-07,
"loss": 0.9431,
"mean_token_accuracy": 0.6786404684185982,
"num_tokens": 963817.0,
"step": 13320
},
{
"epoch": 12.928969696969697,
"grad_norm": 1.5377130508422852,
"learning_rate": 6.07784851393951e-07,
"loss": 0.9263,
"mean_token_accuracy": 0.6862830605357886,
"num_tokens": 974618.0,
"step": 13330
},
{
"epoch": 12.938666666666666,
"grad_norm": 2.0658161640167236,
"learning_rate": 6.006056071448119e-07,
"loss": 0.8625,
"mean_token_accuracy": 0.7110202703624964,
"num_tokens": 984540.0,
"step": 13340
},
{
"epoch": 12.948363636363636,
"grad_norm": 1.0002696514129639,
"learning_rate": 5.934677034058789e-07,
"loss": 0.9983,
"mean_token_accuracy": 0.6710415873676538,
"num_tokens": 995538.0,
"step": 13350
},
{
"epoch": 12.958060606060606,
"grad_norm": 0.6808292269706726,
"learning_rate": 5.863711715711507e-07,
"loss": 0.9357,
"mean_token_accuracy": 0.6868117332458497,
"num_tokens": 1005955.0,
"step": 13360
},
{
"epoch": 12.967757575757576,
"grad_norm": 1.43692946434021,
"learning_rate": 5.793160428526678e-07,
"loss": 0.9581,
"mean_token_accuracy": 0.6872004386037588,
"num_tokens": 1017901.0,
"step": 13370
},
{
"epoch": 12.977454545454545,
"grad_norm": 1.1382737159729004,
"learning_rate": 5.723023482803658e-07,
"loss": 0.8893,
"mean_token_accuracy": 0.6952810846269131,
"num_tokens": 1027791.0,
"step": 13380
},
{
"epoch": 12.987151515151515,
"grad_norm": 1.5918898582458496,
"learning_rate": 5.653301187019455e-07,
"loss": 0.8051,
"mean_token_accuracy": 0.7371663119643926,
"num_tokens": 1037438.0,
"step": 13390
},
{
"epoch": 12.996848484848485,
"grad_norm": 1.294746994972229,
"learning_rate": 5.583993847827363e-07,
"loss": 0.9868,
"mean_token_accuracy": 0.6542905114591122,
"num_tokens": 1048425.0,
"step": 13400
},
{
"epoch": 13.006787878787879,
"grad_norm": 1.0259826183319092,
"learning_rate": 5.515101770055653e-07,
"loss": 1.1464,
"mean_token_accuracy": 0.6516239614021487,
"num_tokens": 1059946.0,
"step": 13410
},
{
"epoch": 13.016484848484849,
"grad_norm": 1.570686936378479,
"learning_rate": 5.446625256706095e-07,
"loss": 0.9854,
"mean_token_accuracy": 0.6571170825511217,
"num_tokens": 1071512.0,
"step": 13420
},
{
"epoch": 13.026181818181819,
"grad_norm": 1.4056403636932373,
"learning_rate": 5.378564608952786e-07,
"loss": 1.0107,
"mean_token_accuracy": 0.6513097662478685,
"num_tokens": 1082669.0,
"step": 13430
},
{
"epoch": 13.035878787878788,
"grad_norm": 1.19424307346344,
"learning_rate": 5.310920126140773e-07,
"loss": 0.9449,
"mean_token_accuracy": 0.6799818322062492,
"num_tokens": 1093470.0,
"step": 13440
},
{
"epoch": 13.045575757575758,
"grad_norm": 1.1541939973831177,
"learning_rate": 5.243692105784682e-07,
"loss": 1.0241,
"mean_token_accuracy": 0.6430629625916481,
"num_tokens": 1105089.0,
"step": 13450
},
{
"epoch": 13.055272727272728,
"grad_norm": 0.825744092464447,
"learning_rate": 5.176880843567455e-07,
"loss": 0.9137,
"mean_token_accuracy": 0.6785864185541868,
"num_tokens": 1115643.0,
"step": 13460
},
{
"epoch": 13.064969696969698,
"grad_norm": 1.2951405048370361,
"learning_rate": 5.110486633339062e-07,
"loss": 0.972,
"mean_token_accuracy": 0.6606432240456342,
"num_tokens": 1126575.0,
"step": 13470
},
{
"epoch": 13.074666666666667,
"grad_norm": 0.8548156023025513,
"learning_rate": 5.044509767115158e-07,
"loss": 0.9143,
"mean_token_accuracy": 0.6806200005114078,
"num_tokens": 1137317.0,
"step": 13480
},
{
"epoch": 13.084363636363637,
"grad_norm": 1.2886772155761719,
"learning_rate": 4.978950535075878e-07,
"loss": 0.8903,
"mean_token_accuracy": 0.7035336244851351,
"num_tokens": 1148065.0,
"step": 13490
},
{
"epoch": 13.094060606060607,
"grad_norm": 1.966200351715088,
"learning_rate": 4.913809225564492e-07,
"loss": 0.9073,
"mean_token_accuracy": 0.701976515352726,
"num_tokens": 1158621.0,
"step": 13500
},
{
"epoch": 13.103757575757577,
"grad_norm": 0.8389899134635925,
"learning_rate": 4.849086125086156e-07,
"loss": 0.9414,
"mean_token_accuracy": 0.6894888635724783,
"num_tokens": 1168890.0,
"step": 13510
},
{
"epoch": 13.113454545454545,
"grad_norm": 0.9758931994438171,
"learning_rate": 4.784781518306624e-07,
"loss": 0.9384,
"mean_token_accuracy": 0.6737278677523136,
"num_tokens": 1178584.0,
"step": 13520
},
{
"epoch": 13.123151515151514,
"grad_norm": 1.0330685377120972,
"learning_rate": 4.720895688051108e-07,
"loss": 0.9524,
"mean_token_accuracy": 0.6780954591929913,
"num_tokens": 1189454.0,
"step": 13530
},
{
"epoch": 13.132848484848484,
"grad_norm": 1.9264168739318848,
"learning_rate": 4.657428915302875e-07,
"loss": 0.8363,
"mean_token_accuracy": 0.720489464327693,
"num_tokens": 1199557.0,
"step": 13540
},
{
"epoch": 13.142545454545454,
"grad_norm": 0.7110128402709961,
"learning_rate": 4.594381479202137e-07,
"loss": 0.9138,
"mean_token_accuracy": 0.6820375476032495,
"num_tokens": 1210005.0,
"step": 13550
},
{
"epoch": 13.152242424242424,
"grad_norm": 1.9913625717163086,
"learning_rate": 4.531753657044735e-07,
"loss": 0.9352,
"mean_token_accuracy": 0.6837764341384173,
"num_tokens": 1220507.0,
"step": 13560
},
{
"epoch": 13.161939393939393,
"grad_norm": 1.4394137859344482,
"learning_rate": 4.469545724280988e-07,
"loss": 0.9389,
"mean_token_accuracy": 0.6835528288036585,
"num_tokens": 1231088.0,
"step": 13570
},
{
"epoch": 13.171636363636363,
"grad_norm": 1.118189811706543,
"learning_rate": 4.407757954514458e-07,
"loss": 0.9182,
"mean_token_accuracy": 0.69982905164361,
"num_tokens": 1241297.0,
"step": 13580
},
{
"epoch": 13.181333333333333,
"grad_norm": 0.6542367935180664,
"learning_rate": 4.3463906195007066e-07,
"loss": 0.8837,
"mean_token_accuracy": 0.700026823580265,
"num_tokens": 1250943.0,
"step": 13590
},
{
"epoch": 13.191030303030303,
"grad_norm": 0.5948226451873779,
"learning_rate": 4.285443989146176e-07,
"loss": 1.0363,
"mean_token_accuracy": 0.6513338401913643,
"num_tokens": 1262487.0,
"step": 13600
},
{
"epoch": 13.200727272727272,
"grad_norm": 1.0918562412261963,
"learning_rate": 4.5e-05,
"loss": 0.9926,
"mean_token_accuracy": 0.6507623802870512,
"num_tokens": 11060.0,
"step": 13610
},
{
"epoch": 13.210424242424242,
"grad_norm": 1.453194499015808,
"learning_rate": 9.5e-05,
"loss": 0.8771,
"mean_token_accuracy": 0.6994029752910137,
"num_tokens": 21506.0,
"step": 13620
},
{
"epoch": 13.220121212121212,
"grad_norm": 2.374359130859375,
"learning_rate": 9.995495495495496e-05,
"loss": 0.9635,
"mean_token_accuracy": 0.6764267832040787,
"num_tokens": 32669.0,
"step": 13630
},
{
"epoch": 13.229818181818182,
"grad_norm": 1.6310795545578003,
"learning_rate": 9.990490490490491e-05,
"loss": 0.8954,
"mean_token_accuracy": 0.7067163821309805,
"num_tokens": 42938.0,
"step": 13640
},
{
"epoch": 13.239515151515151,
"grad_norm": 0.8453378677368164,
"learning_rate": 9.985485485485487e-05,
"loss": 0.9392,
"mean_token_accuracy": 0.68135135024786,
"num_tokens": 53420.0,
"step": 13650
},
{
"epoch": 13.249212121212121,
"grad_norm": 2.5701301097869873,
"learning_rate": 9.980480480480481e-05,
"loss": 0.8713,
"mean_token_accuracy": 0.7125500839203596,
"num_tokens": 62912.0,
"step": 13660
},
{
"epoch": 13.258909090909091,
"grad_norm": 1.7641572952270508,
"learning_rate": 9.975475475475477e-05,
"loss": 0.9947,
"mean_token_accuracy": 0.6635019164532423,
"num_tokens": 73599.0,
"step": 13670
},
{
"epoch": 13.26860606060606,
"grad_norm": 2.168328046798706,
"learning_rate": 9.970470470470471e-05,
"loss": 0.9706,
"mean_token_accuracy": 0.688429095223546,
"num_tokens": 83750.0,
"step": 13680
},
{
"epoch": 13.27830303030303,
"grad_norm": 1.4071749448776245,
"learning_rate": 9.965465465465466e-05,
"loss": 0.9676,
"mean_token_accuracy": 0.6768725138157606,
"num_tokens": 93823.0,
"step": 13690
},
{
"epoch": 13.288,
"grad_norm": 1.1499977111816406,
"learning_rate": 9.960460460460461e-05,
"loss": 0.9889,
"mean_token_accuracy": 0.6712037593126297,
"num_tokens": 103911.0,
"step": 13700
},
{
"epoch": 13.29769696969697,
"grad_norm": 1.2642593383789062,
"learning_rate": 9.955455455455456e-05,
"loss": 0.9786,
"mean_token_accuracy": 0.6803277429193259,
"num_tokens": 114296.0,
"step": 13710
},
{
"epoch": 13.30739393939394,
"grad_norm": 0.9675585627555847,
"learning_rate": 9.950450450450451e-05,
"loss": 0.8952,
"mean_token_accuracy": 0.6976213902235031,
"num_tokens": 123697.0,
"step": 13720
},
{
"epoch": 13.31709090909091,
"grad_norm": 1.5083271265029907,
"learning_rate": 9.945445445445446e-05,
"loss": 0.9518,
"mean_token_accuracy": 0.6812848944216967,
"num_tokens": 133976.0,
"step": 13730
},
{
"epoch": 13.32678787878788,
"grad_norm": 1.0912386178970337,
"learning_rate": 9.94044044044044e-05,
"loss": 0.9224,
"mean_token_accuracy": 0.6897901255637408,
"num_tokens": 143868.0,
"step": 13740
},
{
"epoch": 13.336484848484849,
"grad_norm": 1.7375333309173584,
"learning_rate": 9.935435435435436e-05,
"loss": 0.9107,
"mean_token_accuracy": 0.706351314485073,
"num_tokens": 154106.0,
"step": 13750
},
{
"epoch": 13.346181818181819,
"grad_norm": 1.1665840148925781,
"learning_rate": 9.930430430430431e-05,
"loss": 0.941,
"mean_token_accuracy": 0.6972976390272378,
"num_tokens": 164042.0,
"step": 13760
},
{
"epoch": 13.355878787878789,
"grad_norm": 1.7706063985824585,
"learning_rate": 9.925425425425427e-05,
"loss": 0.8943,
"mean_token_accuracy": 0.7054846830666065,
"num_tokens": 174506.0,
"step": 13770
},
{
"epoch": 13.365575757575758,
"grad_norm": 0.5767163038253784,
"learning_rate": 9.920420420420421e-05,
"loss": 0.9656,
"mean_token_accuracy": 0.6823426600545645,
"num_tokens": 185338.0,
"step": 13780
},
{
"epoch": 13.375272727272728,
"grad_norm": 1.4523296356201172,
"learning_rate": 9.915415415415416e-05,
"loss": 0.9469,
"mean_token_accuracy": 0.6681080140173435,
"num_tokens": 195763.0,
"step": 13790
},
{
"epoch": 13.384969696969698,
"grad_norm": 0.7047093510627747,
"learning_rate": 9.910410410410411e-05,
"loss": 0.9216,
"mean_token_accuracy": 0.6868221748620271,
"num_tokens": 205612.0,
"step": 13800
},
{
"epoch": 13.394666666666666,
"grad_norm": 0.7028587460517883,
"learning_rate": 9.905405405405406e-05,
"loss": 0.9799,
"mean_token_accuracy": 0.6751956883817911,
"num_tokens": 215645.0,
"step": 13810
},
{
"epoch": 13.404363636363636,
"grad_norm": 0.9091927409172058,
"learning_rate": 9.900400400400401e-05,
"loss": 0.9413,
"mean_token_accuracy": 0.6909396957606078,
"num_tokens": 225530.0,
"step": 13820
},
{
"epoch": 13.414060606060605,
"grad_norm": 1.0086578130722046,
"learning_rate": 9.895395395395396e-05,
"loss": 0.964,
"mean_token_accuracy": 0.6786475393921136,
"num_tokens": 236313.0,
"step": 13830
},
{
"epoch": 13.423757575757575,
"grad_norm": 1.5697195529937744,
"learning_rate": 9.89039039039039e-05,
"loss": 0.9573,
"mean_token_accuracy": 0.6727604184299707,
"num_tokens": 246884.0,
"step": 13840
},
{
"epoch": 13.433454545454545,
"grad_norm": 0.8102120161056519,
"learning_rate": 9.885385385385386e-05,
"loss": 0.9226,
"mean_token_accuracy": 0.6831782024353743,
"num_tokens": 256990.0,
"step": 13850
},
{
"epoch": 13.443151515151515,
"grad_norm": 0.9028761982917786,
"learning_rate": 9.880380380380381e-05,
"loss": 0.8753,
"mean_token_accuracy": 0.7016171887516975,
"num_tokens": 266786.0,
"step": 13860
},
{
"epoch": 13.452848484848484,
"grad_norm": 1.2319331169128418,
"learning_rate": 9.875375375375377e-05,
"loss": 0.9452,
"mean_token_accuracy": 0.6777403865009546,
"num_tokens": 278066.0,
"step": 13870
},
{
"epoch": 13.462545454545454,
"grad_norm": 1.340330719947815,
"learning_rate": 9.870370370370371e-05,
"loss": 0.8887,
"mean_token_accuracy": 0.6937030091881752,
"num_tokens": 287644.0,
"step": 13880
},
{
"epoch": 13.472242424242424,
"grad_norm": 2.107584238052368,
"learning_rate": 9.865365365365366e-05,
"loss": 0.9805,
"mean_token_accuracy": 0.6581023618578911,
"num_tokens": 299167.0,
"step": 13890
},
{
"epoch": 13.481939393939394,
"grad_norm": 1.3416616916656494,
"learning_rate": 9.860360360360361e-05,
"loss": 0.9588,
"mean_token_accuracy": 0.6862040366977453,
"num_tokens": 310325.0,
"step": 13900
},
{
"epoch": 13.491636363636363,
"grad_norm": 0.7638229727745056,
"learning_rate": 9.855355355355356e-05,
"loss": 0.9199,
"mean_token_accuracy": 0.6865271601825953,
"num_tokens": 320799.0,
"step": 13910
},
{
"epoch": 13.501333333333333,
"grad_norm": 1.8613024950027466,
"learning_rate": 9.850350350350351e-05,
"loss": 0.9029,
"mean_token_accuracy": 0.7134368922561407,
"num_tokens": 331742.0,
"step": 13920
},
{
"epoch": 13.511030303030303,
"grad_norm": 0.8470885753631592,
"learning_rate": 9.845345345345346e-05,
"loss": 0.9985,
"mean_token_accuracy": 0.6461464431136846,
"num_tokens": 342008.0,
"step": 13930
},
{
"epoch": 13.520727272727273,
"grad_norm": 1.4289556741714478,
"learning_rate": 9.84034034034034e-05,
"loss": 1.0229,
"mean_token_accuracy": 0.6607601415365935,
"num_tokens": 352783.0,
"step": 13940
},
{
"epoch": 13.530424242424242,
"grad_norm": 1.1315350532531738,
"learning_rate": 9.835335335335336e-05,
"loss": 0.8668,
"mean_token_accuracy": 0.7064531348645687,
"num_tokens": 362703.0,
"step": 13950
},
{
"epoch": 13.540121212121212,
"grad_norm": 0.8690136671066284,
"learning_rate": 9.83033033033033e-05,
"loss": 0.8763,
"mean_token_accuracy": 0.7114055767655373,
"num_tokens": 372732.0,
"step": 13960
},
{
"epoch": 13.549818181818182,
"grad_norm": 0.9560481905937195,
"learning_rate": 9.825325325325326e-05,
"loss": 0.9223,
"mean_token_accuracy": 0.6863605052232742,
"num_tokens": 382785.0,
"step": 13970
},
{
"epoch": 13.559515151515152,
"grad_norm": 1.053054928779602,
"learning_rate": 9.820320320320321e-05,
"loss": 0.9598,
"mean_token_accuracy": 0.6758723571896553,
"num_tokens": 393871.0,
"step": 13980
},
{
"epoch": 13.569212121212122,
"grad_norm": 0.4731355905532837,
"learning_rate": 9.815315315315316e-05,
"loss": 0.9456,
"mean_token_accuracy": 0.6892194643616676,
"num_tokens": 404378.0,
"step": 13990
},
{
"epoch": 13.578909090909091,
"grad_norm": 1.9100712537765503,
"learning_rate": 9.810310310310311e-05,
"loss": 1.011,
"mean_token_accuracy": 0.6585861250758172,
"num_tokens": 415762.0,
"step": 14000
},
{
"epoch": 13.588606060606061,
"grad_norm": 0.987190842628479,
"learning_rate": 9.805305305305306e-05,
"loss": 0.8616,
"mean_token_accuracy": 0.7012909840792417,
"num_tokens": 425425.0,
"step": 14010
},
{
"epoch": 13.59830303030303,
"grad_norm": 0.8835279941558838,
"learning_rate": 9.8003003003003e-05,
"loss": 0.9801,
"mean_token_accuracy": 0.6575286597013473,
"num_tokens": 435861.0,
"step": 14020
},
{
"epoch": 13.608,
"grad_norm": 0.9478653073310852,
"learning_rate": 9.795295295295296e-05,
"loss": 0.9283,
"mean_token_accuracy": 0.6895153563469648,
"num_tokens": 446411.0,
"step": 14030
},
{
"epoch": 13.61769696969697,
"grad_norm": 0.8801679015159607,
"learning_rate": 9.79029029029029e-05,
"loss": 0.9621,
"mean_token_accuracy": 0.68089236356318,
"num_tokens": 457521.0,
"step": 14040
},
{
"epoch": 13.62739393939394,
"grad_norm": 0.7246169447898865,
"learning_rate": 9.785285285285286e-05,
"loss": 0.915,
"mean_token_accuracy": 0.6914402432739735,
"num_tokens": 467230.0,
"step": 14050
},
{
"epoch": 13.63709090909091,
"grad_norm": 1.023116946220398,
"learning_rate": 9.78028028028028e-05,
"loss": 1.0017,
"mean_token_accuracy": 0.6633546780794859,
"num_tokens": 478815.0,
"step": 14060
},
{
"epoch": 13.64678787878788,
"grad_norm": 1.2296099662780762,
"learning_rate": 9.775275275275276e-05,
"loss": 0.9853,
"mean_token_accuracy": 0.6748053282499313,
"num_tokens": 488901.0,
"step": 14070
},
{
"epoch": 13.656484848484848,
"grad_norm": 0.9308061003684998,
"learning_rate": 9.770270270270272e-05,
"loss": 0.872,
"mean_token_accuracy": 0.7030160129070282,
"num_tokens": 499156.0,
"step": 14080
},
{
"epoch": 13.666181818181819,
"grad_norm": 1.4838083982467651,
"learning_rate": 9.765265265265266e-05,
"loss": 0.918,
"mean_token_accuracy": 0.6922544561326504,
"num_tokens": 508618.0,
"step": 14090
},
{
"epoch": 13.675878787878787,
"grad_norm": 0.6036433577537537,
"learning_rate": 9.760260260260262e-05,
"loss": 0.9253,
"mean_token_accuracy": 0.6886366963386535,
"num_tokens": 519918.0,
"step": 14100
},
{
"epoch": 13.685575757575757,
"grad_norm": 0.848430871963501,
"learning_rate": 9.755255255255256e-05,
"loss": 0.9634,
"mean_token_accuracy": 0.6708800371736288,
"num_tokens": 529716.0,
"step": 14110
},
{
"epoch": 13.695272727272727,
"grad_norm": 0.7561900019645691,
"learning_rate": 9.75025025025025e-05,
"loss": 0.8676,
"mean_token_accuracy": 0.6998249750584364,
"num_tokens": 539041.0,
"step": 14120
},
{
"epoch": 13.704969696969696,
"grad_norm": 0.8211101293563843,
"learning_rate": 9.745245245245246e-05,
"loss": 0.9797,
"mean_token_accuracy": 0.6581344068050384,
"num_tokens": 549883.0,
"step": 14130
},
{
"epoch": 13.714666666666666,
"grad_norm": 1.2751184701919556,
"learning_rate": 9.74024024024024e-05,
"loss": 0.9024,
"mean_token_accuracy": 0.6988375499844551,
"num_tokens": 560364.0,
"step": 14140
},
{
"epoch": 13.724363636363636,
"grad_norm": 0.7292294502258301,
"learning_rate": 9.735235235235236e-05,
"loss": 0.8688,
"mean_token_accuracy": 0.6985570065677166,
"num_tokens": 570146.0,
"step": 14150
},
{
"epoch": 13.734060606060606,
"grad_norm": 1.0787569284439087,
"learning_rate": 9.73023023023023e-05,
"loss": 0.9108,
"mean_token_accuracy": 0.6816088363528252,
"num_tokens": 580097.0,
"step": 14160
},
{
"epoch": 13.743757575757575,
"grad_norm": 0.5591951012611389,
"learning_rate": 9.725225225225225e-05,
"loss": 1.0228,
"mean_token_accuracy": 0.6424524009227752,
"num_tokens": 591054.0,
"step": 14170
},
{
"epoch": 13.753454545454545,
"grad_norm": 0.950010359287262,
"learning_rate": 9.72022022022022e-05,
"loss": 0.8811,
"mean_token_accuracy": 0.7032374102622271,
"num_tokens": 600799.0,
"step": 14180
},
{
"epoch": 13.763151515151515,
"grad_norm": 0.4867992103099823,
"learning_rate": 9.715215215215216e-05,
"loss": 0.9002,
"mean_token_accuracy": 0.6991371564567089,
"num_tokens": 611008.0,
"step": 14190
},
{
"epoch": 13.772848484848485,
"grad_norm": 0.5358482003211975,
"learning_rate": 9.710210210210212e-05,
"loss": 0.9305,
"mean_token_accuracy": 0.6951459109783172,
"num_tokens": 621483.0,
"step": 14200
},
{
"epoch": 13.782545454545454,
"grad_norm": 0.8481453657150269,
"learning_rate": 9.705205205205206e-05,
"loss": 0.9505,
"mean_token_accuracy": 0.6874804452061654,
"num_tokens": 633074.0,
"step": 14210
},
{
"epoch": 13.792242424242424,
"grad_norm": 0.664574146270752,
"learning_rate": 9.7002002002002e-05,
"loss": 0.971,
"mean_token_accuracy": 0.6761138528585434,
"num_tokens": 644108.0,
"step": 14220
},
{
"epoch": 13.801939393939394,
"grad_norm": 0.6939647793769836,
"learning_rate": 9.695195195195196e-05,
"loss": 0.9171,
"mean_token_accuracy": 0.6944379203021527,
"num_tokens": 654249.0,
"step": 14230
},
{
"epoch": 13.811636363636364,
"grad_norm": 0.6086325645446777,
"learning_rate": 9.69019019019019e-05,
"loss": 0.9794,
"mean_token_accuracy": 0.6574176583439112,
"num_tokens": 664735.0,
"step": 14240
},
{
"epoch": 13.821333333333333,
"grad_norm": 2.137354612350464,
"learning_rate": 9.685185185185186e-05,
"loss": 0.9202,
"mean_token_accuracy": 0.6800346210598945,
"num_tokens": 675580.0,
"step": 14250
},
{
"epoch": 13.831030303030303,
"grad_norm": 1.0914839506149292,
"learning_rate": 9.68018018018018e-05,
"loss": 0.9407,
"mean_token_accuracy": 0.6815901666879653,
"num_tokens": 685042.0,
"step": 14260
},
{
"epoch": 13.840727272727273,
"grad_norm": 0.9622077345848083,
"learning_rate": 9.675175175175175e-05,
"loss": 0.9412,
"mean_token_accuracy": 0.6896888021379709,
"num_tokens": 695452.0,
"step": 14270
},
{
"epoch": 13.850424242424243,
"grad_norm": 0.5911729335784912,
"learning_rate": 9.67017017017017e-05,
"loss": 0.9476,
"mean_token_accuracy": 0.6852936699986458,
"num_tokens": 706283.0,
"step": 14280
},
{
"epoch": 13.860121212121213,
"grad_norm": 1.0763121843338013,
"learning_rate": 9.665165165165166e-05,
"loss": 0.8593,
"mean_token_accuracy": 0.70830412581563,
"num_tokens": 715851.0,
"step": 14290
},
{
"epoch": 13.869818181818182,
"grad_norm": 0.7274637818336487,
"learning_rate": 9.660160160160162e-05,
"loss": 1.0047,
"mean_token_accuracy": 0.6792124062776566,
"num_tokens": 727035.0,
"step": 14300
},
{
"epoch": 13.879515151515152,
"grad_norm": 0.6750665903091431,
"learning_rate": 9.655155155155156e-05,
"loss": 0.9299,
"mean_token_accuracy": 0.6856705665588378,
"num_tokens": 737554.0,
"step": 14310
},
{
"epoch": 13.889212121212122,
"grad_norm": 0.6934303641319275,
"learning_rate": 9.65015015015015e-05,
"loss": 0.903,
"mean_token_accuracy": 0.6835582558065653,
"num_tokens": 747852.0,
"step": 14320
},
{
"epoch": 13.898909090909092,
"grad_norm": 0.5132259726524353,
"learning_rate": 9.645145145145146e-05,
"loss": 0.9779,
"mean_token_accuracy": 0.6827262349426746,
"num_tokens": 757970.0,
"step": 14330
},
{
"epoch": 13.908606060606061,
"grad_norm": 0.46078333258628845,
"learning_rate": 9.64014014014014e-05,
"loss": 0.9377,
"mean_token_accuracy": 0.684358200058341,
"num_tokens": 768757.0,
"step": 14340
},
{
"epoch": 13.918303030303031,
"grad_norm": 0.555814266204834,
"learning_rate": 9.635135135135136e-05,
"loss": 0.9762,
"mean_token_accuracy": 0.6606349345296622,
"num_tokens": 780206.0,
"step": 14350
},
{
"epoch": 13.928,
"grad_norm": 0.8341594338417053,
"learning_rate": 9.63013013013013e-05,
"loss": 0.9645,
"mean_token_accuracy": 0.6715805854648351,
"num_tokens": 790961.0,
"step": 14360
},
{
"epoch": 13.937696969696969,
"grad_norm": 0.6067021489143372,
"learning_rate": 9.625125125125125e-05,
"loss": 0.9145,
"mean_token_accuracy": 0.6793199084699154,
"num_tokens": 800987.0,
"step": 14370
},
{
"epoch": 13.947393939393939,
"grad_norm": 0.7952314019203186,
"learning_rate": 9.62012012012012e-05,
"loss": 0.9162,
"mean_token_accuracy": 0.69475242421031,
"num_tokens": 811828.0,
"step": 14380
},
{
"epoch": 13.957090909090908,
"grad_norm": 0.8746843934059143,
"learning_rate": 9.615115115115115e-05,
"loss": 0.8681,
"mean_token_accuracy": 0.7030756626278162,
"num_tokens": 822958.0,
"step": 14390
},
{
"epoch": 13.966787878787878,
"grad_norm": 0.4334689974784851,
"learning_rate": 9.61011011011011e-05,
"loss": 0.9797,
"mean_token_accuracy": 0.6570104032754898,
"num_tokens": 834206.0,
"step": 14400
},
{
"epoch": 13.976484848484848,
"grad_norm": 0.5802099108695984,
"learning_rate": 9.605105105105106e-05,
"loss": 0.9076,
"mean_token_accuracy": 0.6986728705465793,
"num_tokens": 845031.0,
"step": 14410
},
{
"epoch": 13.986181818181818,
"grad_norm": 0.41924917697906494,
"learning_rate": 9.6001001001001e-05,
"loss": 0.9134,
"mean_token_accuracy": 0.6965976521372795,
"num_tokens": 854691.0,
"step": 14420
},
{
"epoch": 13.995878787878787,
"grad_norm": 0.4162426292896271,
"learning_rate": 9.595095095095096e-05,
"loss": 1.0075,
"mean_token_accuracy": 0.6658117517828941,
"num_tokens": 865141.0,
"step": 14430
},
{
"epoch": 14.005818181818182,
"grad_norm": 0.6385387182235718,
"learning_rate": 9.59009009009009e-05,
"loss": 1.0001,
"mean_token_accuracy": 0.6936045238157598,
"num_tokens": 876494.0,
"step": 14440
},
{
"epoch": 14.015515151515151,
"grad_norm": 0.6041902303695679,
"learning_rate": 9.585085085085086e-05,
"loss": 0.9238,
"mean_token_accuracy": 0.6878648042678833,
"num_tokens": 886603.0,
"step": 14450
},
{
"epoch": 14.025212121212121,
"grad_norm": 0.9639670252799988,
"learning_rate": 9.58008008008008e-05,
"loss": 1.0331,
"mean_token_accuracy": 0.6559940252453089,
"num_tokens": 898033.0,
"step": 14460
},
{
"epoch": 14.03490909090909,
"grad_norm": 0.5883612036705017,
"learning_rate": 9.575075075075075e-05,
"loss": 0.9764,
"mean_token_accuracy": 0.6728239644318819,
"num_tokens": 909064.0,
"step": 14470
},
{
"epoch": 14.04460606060606,
"grad_norm": 0.8372961282730103,
"learning_rate": 9.57007007007007e-05,
"loss": 0.9061,
"mean_token_accuracy": 0.6874277569353581,
"num_tokens": 919744.0,
"step": 14480
},
{
"epoch": 14.05430303030303,
"grad_norm": 1.7760860919952393,
"learning_rate": 9.565065065065065e-05,
"loss": 0.9198,
"mean_token_accuracy": 0.6859086826443672,
"num_tokens": 930349.0,
"step": 14490
},
{
"epoch": 14.064,
"grad_norm": 0.5744428634643555,
"learning_rate": 9.56006006006006e-05,
"loss": 0.9518,
"mean_token_accuracy": 0.6734505753964186,
"num_tokens": 940791.0,
"step": 14500
},
{
"epoch": 14.07369696969697,
"grad_norm": 0.9980311989784241,
"learning_rate": 9.555055055055056e-05,
"loss": 0.903,
"mean_token_accuracy": 0.6935466017574072,
"num_tokens": 951362.0,
"step": 14510
},
{
"epoch": 14.08339393939394,
"grad_norm": 0.6623931527137756,
"learning_rate": 9.55005005005005e-05,
"loss": 0.8918,
"mean_token_accuracy": 0.6993258882313966,
"num_tokens": 962286.0,
"step": 14520
},
{
"epoch": 14.09309090909091,
"grad_norm": 0.4992653429508209,
"learning_rate": 9.545045045045046e-05,
"loss": 0.9309,
"mean_token_accuracy": 0.6649952068924904,
"num_tokens": 972644.0,
"step": 14530
},
{
"epoch": 14.102787878787879,
"grad_norm": 0.4818670153617859,
"learning_rate": 9.54004004004004e-05,
"loss": 0.9225,
"mean_token_accuracy": 0.6798055626451969,
"num_tokens": 983056.0,
"step": 14540
},
{
"epoch": 14.112484848484849,
"grad_norm": 0.9694674015045166,
"learning_rate": 9.535035035035036e-05,
"loss": 0.8962,
"mean_token_accuracy": 0.6953587524592877,
"num_tokens": 992490.0,
"step": 14550
},
{
"epoch": 14.122181818181819,
"grad_norm": 0.8076632618904114,
"learning_rate": 9.53003003003003e-05,
"loss": 0.9096,
"mean_token_accuracy": 0.6695175170898438,
"num_tokens": 1003185.0,
"step": 14560
},
{
"epoch": 14.131878787878788,
"grad_norm": 0.39989814162254333,
"learning_rate": 9.525025025025025e-05,
"loss": 0.8959,
"mean_token_accuracy": 0.6875695057213307,
"num_tokens": 1013510.0,
"step": 14570
},
{
"epoch": 14.141575757575758,
"grad_norm": 0.5998600721359253,
"learning_rate": 9.52002002002002e-05,
"loss": 0.9311,
"mean_token_accuracy": 0.6833312470465899,
"num_tokens": 1023703.0,
"step": 14580
},
{
"epoch": 14.151272727272728,
"grad_norm": 1.0913785696029663,
"learning_rate": 9.515015015015015e-05,
"loss": 1.0003,
"mean_token_accuracy": 0.6780395913869143,
"num_tokens": 1035344.0,
"step": 14590
},
{
"epoch": 14.160969696969698,
"grad_norm": 0.891591489315033,
"learning_rate": 9.51001001001001e-05,
"loss": 0.9419,
"mean_token_accuracy": 0.6847637005150318,
"num_tokens": 1045341.0,
"step": 14600
},
{
"epoch": 14.170666666666667,
"grad_norm": 0.8624415397644043,
"learning_rate": 9.505005005005005e-05,
"loss": 0.8532,
"mean_token_accuracy": 0.7248432952910662,
"num_tokens": 1055979.0,
"step": 14610
},
{
"epoch": 14.180363636363637,
"grad_norm": 0.9150317311286926,
"learning_rate": 9.5e-05,
"loss": 0.9745,
"mean_token_accuracy": 0.6698940627276897,
"num_tokens": 1067319.0,
"step": 14620
},
{
"epoch": 14.190060606060607,
"grad_norm": 0.41908109188079834,
"learning_rate": 9.494994994994996e-05,
"loss": 0.9568,
"mean_token_accuracy": 0.678890322521329,
"num_tokens": 1078431.0,
"step": 14630
},
{
"epoch": 14.199757575757575,
"grad_norm": 0.878993809223175,
"learning_rate": 9.48998998998999e-05,
"loss": 0.9256,
"mean_token_accuracy": 0.6974173996597528,
"num_tokens": 1088662.0,
"step": 14640
},
{
"epoch": 14.209454545454545,
"grad_norm": 0.3703934848308563,
"learning_rate": 9.484984984984986e-05,
"loss": 0.9434,
"mean_token_accuracy": 0.6732165481895208,
"num_tokens": 1099362.0,
"step": 14650
},
{
"epoch": 14.219151515151514,
"grad_norm": 0.4467850625514984,
"learning_rate": 9.47997997997998e-05,
"loss": 0.9459,
"mean_token_accuracy": 0.6609635852277279,
"num_tokens": 1110047.0,
"step": 14660
},
{
"epoch": 14.228848484848484,
"grad_norm": 1.2241610288619995,
"learning_rate": 9.474974974974975e-05,
"loss": 0.9469,
"mean_token_accuracy": 0.6950714159756899,
"num_tokens": 1120542.0,
"step": 14670
},
{
"epoch": 14.238545454545454,
"grad_norm": 0.6757529973983765,
"learning_rate": 9.46996996996997e-05,
"loss": 0.9628,
"mean_token_accuracy": 0.6720464017242194,
"num_tokens": 1131343.0,
"step": 14680
},
{
"epoch": 14.248242424242424,
"grad_norm": 0.9918266534805298,
"learning_rate": 9.464964964964965e-05,
"loss": 0.9084,
"mean_token_accuracy": 0.6795904841274023,
"num_tokens": 1142007.0,
"step": 14690
},
{
"epoch": 14.257939393939393,
"grad_norm": 0.9975070953369141,
"learning_rate": 9.45995995995996e-05,
"loss": 0.806,
"mean_token_accuracy": 0.734311144053936,
"num_tokens": 1151794.0,
"step": 14700
},
{
"epoch": 14.267636363636363,
"grad_norm": 0.6164572238922119,
"learning_rate": 9.454954954954955e-05,
"loss": 0.8116,
"mean_token_accuracy": 0.7311961345374585,
"num_tokens": 1161276.0,
"step": 14710
},
{
"epoch": 14.277333333333333,
"grad_norm": 0.8973527550697327,
"learning_rate": 9.44994994994995e-05,
"loss": 0.8702,
"mean_token_accuracy": 0.7167054928839207,
"num_tokens": 1171332.0,
"step": 14720
},
{
"epoch": 14.287030303030303,
"grad_norm": 0.6523808240890503,
"learning_rate": 9.444944944944946e-05,
"loss": 0.9112,
"mean_token_accuracy": 0.6947918102145195,
"num_tokens": 1181393.0,
"step": 14730
},
{
"epoch": 14.296727272727273,
"grad_norm": 0.41433241963386536,
"learning_rate": 9.43993993993994e-05,
"loss": 0.9264,
"mean_token_accuracy": 0.6836030226200819,
"num_tokens": 1191806.0,
"step": 14740
},
{
"epoch": 14.306424242424242,
"grad_norm": 0.7625298500061035,
"learning_rate": 9.434934934934936e-05,
"loss": 0.8197,
"mean_token_accuracy": 0.7187630910426378,
"num_tokens": 1200839.0,
"step": 14750
},
{
"epoch": 14.316121212121212,
"grad_norm": 0.5743375420570374,
"learning_rate": 9.42992992992993e-05,
"loss": 0.9071,
"mean_token_accuracy": 0.6910372313112021,
"num_tokens": 1211129.0,
"step": 14760
},
{
"epoch": 14.325818181818182,
"grad_norm": 1.0408577919006348,
"learning_rate": 9.424924924924925e-05,
"loss": 0.9313,
"mean_token_accuracy": 0.687668776512146,
"num_tokens": 1221591.0,
"step": 14770
},
{
"epoch": 14.335515151515152,
"grad_norm": 0.8543786406517029,
"learning_rate": 9.41991991991992e-05,
"loss": 0.9029,
"mean_token_accuracy": 0.7062688145786524,
"num_tokens": 1231723.0,
"step": 14780
},
{
"epoch": 14.345212121212121,
"grad_norm": 0.5075017809867859,
"learning_rate": 9.414914914914915e-05,
"loss": 0.8447,
"mean_token_accuracy": 0.7158392701297999,
"num_tokens": 1241710.0,
"step": 14790
},
{
"epoch": 14.354909090909091,
"grad_norm": 1.1220818758010864,
"learning_rate": 9.40990990990991e-05,
"loss": 0.9342,
"mean_token_accuracy": 0.6817990552634001,
"num_tokens": 1251618.0,
"step": 14800
},
{
"epoch": 7.288492307692308,
"grad_norm": 0.6032423377037048,
"learning_rate": 9.845357679969794e-05,
"loss": 0.6822,
"mean_token_accuracy": 0.7867337457835675,
"num_tokens": 9312.0,
"step": 14810
},
{
"epoch": 7.293415384615384,
"grad_norm": 2.23929762840271,
"learning_rate": 9.842759302218645e-05,
"loss": 0.8016,
"mean_token_accuracy": 0.750348436832428,
"num_tokens": 18778.0,
"step": 14820
},
{
"epoch": 7.298338461538462,
"grad_norm": 0.8688719868659973,
"learning_rate": 9.840139624995212e-05,
"loss": 0.6881,
"mean_token_accuracy": 0.7714763689786196,
"num_tokens": 27387.0,
"step": 14830
},
{
"epoch": 7.3032615384615385,
"grad_norm": 0.6268885135650635,
"learning_rate": 9.837498659821384e-05,
"loss": 0.7321,
"mean_token_accuracy": 0.7611544221639633,
"num_tokens": 36938.0,
"step": 14840
},
{
"epoch": 7.308184615384615,
"grad_norm": 0.822592556476593,
"learning_rate": 9.834836418312681e-05,
"loss": 0.744,
"mean_token_accuracy": 0.7452987994998693,
"num_tokens": 45571.0,
"step": 14850
},
{
"epoch": 7.3131076923076925,
"grad_norm": 0.4749494194984436,
"learning_rate": 9.8321529121782e-05,
"loss": 0.7403,
"mean_token_accuracy": 0.750314911454916,
"num_tokens": 54765.0,
"step": 14860
},
{
"epoch": 7.318030769230769,
"grad_norm": 1.3120962381362915,
"learning_rate": 9.829448153220566e-05,
"loss": 0.761,
"mean_token_accuracy": 0.7358665529638528,
"num_tokens": 63751.0,
"step": 14870
},
{
"epoch": 7.322953846153846,
"grad_norm": 0.7016109228134155,
"learning_rate": 9.826722153335877e-05,
"loss": 0.7017,
"mean_token_accuracy": 0.7645948387682437,
"num_tokens": 71817.0,
"step": 14880
},
{
"epoch": 7.327876923076923,
"grad_norm": 0.5037406086921692,
"learning_rate": 9.82397492451365e-05,
"loss": 0.7157,
"mean_token_accuracy": 0.7650218937546015,
"num_tokens": 80510.0,
"step": 14890
},
{
"epoch": 7.3328,
"grad_norm": 0.6709319353103638,
"learning_rate": 9.821206478836775e-05,
"loss": 0.7248,
"mean_token_accuracy": 0.7560942731797695,
"num_tokens": 89412.0,
"step": 14900
},
{
"epoch": 7.337723076923077,
"grad_norm": 1.4935665130615234,
"learning_rate": 9.81841682848146e-05,
"loss": 0.7503,
"mean_token_accuracy": 0.7548914663493633,
"num_tokens": 99256.0,
"step": 14910
},
{
"epoch": 7.342646153846154,
"grad_norm": 0.44451966881752014,
"learning_rate": 9.815605985717171e-05,
"loss": 0.7185,
"mean_token_accuracy": 0.7600229732692242,
"num_tokens": 107641.0,
"step": 14920
},
{
"epoch": 7.34756923076923,
"grad_norm": 0.5159631371498108,
"learning_rate": 9.812773962906586e-05,
"loss": 0.7593,
"mean_token_accuracy": 0.7515256915241479,
"num_tokens": 116291.0,
"step": 14930
},
{
"epoch": 7.352492307692308,
"grad_norm": 1.3890159130096436,
"learning_rate": 9.809920772505532e-05,
"loss": 0.8097,
"mean_token_accuracy": 0.7170861914753914,
"num_tokens": 126012.0,
"step": 14940
},
{
"epoch": 7.3574153846153845,
"grad_norm": 1.5582915544509888,
"learning_rate": 9.807046427062944e-05,
"loss": 0.7585,
"mean_token_accuracy": 0.7490797568112612,
"num_tokens": 135364.0,
"step": 14950
},
{
"epoch": 7.362338461538462,
"grad_norm": 0.3708029091358185,
"learning_rate": 9.804150939220796e-05,
"loss": 0.7772,
"mean_token_accuracy": 0.7339643765240907,
"num_tokens": 143997.0,
"step": 14960
},
{
"epoch": 7.3672615384615385,
"grad_norm": 0.7632699012756348,
"learning_rate": 9.80123432171405e-05,
"loss": 0.7651,
"mean_token_accuracy": 0.7485666394233703,
"num_tokens": 153574.0,
"step": 14970
},
{
"epoch": 7.372184615384615,
"grad_norm": 1.4701080322265625,
"learning_rate": 9.798296587370603e-05,
"loss": 0.7292,
"mean_token_accuracy": 0.7644454840570688,
"num_tokens": 162637.0,
"step": 14980
},
{
"epoch": 7.377107692307693,
"grad_norm": 0.7957881093025208,
"learning_rate": 9.795337749111229e-05,
"loss": 0.8468,
"mean_token_accuracy": 0.7276211023330689,
"num_tokens": 173011.0,
"step": 14990
},
{
"epoch": 7.382030769230769,
"grad_norm": 0.41769880056381226,
"learning_rate": 9.792357819949518e-05,
"loss": 0.7238,
"mean_token_accuracy": 0.7681386031210422,
"num_tokens": 181909.0,
"step": 15000
},
{
"epoch": 7.386953846153846,
"grad_norm": 0.7403699159622192,
"learning_rate": 9.881224657674156e-05,
"loss": 0.7858,
"mean_token_accuracy": 0.7396802183240652,
"num_tokens": 8895.0,
"step": 15010
},
{
"epoch": 7.391876923076923,
"grad_norm": 0.6027011275291443,
"learning_rate": 9.879515199721796e-05,
"loss": 0.7274,
"mean_token_accuracy": 0.7529193755239248,
"num_tokens": 17998.0,
"step": 15020
},
{
"epoch": 7.3968,
"grad_norm": 0.4352588951587677,
"learning_rate": 9.87779367793514e-05,
"loss": 0.7908,
"mean_token_accuracy": 0.7491613268852234,
"num_tokens": 27846.0,
"step": 15030
},
{
"epoch": 7.401723076923077,
"grad_norm": 0.39712125062942505,
"learning_rate": 9.87606009657038e-05,
"loss": 0.7353,
"mean_token_accuracy": 0.7678960163146258,
"num_tokens": 36104.0,
"step": 15040
},
{
"epoch": 7.406646153846154,
"grad_norm": 0.5739689469337463,
"learning_rate": 9.874314459913522e-05,
"loss": 0.6803,
"mean_token_accuracy": 0.772222863510251,
"num_tokens": 44607.0,
"step": 15050
},
{
"epoch": 7.4115692307692305,
"grad_norm": 0.5296592116355896,
"learning_rate": 9.872556772280379e-05,
"loss": 0.6219,
"mean_token_accuracy": 0.7882269717752933,
"num_tokens": 52426.0,
"step": 15060
},
{
"epoch": 7.416492307692308,
"grad_norm": 0.7450407147407532,
"learning_rate": 9.870787038016557e-05,
"loss": 0.7046,
"mean_token_accuracy": 0.7562790676951409,
"num_tokens": 60835.0,
"step": 15070
},
{
"epoch": 7.4214153846153845,
"grad_norm": 0.8603422045707703,
"learning_rate": 9.869005261497446e-05,
"loss": 0.7464,
"mean_token_accuracy": 0.7453157220035791,
"num_tokens": 70309.0,
"step": 15080
},
{
"epoch": 7.426338461538462,
"grad_norm": 0.4799814820289612,
"learning_rate": 9.867211447128208e-05,
"loss": 0.8564,
"mean_token_accuracy": 0.7118423756211996,
"num_tokens": 80831.0,
"step": 15090
},
{
"epoch": 7.431261538461539,
"grad_norm": 0.38809236884117126,
"learning_rate": 9.865405599343768e-05,
"loss": 0.778,
"mean_token_accuracy": 0.729843546077609,
"num_tokens": 89878.0,
"step": 15100
},
{
"epoch": 7.436184615384615,
"grad_norm": 0.561546802520752,
"learning_rate": 9.863587722608799e-05,
"loss": 0.766,
"mean_token_accuracy": 0.736732891574502,
"num_tokens": 98413.0,
"step": 15110
},
{
"epoch": 7.441107692307693,
"grad_norm": 0.4035409986972809,
"learning_rate": 9.861757821417718e-05,
"loss": 0.6529,
"mean_token_accuracy": 0.7860307555645705,
"num_tokens": 106310.0,
"step": 15120
},
{
"epoch": 7.446030769230769,
"grad_norm": 1.2474324703216553,
"learning_rate": 9.859915900294666e-05,
"loss": 0.6801,
"mean_token_accuracy": 0.7747167505323886,
"num_tokens": 114567.0,
"step": 15130
},
{
"epoch": 7.450953846153846,
"grad_norm": 1.240290880203247,
"learning_rate": 9.858061963793503e-05,
"loss": 0.6493,
"mean_token_accuracy": 0.7812603395432234,
"num_tokens": 123149.0,
"step": 15140
},
{
"epoch": 7.455876923076923,
"grad_norm": 0.9319782853126526,
"learning_rate": 9.856196016497798e-05,
"loss": 0.8078,
"mean_token_accuracy": 0.7315979212522506,
"num_tokens": 132265.0,
"step": 15150
},
{
"epoch": 7.4608,
"grad_norm": 0.35292956233024597,
"learning_rate": 9.85431806302081e-05,
"loss": 0.7718,
"mean_token_accuracy": 0.7364303342998028,
"num_tokens": 141098.0,
"step": 15160
},
{
"epoch": 7.4657230769230765,
"grad_norm": 0.5348508358001709,
"learning_rate": 9.852428108005487e-05,
"loss": 0.7324,
"mean_token_accuracy": 0.7685822080820799,
"num_tokens": 150742.0,
"step": 15170
},
{
"epoch": 7.470646153846154,
"grad_norm": 0.9570394158363342,
"learning_rate": 9.850526156124442e-05,
"loss": 0.6739,
"mean_token_accuracy": 0.7785952746868133,
"num_tokens": 159095.0,
"step": 15180
},
{
"epoch": 7.4755692307692305,
"grad_norm": 1.611872673034668,
"learning_rate": 9.848612212079955e-05,
"loss": 0.7185,
"mean_token_accuracy": 0.7705470208078623,
"num_tokens": 167922.0,
"step": 15190
},
{
"epoch": 7.480492307692308,
"grad_norm": 0.5744756460189819,
"learning_rate": 9.846686280603948e-05,
"loss": 0.8469,
"mean_token_accuracy": 0.724059621617198,
"num_tokens": 177884.0,
"step": 15200
},
{
"epoch": 7.485415384615385,
"grad_norm": 0.42839816212654114,
"learning_rate": 9.844748366457988e-05,
"loss": 0.7499,
"mean_token_accuracy": 0.7528812907636165,
"num_tokens": 187133.0,
"step": 15210
},
{
"epoch": 7.490338461538461,
"grad_norm": 2.1280364990234375,
"learning_rate": 9.84279847443326e-05,
"loss": 0.7742,
"mean_token_accuracy": 0.7489132527261972,
"num_tokens": 196413.0,
"step": 15220
},
{
"epoch": 7.495261538461539,
"grad_norm": 0.35753366351127625,
"learning_rate": 9.840836609350567e-05,
"loss": 0.835,
"mean_token_accuracy": 0.7175555892288685,
"num_tokens": 206238.0,
"step": 15230
},
{
"epoch": 7.500184615384615,
"grad_norm": 0.925093412399292,
"learning_rate": 9.838862776060312e-05,
"loss": 0.7501,
"mean_token_accuracy": 0.7446019750088453,
"num_tokens": 215620.0,
"step": 15240
},
{
"epoch": 7.505107692307693,
"grad_norm": 0.6911622881889343,
"learning_rate": 9.836876979442489e-05,
"loss": 0.7261,
"mean_token_accuracy": 0.7689918410032988,
"num_tokens": 224928.0,
"step": 15250
},
{
"epoch": 7.510030769230769,
"grad_norm": 0.7005440592765808,
"learning_rate": 9.834879224406663e-05,
"loss": 0.7894,
"mean_token_accuracy": 0.741933236643672,
"num_tokens": 235020.0,
"step": 15260
},
{
"epoch": 7.514953846153846,
"grad_norm": 0.5132576823234558,
"learning_rate": 9.832869515891975e-05,
"loss": 0.7629,
"mean_token_accuracy": 0.7501115497201681,
"num_tokens": 244901.0,
"step": 15270
},
{
"epoch": 7.519876923076923,
"grad_norm": 0.4901637136936188,
"learning_rate": 9.83084785886711e-05,
"loss": 0.7618,
"mean_token_accuracy": 0.7478879150003195,
"num_tokens": 254306.0,
"step": 15280
},
{
"epoch": 7.5248,
"grad_norm": 0.6824623346328735,
"learning_rate": 9.828814258330298e-05,
"loss": 0.7023,
"mean_token_accuracy": 0.7611722193658352,
"num_tokens": 263006.0,
"step": 15290
},
{
"epoch": 7.5297230769230765,
"grad_norm": 0.4069543480873108,
"learning_rate": 9.826768719309298e-05,
"loss": 0.7126,
"mean_token_accuracy": 0.7572260867804289,
"num_tokens": 271687.0,
"step": 15300
},
{
"epoch": 7.534646153846154,
"grad_norm": 0.7551083564758301,
"learning_rate": 9.824711246861382e-05,
"loss": 0.8352,
"mean_token_accuracy": 0.718966668099165,
"num_tokens": 281372.0,
"step": 15310
},
{
"epoch": 7.539569230769231,
"grad_norm": 0.8479435443878174,
"learning_rate": 9.822641846073329e-05,
"loss": 0.8138,
"mean_token_accuracy": 0.752262394875288,
"num_tokens": 290553.0,
"step": 15320
},
{
"epoch": 7.544492307692308,
"grad_norm": 0.38278627395629883,
"learning_rate": 9.820560522061403e-05,
"loss": 0.7287,
"mean_token_accuracy": 0.7666766557842493,
"num_tokens": 299428.0,
"step": 15330
},
{
"epoch": 7.549415384615385,
"grad_norm": 0.7417807579040527,
"learning_rate": 9.818467279971355e-05,
"loss": 0.6453,
"mean_token_accuracy": 0.7891027696430684,
"num_tokens": 308217.0,
"step": 15340
},
{
"epoch": 7.554338461538461,
"grad_norm": 0.41675281524658203,
"learning_rate": 9.816362124978396e-05,
"loss": 0.703,
"mean_token_accuracy": 0.7679217629134655,
"num_tokens": 316520.0,
"step": 15350
},
{
"epoch": 7.559261538461539,
"grad_norm": 0.8314495086669922,
"learning_rate": 9.814245062287189e-05,
"loss": 0.6985,
"mean_token_accuracy": 0.7756699241697789,
"num_tokens": 325247.0,
"step": 15360
},
{
"epoch": 7.564184615384615,
"grad_norm": 0.5109190344810486,
"learning_rate": 9.812116097131839e-05,
"loss": 0.6479,
"mean_token_accuracy": 0.7915467619895935,
"num_tokens": 333857.0,
"step": 15370
},
{
"epoch": 7.569107692307693,
"grad_norm": 0.8507750630378723,
"learning_rate": 9.80997523477588e-05,
"loss": 0.6877,
"mean_token_accuracy": 0.7707390915602446,
"num_tokens": 343201.0,
"step": 15380
},
{
"epoch": 7.574030769230769,
"grad_norm": 0.4465511739253998,
"learning_rate": 9.807822480512256e-05,
"loss": 0.7341,
"mean_token_accuracy": 0.7457791332155466,
"num_tokens": 352232.0,
"step": 15390
},
{
"epoch": 7.578953846153846,
"grad_norm": 0.7074446082115173,
"learning_rate": 9.805657839663313e-05,
"loss": 0.5786,
"mean_token_accuracy": 0.7954732224345207,
"num_tokens": 360362.0,
"step": 15400
},
{
"epoch": 7.583876923076923,
"grad_norm": 0.4567805826663971,
"learning_rate": 9.803481317580788e-05,
"loss": 0.7394,
"mean_token_accuracy": 0.7533329404890537,
"num_tokens": 369312.0,
"step": 15410
},
{
"epoch": 7.5888,
"grad_norm": 0.4720822274684906,
"learning_rate": 9.801292919645786e-05,
"loss": 0.7422,
"mean_token_accuracy": 0.7545758258551359,
"num_tokens": 378787.0,
"step": 15420
},
{
"epoch": 7.593723076923077,
"grad_norm": 0.6811593770980835,
"learning_rate": 9.799092651268778e-05,
"loss": 0.7089,
"mean_token_accuracy": 0.755854606255889,
"num_tokens": 387085.0,
"step": 15430
},
{
"epoch": 7.598646153846154,
"grad_norm": 0.520389974117279,
"learning_rate": 9.796880517889583e-05,
"loss": 0.7357,
"mean_token_accuracy": 0.7585709065198898,
"num_tokens": 395607.0,
"step": 15440
},
{
"epoch": 7.603569230769231,
"grad_norm": 0.4988136291503906,
"learning_rate": 9.794656524977353e-05,
"loss": 0.7718,
"mean_token_accuracy": 0.7427222758531571,
"num_tokens": 404335.0,
"step": 15450
},
{
"epoch": 7.608492307692307,
"grad_norm": 0.4840397834777832,
"learning_rate": 9.792420678030559e-05,
"loss": 0.7027,
"mean_token_accuracy": 0.7715373657643795,
"num_tokens": 412789.0,
"step": 15460
},
{
"epoch": 7.613415384615385,
"grad_norm": 0.48264145851135254,
"learning_rate": 9.790172982576982e-05,
"loss": 0.7478,
"mean_token_accuracy": 0.7376698384061455,
"num_tokens": 421957.0,
"step": 15470
},
{
"epoch": 7.618338461538461,
"grad_norm": 0.5378937125205994,
"learning_rate": 9.787913444173696e-05,
"loss": 0.7276,
"mean_token_accuracy": 0.7619619213044644,
"num_tokens": 431082.0,
"step": 15480
},
{
"epoch": 7.623261538461539,
"grad_norm": 0.775534451007843,
"learning_rate": 9.785642068407055e-05,
"loss": 0.6669,
"mean_token_accuracy": 0.7788964670151473,
"num_tokens": 439416.0,
"step": 15490
},
{
"epoch": 7.628184615384615,
"grad_norm": 0.705254077911377,
"learning_rate": 9.783358860892679e-05,
"loss": 0.7338,
"mean_token_accuracy": 0.7540049366652966,
"num_tokens": 447426.0,
"step": 15500
},
{
"epoch": 7.633107692307692,
"grad_norm": 0.5129554271697998,
"learning_rate": 9.781063827275437e-05,
"loss": 0.7533,
"mean_token_accuracy": 0.747262655198574,
"num_tokens": 456215.0,
"step": 15510
},
{
"epoch": 7.638030769230769,
"grad_norm": 0.546363890171051,
"learning_rate": 9.778756973229441e-05,
"loss": 0.7179,
"mean_token_accuracy": 0.767508839443326,
"num_tokens": 465873.0,
"step": 15520
},
{
"epoch": 7.642953846153846,
"grad_norm": 0.5648319125175476,
"learning_rate": 9.776438304458025e-05,
"loss": 0.6624,
"mean_token_accuracy": 0.7714706733822823,
"num_tokens": 474390.0,
"step": 15530
},
{
"epoch": 7.6478769230769235,
"grad_norm": 0.5035734176635742,
"learning_rate": 9.774107826693731e-05,
"loss": 0.6713,
"mean_token_accuracy": 0.7714920256286859,
"num_tokens": 482861.0,
"step": 15540
},
{
"epoch": 7.6528,
"grad_norm": 0.5865426659584045,
"learning_rate": 9.771765545698303e-05,
"loss": 0.6718,
"mean_token_accuracy": 0.7740787465125323,
"num_tokens": 492284.0,
"step": 15550
},
{
"epoch": 7.657723076923077,
"grad_norm": 0.39083245396614075,
"learning_rate": 9.769411467262658e-05,
"loss": 0.6844,
"mean_token_accuracy": 0.7694638129323721,
"num_tokens": 501852.0,
"step": 15560
},
{
"epoch": 7.662646153846154,
"grad_norm": 0.5364481806755066,
"learning_rate": 9.767045597206888e-05,
"loss": 0.8126,
"mean_token_accuracy": 0.7328575398772955,
"num_tokens": 511967.0,
"step": 15570
},
{
"epoch": 7.667569230769231,
"grad_norm": 0.8996931910514832,
"learning_rate": 9.764667941380234e-05,
"loss": 0.731,
"mean_token_accuracy": 0.7515979178249836,
"num_tokens": 520643.0,
"step": 15580
},
{
"epoch": 7.672492307692307,
"grad_norm": 0.5694324374198914,
"learning_rate": 9.762278505661074e-05,
"loss": 0.7069,
"mean_token_accuracy": 0.7669325869530439,
"num_tokens": 529583.0,
"step": 15590
},
{
"epoch": 7.677415384615385,
"grad_norm": 0.7352235317230225,
"learning_rate": 9.759877295956916e-05,
"loss": 0.7426,
"mean_token_accuracy": 0.7516727082431316,
"num_tokens": 538607.0,
"step": 15600
},
{
"epoch": 7.682338461538461,
"grad_norm": 0.32605522871017456,
"learning_rate": 9.757464318204373e-05,
"loss": 0.7449,
"mean_token_accuracy": 0.7565869923681021,
"num_tokens": 8860.0,
"step": 15610
},
{
"epoch": 7.687261538461539,
"grad_norm": 1.012762188911438,
"learning_rate": 9.755039578369149e-05,
"loss": 0.771,
"mean_token_accuracy": 0.7340531777590513,
"num_tokens": 18651.0,
"step": 15620
},
{
"epoch": 7.692184615384615,
"grad_norm": 0.8538568615913391,
"learning_rate": 9.752603082446036e-05,
"loss": 0.7248,
"mean_token_accuracy": 0.7604099120944738,
"num_tokens": 27363.0,
"step": 15630
},
{
"epoch": 7.697107692307692,
"grad_norm": 0.6249682903289795,
"learning_rate": 9.750154836458887e-05,
"loss": 0.6874,
"mean_token_accuracy": 0.7692125029861927,
"num_tokens": 35912.0,
"step": 15640
},
{
"epoch": 7.7020307692307695,
"grad_norm": 0.8196687698364258,
"learning_rate": 9.747694846460605e-05,
"loss": 0.64,
"mean_token_accuracy": 0.7777061153203249,
"num_tokens": 44561.0,
"step": 15650
},
{
"epoch": 7.706953846153846,
"grad_norm": 0.4318842887878418,
"learning_rate": 9.745223118533127e-05,
"loss": 0.6814,
"mean_token_accuracy": 0.7579683996737003,
"num_tokens": 53007.0,
"step": 15660
},
{
"epoch": 7.7118769230769235,
"grad_norm": 0.3699759542942047,
"learning_rate": 9.742739658787414e-05,
"loss": 0.6928,
"mean_token_accuracy": 0.7668475016951561,
"num_tokens": 62417.0,
"step": 15670
},
{
"epoch": 7.7168,
"grad_norm": 0.4260357618331909,
"learning_rate": 9.740244473363426e-05,
"loss": 0.7704,
"mean_token_accuracy": 0.7583841320127249,
"num_tokens": 71808.0,
"step": 15680
},
{
"epoch": 7.721723076923077,
"grad_norm": 0.8471167087554932,
"learning_rate": 9.737737568430123e-05,
"loss": 0.6393,
"mean_token_accuracy": 0.7948539689183235,
"num_tokens": 80627.0,
"step": 15690
},
{
"epoch": 7.726646153846154,
"grad_norm": 0.3456837236881256,
"learning_rate": 9.735218950185428e-05,
"loss": 0.7253,
"mean_token_accuracy": 0.7614364203065633,
"num_tokens": 89544.0,
"step": 15700
},
{
"epoch": 7.731569230769231,
"grad_norm": 0.5443410277366638,
"learning_rate": 9.732688624856231e-05,
"loss": 0.6766,
"mean_token_accuracy": 0.774466859921813,
"num_tokens": 98452.0,
"step": 15710
},
{
"epoch": 7.736492307692307,
"grad_norm": 0.36821064352989197,
"learning_rate": 9.730146598698363e-05,
"loss": 0.7503,
"mean_token_accuracy": 0.7423054609447718,
"num_tokens": 108409.0,
"step": 15720
},
{
"epoch": 7.741415384615385,
"grad_norm": 0.8412238955497742,
"learning_rate": 9.727592877996585e-05,
"loss": 0.6721,
"mean_token_accuracy": 0.7684708528220654,
"num_tokens": 116974.0,
"step": 15730
},
{
"epoch": 7.746338461538461,
"grad_norm": 0.709597647190094,
"learning_rate": 9.725027469064568e-05,
"loss": 0.6988,
"mean_token_accuracy": 0.769633786380291,
"num_tokens": 125747.0,
"step": 15740
},
{
"epoch": 7.751261538461538,
"grad_norm": 0.3418969213962555,
"learning_rate": 9.722450378244884e-05,
"loss": 0.739,
"mean_token_accuracy": 0.7599714059382677,
"num_tokens": 134645.0,
"step": 15750
},
{
"epoch": 7.7561846153846155,
"grad_norm": 0.6444127559661865,
"learning_rate": 9.719861611908984e-05,
"loss": 0.7256,
"mean_token_accuracy": 0.7679268248379231,
"num_tokens": 144249.0,
"step": 15760
},
{
"epoch": 7.761107692307692,
"grad_norm": 0.303688108921051,
"learning_rate": 9.717261176457187e-05,
"loss": 0.8164,
"mean_token_accuracy": 0.7411212358623743,
"num_tokens": 153958.0,
"step": 15770
},
{
"epoch": 7.7660307692307695,
"grad_norm": 0.3446861505508423,
"learning_rate": 9.71464907831866e-05,
"loss": 0.7577,
"mean_token_accuracy": 0.7447476647794247,
"num_tokens": 163223.0,
"step": 15780
},
{
"epoch": 7.770953846153846,
"grad_norm": 1.021315097808838,
"learning_rate": 9.712025323951405e-05,
"loss": 0.8067,
"mean_token_accuracy": 0.7387041725218296,
"num_tokens": 172836.0,
"step": 15790
},
{
"epoch": 7.775876923076924,
"grad_norm": 0.47042426466941833,
"learning_rate": 9.709389919842244e-05,
"loss": 0.644,
"mean_token_accuracy": 0.7863428425043821,
"num_tokens": 180923.0,
"step": 15800
},
{
"epoch": 7.7808,
"grad_norm": 0.4498484432697296,
"learning_rate": 9.706742872506796e-05,
"loss": 0.7652,
"mean_token_accuracy": 0.7459516085684299,
"num_tokens": 189045.0,
"step": 15810
},
{
"epoch": 7.785723076923077,
"grad_norm": 0.3966211974620819,
"learning_rate": 9.704084188489473e-05,
"loss": 0.7547,
"mean_token_accuracy": 0.7628035910427571,
"num_tokens": 197908.0,
"step": 15820
},
{
"epoch": 7.790646153846154,
"grad_norm": 0.5024072527885437,
"learning_rate": 9.701413874363449e-05,
"loss": 0.6979,
"mean_token_accuracy": 0.7664535760879516,
"num_tokens": 207676.0,
"step": 15830
},
{
"epoch": 7.795569230769231,
"grad_norm": 0.9534441232681274,
"learning_rate": 9.698731936730662e-05,
"loss": 0.6927,
"mean_token_accuracy": 0.7581623613834381,
"num_tokens": 216190.0,
"step": 15840
},
{
"epoch": 7.800492307692307,
"grad_norm": 0.3889976143836975,
"learning_rate": 9.696038382221775e-05,
"loss": 0.7342,
"mean_token_accuracy": 0.758062494546175,
"num_tokens": 224885.0,
"step": 15850
},
{
"epoch": 7.805415384615385,
"grad_norm": 0.3749610185623169,
"learning_rate": 9.693333217496183e-05,
"loss": 0.7733,
"mean_token_accuracy": 0.7525778859853745,
"num_tokens": 234675.0,
"step": 15860
},
{
"epoch": 7.8103384615384615,
"grad_norm": 1.377591848373413,
"learning_rate": 9.690616449241976e-05,
"loss": 0.7902,
"mean_token_accuracy": 0.7485566444694995,
"num_tokens": 243966.0,
"step": 15870
},
{
"epoch": 7.815261538461538,
"grad_norm": 0.4818339943885803,
"learning_rate": 9.68788808417594e-05,
"loss": 0.7403,
"mean_token_accuracy": 0.7505327112972736,
"num_tokens": 253324.0,
"step": 15880
},
{
"epoch": 7.8201846153846155,
"grad_norm": 0.6426275372505188,
"learning_rate": 9.685148129043528e-05,
"loss": 0.7431,
"mean_token_accuracy": 0.7493322882801294,
"num_tokens": 261869.0,
"step": 15890
},
{
"epoch": 7.825107692307692,
"grad_norm": 0.6179720163345337,
"learning_rate": 9.682396590618848e-05,
"loss": 0.8594,
"mean_token_accuracy": 0.726296653598547,
"num_tokens": 271518.0,
"step": 15900
},
{
"epoch": 7.83003076923077,
"grad_norm": 0.7233896851539612,
"learning_rate": 9.679633475704645e-05,
"loss": 0.7503,
"mean_token_accuracy": 0.7537887316197157,
"num_tokens": 279856.0,
"step": 15910
},
{
"epoch": 7.834953846153846,
"grad_norm": 0.4409298002719879,
"learning_rate": 9.676858791132289e-05,
"loss": 0.6689,
"mean_token_accuracy": 0.7718529254198074,
"num_tokens": 288033.0,
"step": 15920
},
{
"epoch": 7.839876923076923,
"grad_norm": 1.0393766164779663,
"learning_rate": 9.674072543761747e-05,
"loss": 0.7102,
"mean_token_accuracy": 0.7685550011694431,
"num_tokens": 296825.0,
"step": 15930
},
{
"epoch": 7.8448,
"grad_norm": 0.4495505094528198,
"learning_rate": 9.671274740481584e-05,
"loss": 0.8089,
"mean_token_accuracy": 0.7236046094447375,
"num_tokens": 305764.0,
"step": 15940
},
{
"epoch": 7.849723076923077,
"grad_norm": 0.378525048494339,
"learning_rate": 9.668465388208923e-05,
"loss": 0.7541,
"mean_token_accuracy": 0.7619457546621561,
"num_tokens": 315045.0,
"step": 15950
},
{
"epoch": 7.854646153846154,
"grad_norm": 0.3276619613170624,
"learning_rate": 9.66564449388945e-05,
"loss": 0.7818,
"mean_token_accuracy": 0.7495603717863559,
"num_tokens": 324401.0,
"step": 15960
},
{
"epoch": 7.859569230769231,
"grad_norm": 0.6946415901184082,
"learning_rate": 9.66281206449738e-05,
"loss": 0.6496,
"mean_token_accuracy": 0.786292115598917,
"num_tokens": 332409.0,
"step": 15970
},
{
"epoch": 7.8644923076923074,
"grad_norm": 0.6143710613250732,
"learning_rate": 9.659968107035449e-05,
"loss": 0.7024,
"mean_token_accuracy": 0.7785773172974586,
"num_tokens": 341622.0,
"step": 15980
},
{
"epoch": 7.869415384615385,
"grad_norm": 1.626356601715088,
"learning_rate": 9.657112628534898e-05,
"loss": 0.7933,
"mean_token_accuracy": 0.7386716432869435,
"num_tokens": 350779.0,
"step": 15990
},
{
"epoch": 7.8743384615384615,
"grad_norm": 0.8346491456031799,
"learning_rate": 9.654245636055447e-05,
"loss": 0.6961,
"mean_token_accuracy": 0.7623608373105526,
"num_tokens": 360097.0,
"step": 16000
},
{
"epoch": 7.879261538461538,
"grad_norm": 0.5154821872711182,
"learning_rate": 9.651367136685283e-05,
"loss": 0.7421,
"mean_token_accuracy": 0.7435356438159942,
"num_tokens": 368657.0,
"step": 16010
},
{
"epoch": 7.884184615384616,
"grad_norm": 0.40335017442703247,
"learning_rate": 9.648477137541045e-05,
"loss": 0.7217,
"mean_token_accuracy": 0.7617222603410483,
"num_tokens": 376972.0,
"step": 16020
},
{
"epoch": 7.889107692307692,
"grad_norm": 0.3109897971153259,
"learning_rate": 9.645575645767802e-05,
"loss": 0.7785,
"mean_token_accuracy": 0.7252940777689219,
"num_tokens": 386691.0,
"step": 16030
},
{
"epoch": 7.894030769230769,
"grad_norm": 0.7454068064689636,
"learning_rate": 9.642662668539034e-05,
"loss": 0.7545,
"mean_token_accuracy": 0.7471344050019979,
"num_tokens": 395505.0,
"step": 16040
},
{
"epoch": 7.898953846153846,
"grad_norm": 0.37064892053604126,
"learning_rate": 9.63973821305662e-05,
"loss": 0.6351,
"mean_token_accuracy": 0.7928381565958261,
"num_tokens": 404112.0,
"step": 16050
},
{
"epoch": 7.903876923076923,
"grad_norm": 0.36622127890586853,
"learning_rate": 9.636802286550816e-05,
"loss": 0.7544,
"mean_token_accuracy": 0.7577709004282951,
"num_tokens": 413357.0,
"step": 16060
},
{
"epoch": 7.9088,
"grad_norm": 0.6665292978286743,
"learning_rate": 9.633854896280243e-05,
"loss": 0.7774,
"mean_token_accuracy": 0.743139598891139,
"num_tokens": 423588.0,
"step": 16070
},
{
"epoch": 7.913723076923077,
"grad_norm": 0.8473738431930542,
"learning_rate": 9.630896049531855e-05,
"loss": 0.7409,
"mean_token_accuracy": 0.7385849550366401,
"num_tokens": 432217.0,
"step": 16080
},
{
"epoch": 7.918646153846154,
"grad_norm": 1.1277004480361938,
"learning_rate": 9.627925753620939e-05,
"loss": 0.6382,
"mean_token_accuracy": 0.7997437328100204,
"num_tokens": 440454.0,
"step": 16090
},
{
"epoch": 7.923569230769231,
"grad_norm": 0.5827834010124207,
"learning_rate": 9.62494401589108e-05,
"loss": 0.7146,
"mean_token_accuracy": 0.7843764916062355,
"num_tokens": 449378.0,
"step": 16100
},
{
"epoch": 7.9284923076923075,
"grad_norm": 0.45561665296554565,
"learning_rate": 9.621950843714163e-05,
"loss": 0.7489,
"mean_token_accuracy": 0.755017938092351,
"num_tokens": 458985.0,
"step": 16110
},
{
"epoch": 7.933415384615385,
"grad_norm": 0.45355021953582764,
"learning_rate": 9.618946244490328e-05,
"loss": 0.7944,
"mean_token_accuracy": 0.7366019859910011,
"num_tokens": 467822.0,
"step": 16120
},
{
"epoch": 7.938338461538462,
"grad_norm": 0.45162439346313477,
"learning_rate": 9.61593022564798e-05,
"loss": 0.7063,
"mean_token_accuracy": 0.7567742951214314,
"num_tokens": 476100.0,
"step": 16130
},
{
"epoch": 7.943261538461538,
"grad_norm": 0.9754898548126221,
"learning_rate": 9.612902794643748e-05,
"loss": 0.6584,
"mean_token_accuracy": 0.780465978384018,
"num_tokens": 484368.0,
"step": 16140
},
{
"epoch": 7.948184615384616,
"grad_norm": 0.3318362832069397,
"learning_rate": 9.609863958962482e-05,
"loss": 0.6997,
"mean_token_accuracy": 0.7755364947021007,
"num_tokens": 493961.0,
"step": 16150
},
{
"epoch": 7.953107692307692,
"grad_norm": 0.435249388217926,
"learning_rate": 9.606813726117223e-05,
"loss": 0.5637,
"mean_token_accuracy": 0.7991742443293333,
"num_tokens": 501913.0,
"step": 16160
},
{
"epoch": 7.958030769230769,
"grad_norm": 0.43728408217430115,
"learning_rate": 9.603752103649194e-05,
"loss": 0.7412,
"mean_token_accuracy": 0.7628684055060149,
"num_tokens": 510392.0,
"step": 16170
},
{
"epoch": 7.962953846153846,
"grad_norm": 0.46618780493736267,
"learning_rate": 9.600679099127774e-05,
"loss": 0.7086,
"mean_token_accuracy": 0.7639894340187311,
"num_tokens": 519632.0,
"step": 16180
},
{
"epoch": 7.967876923076923,
"grad_norm": 0.35183632373809814,
"learning_rate": 9.597594720150485e-05,
"loss": 0.6746,
"mean_token_accuracy": 0.7732372462749482,
"num_tokens": 528385.0,
"step": 16190
},
{
"epoch": 7.9728,
"grad_norm": 0.42351534962654114,
"learning_rate": 9.59449897434297e-05,
"loss": 0.74,
"mean_token_accuracy": 0.7574392698705197,
"num_tokens": 537139.0,
"step": 16200
},
{
"epoch": 7.977723076923077,
"grad_norm": 0.451408326625824,
"learning_rate": 8.704204204204205e-05,
"loss": 0.739,
"mean_token_accuracy": 0.7462680261582136,
"num_tokens": 9224.0,
"step": 16210
},
{
"epoch": 7.9826461538461535,
"grad_norm": 0.519805908203125,
"learning_rate": 8.699199199199199e-05,
"loss": 0.727,
"mean_token_accuracy": 0.7574180524796248,
"num_tokens": 18701.0,
"step": 16220
},
{
"epoch": 7.987569230769231,
"grad_norm": 0.36464399099349976,
"learning_rate": 8.694194194194195e-05,
"loss": 0.6993,
"mean_token_accuracy": 0.7700857035815716,
"num_tokens": 27224.0,
"step": 16230
},
{
"epoch": 7.992492307692308,
"grad_norm": 0.2717822790145874,
"learning_rate": 8.68918918918919e-05,
"loss": 0.7204,
"mean_token_accuracy": 0.7610609702765941,
"num_tokens": 35640.0,
"step": 16240
},
{
"epoch": 7.997415384615385,
"grad_norm": 0.3014907240867615,
"learning_rate": 8.684184184184185e-05,
"loss": 0.8007,
"mean_token_accuracy": 0.7250883210450411,
"num_tokens": 45152.0,
"step": 16250
},
{
"epoch": 8.002461538461539,
"grad_norm": 0.4179680049419403,
"learning_rate": 8.67917917917918e-05,
"loss": 0.7608,
"mean_token_accuracy": 0.7735646199889299,
"num_tokens": 54246.0,
"step": 16260
},
{
"epoch": 8.007384615384616,
"grad_norm": 0.506325900554657,
"learning_rate": 8.674174174174175e-05,
"loss": 0.8325,
"mean_token_accuracy": 0.7233378864824772,
"num_tokens": 63913.0,
"step": 16270
},
{
"epoch": 8.012307692307692,
"grad_norm": 0.6368007063865662,
"learning_rate": 8.66916916916917e-05,
"loss": 0.7237,
"mean_token_accuracy": 0.7582856122404337,
"num_tokens": 72805.0,
"step": 16280
},
{
"epoch": 8.01723076923077,
"grad_norm": 0.45158663392066956,
"learning_rate": 8.664164164164165e-05,
"loss": 0.744,
"mean_token_accuracy": 0.7568928249180317,
"num_tokens": 81952.0,
"step": 16290
},
{
"epoch": 8.022153846153847,
"grad_norm": 0.8606657981872559,
"learning_rate": 8.659159159159159e-05,
"loss": 0.6465,
"mean_token_accuracy": 0.78743049018085,
"num_tokens": 90446.0,
"step": 16300
},
{
"epoch": 8.027076923076923,
"grad_norm": 0.8622094392776489,
"learning_rate": 8.654154154154155e-05,
"loss": 0.646,
"mean_token_accuracy": 0.7958425115793943,
"num_tokens": 99305.0,
"step": 16310
},
{
"epoch": 8.032,
"grad_norm": 0.37887170910835266,
"learning_rate": 8.649149149149149e-05,
"loss": 0.7942,
"mean_token_accuracy": 0.7280906450003386,
"num_tokens": 108733.0,
"step": 16320
},
{
"epoch": 8.036923076923078,
"grad_norm": 0.4614126980304718,
"learning_rate": 8.644144144144145e-05,
"loss": 0.7874,
"mean_token_accuracy": 0.7451971229165792,
"num_tokens": 118370.0,
"step": 16330
},
{
"epoch": 8.041846153846153,
"grad_norm": 0.5304930210113525,
"learning_rate": 8.639139139139139e-05,
"loss": 0.7829,
"mean_token_accuracy": 0.7486832808703184,
"num_tokens": 128043.0,
"step": 16340
},
{
"epoch": 8.04676923076923,
"grad_norm": 0.7120644450187683,
"learning_rate": 8.634134134134135e-05,
"loss": 0.8206,
"mean_token_accuracy": 0.7201329939067364,
"num_tokens": 138112.0,
"step": 16350
},
{
"epoch": 8.051692307692308,
"grad_norm": 0.40515926480293274,
"learning_rate": 8.62912912912913e-05,
"loss": 0.648,
"mean_token_accuracy": 0.7795034911483526,
"num_tokens": 146211.0,
"step": 16360
},
{
"epoch": 8.056615384615384,
"grad_norm": 0.5807082653045654,
"learning_rate": 8.624124124124125e-05,
"loss": 0.7214,
"mean_token_accuracy": 0.7683356497436762,
"num_tokens": 155212.0,
"step": 16370
},
{
"epoch": 8.061538461538461,
"grad_norm": 0.8227428793907166,
"learning_rate": 8.61911911911912e-05,
"loss": 0.7255,
"mean_token_accuracy": 0.7549647618085146,
"num_tokens": 164218.0,
"step": 16380
},
{
"epoch": 8.066461538461539,
"grad_norm": 0.3668994903564453,
"learning_rate": 8.614114114114115e-05,
"loss": 0.6535,
"mean_token_accuracy": 0.7847654249519109,
"num_tokens": 173078.0,
"step": 16390
},
{
"epoch": 8.071384615384616,
"grad_norm": 0.28024813532829285,
"learning_rate": 8.609109109109109e-05,
"loss": 0.6654,
"mean_token_accuracy": 0.7698544282466173,
"num_tokens": 181153.0,
"step": 16400
},
{
"epoch": 8.076307692307692,
"grad_norm": 0.328283429145813,
"learning_rate": 8.604104104104105e-05,
"loss": 0.6977,
"mean_token_accuracy": 0.7609763164073229,
"num_tokens": 189338.0,
"step": 16410
},
{
"epoch": 8.08123076923077,
"grad_norm": 0.7588334083557129,
"learning_rate": 8.599099099099099e-05,
"loss": 0.7558,
"mean_token_accuracy": 0.742162485793233,
"num_tokens": 198089.0,
"step": 16420
},
{
"epoch": 8.086153846153847,
"grad_norm": 0.3376314342021942,
"learning_rate": 8.594094094094095e-05,
"loss": 0.6729,
"mean_token_accuracy": 0.768299813196063,
"num_tokens": 206389.0,
"step": 16430
},
{
"epoch": 8.091076923076923,
"grad_norm": 0.5634762048721313,
"learning_rate": 8.589089089089089e-05,
"loss": 0.7448,
"mean_token_accuracy": 0.7404033329337836,
"num_tokens": 215926.0,
"step": 16440
},
{
"epoch": 8.096,
"grad_norm": 0.3723192811012268,
"learning_rate": 8.584084084084085e-05,
"loss": 0.7675,
"mean_token_accuracy": 0.7391767490655183,
"num_tokens": 225284.0,
"step": 16450
},
{
"epoch": 8.100923076923078,
"grad_norm": 0.3543316721916199,
"learning_rate": 8.57907907907908e-05,
"loss": 0.6199,
"mean_token_accuracy": 0.7883176296949387,
"num_tokens": 233644.0,
"step": 16460
},
{
"epoch": 8.105846153846153,
"grad_norm": 1.3809056282043457,
"learning_rate": 8.574074074074075e-05,
"loss": 0.6823,
"mean_token_accuracy": 0.7546458698809146,
"num_tokens": 242102.0,
"step": 16470
},
{
"epoch": 8.11076923076923,
"grad_norm": 0.4195917248725891,
"learning_rate": 8.56906906906907e-05,
"loss": 0.7027,
"mean_token_accuracy": 0.7585832923650742,
"num_tokens": 250978.0,
"step": 16480
},
{
"epoch": 8.115692307692308,
"grad_norm": 0.5387942790985107,
"learning_rate": 8.564064064064065e-05,
"loss": 0.785,
"mean_token_accuracy": 0.7469440281391144,
"num_tokens": 259569.0,
"step": 16490
},
{
"epoch": 8.120615384615384,
"grad_norm": 0.3012475371360779,
"learning_rate": 8.559059059059059e-05,
"loss": 0.8153,
"mean_token_accuracy": 0.7393761333078146,
"num_tokens": 269415.0,
"step": 16500
},
{
"epoch": 8.125538461538461,
"grad_norm": 0.8275740742683411,
"learning_rate": 8.554054054054055e-05,
"loss": 0.7829,
"mean_token_accuracy": 0.742312715575099,
"num_tokens": 278571.0,
"step": 16510
},
{
"epoch": 8.130461538461539,
"grad_norm": 1.3069651126861572,
"learning_rate": 8.549049049049049e-05,
"loss": 0.6394,
"mean_token_accuracy": 0.7816533345729113,
"num_tokens": 286512.0,
"step": 16520
},
{
"epoch": 8.135384615384615,
"grad_norm": 0.45634856820106506,
"learning_rate": 8.544044044044043e-05,
"loss": 0.7042,
"mean_token_accuracy": 0.765632963180542,
"num_tokens": 295790.0,
"step": 16530
},
{
"epoch": 8.140307692307692,
"grad_norm": 0.37332087755203247,
"learning_rate": 8.539039039039039e-05,
"loss": 0.6812,
"mean_token_accuracy": 0.7715237192809582,
"num_tokens": 304612.0,
"step": 16540
},
{
"epoch": 8.14523076923077,
"grad_norm": 0.3229140043258667,
"learning_rate": 8.534034034034033e-05,
"loss": 0.7512,
"mean_token_accuracy": 0.7543146207928657,
"num_tokens": 312878.0,
"step": 16550
},
{
"epoch": 8.150153846153847,
"grad_norm": 0.46332916617393494,
"learning_rate": 8.529029029029029e-05,
"loss": 0.7706,
"mean_token_accuracy": 0.7599172580987215,
"num_tokens": 322455.0,
"step": 16560
},
{
"epoch": 8.155076923076923,
"grad_norm": 0.3571588695049286,
"learning_rate": 8.524024024024025e-05,
"loss": 0.6311,
"mean_token_accuracy": 0.7764216579496861,
"num_tokens": 330287.0,
"step": 16570
},
{
"epoch": 8.16,
"grad_norm": 0.33986207842826843,
"learning_rate": 8.519019019019019e-05,
"loss": 0.7048,
"mean_token_accuracy": 0.7667416296899319,
"num_tokens": 339451.0,
"step": 16580
},
{
"epoch": 8.164923076923078,
"grad_norm": 0.4668309688568115,
"learning_rate": 8.514014014014015e-05,
"loss": 0.764,
"mean_token_accuracy": 0.757263046503067,
"num_tokens": 348412.0,
"step": 16590
},
{
"epoch": 8.169846153846153,
"grad_norm": 0.6498896479606628,
"learning_rate": 8.509009009009009e-05,
"loss": 0.7584,
"mean_token_accuracy": 0.7656078919768333,
"num_tokens": 357039.0,
"step": 16600
},
{
"epoch": 8.17476923076923,
"grad_norm": 0.7268086075782776,
"learning_rate": 8.504004004004005e-05,
"loss": 0.6747,
"mean_token_accuracy": 0.774444717913866,
"num_tokens": 365355.0,
"step": 16610
},
{
"epoch": 8.179692307692308,
"grad_norm": 0.29098740220069885,
"learning_rate": 8.498998998998999e-05,
"loss": 0.7868,
"mean_token_accuracy": 0.736617112159729,
"num_tokens": 374565.0,
"step": 16620
},
{
"epoch": 8.184615384615384,
"grad_norm": 0.453988254070282,
"learning_rate": 8.493993993993994e-05,
"loss": 0.7035,
"mean_token_accuracy": 0.7616805218160152,
"num_tokens": 383369.0,
"step": 16630
},
{
"epoch": 8.189538461538461,
"grad_norm": 0.5355010032653809,
"learning_rate": 8.488988988988989e-05,
"loss": 0.8054,
"mean_token_accuracy": 0.7372288048267365,
"num_tokens": 393060.0,
"step": 16640
},
{
"epoch": 8.194461538461539,
"grad_norm": 0.25265973806381226,
"learning_rate": 8.483983983983984e-05,
"loss": 0.6868,
"mean_token_accuracy": 0.7586379230022431,
"num_tokens": 401408.0,
"step": 16650
},
{
"epoch": 8.199384615384615,
"grad_norm": 0.3654129207134247,
"learning_rate": 8.478978978978979e-05,
"loss": 0.6787,
"mean_token_accuracy": 0.7611869160085917,
"num_tokens": 409918.0,
"step": 16660
},
{
"epoch": 8.204307692307692,
"grad_norm": 0.4879061281681061,
"learning_rate": 8.473973973973975e-05,
"loss": 0.6378,
"mean_token_accuracy": 0.7808506272733211,
"num_tokens": 418552.0,
"step": 16670
},
{
"epoch": 8.20923076923077,
"grad_norm": 0.32871031761169434,
"learning_rate": 8.468968968968969e-05,
"loss": 0.7656,
"mean_token_accuracy": 0.7485352344810963,
"num_tokens": 427532.0,
"step": 16680
},
{
"epoch": 8.214153846153847,
"grad_norm": 0.4512389600276947,
"learning_rate": 8.463963963963965e-05,
"loss": 0.6777,
"mean_token_accuracy": 0.7757456459105014,
"num_tokens": 435987.0,
"step": 16690
},
{
"epoch": 8.219076923076923,
"grad_norm": 0.698094367980957,
"learning_rate": 8.458958958958959e-05,
"loss": 0.5981,
"mean_token_accuracy": 0.7841473259031773,
"num_tokens": 444271.0,
"step": 16700
},
{
"epoch": 8.224,
"grad_norm": 0.5681586265563965,
"learning_rate": 8.453953953953955e-05,
"loss": 0.7459,
"mean_token_accuracy": 0.7463509045541287,
"num_tokens": 453153.0,
"step": 16710
},
{
"epoch": 8.228923076923078,
"grad_norm": 0.3863551914691925,
"learning_rate": 8.448948948948949e-05,
"loss": 0.7184,
"mean_token_accuracy": 0.7525065660476684,
"num_tokens": 462182.0,
"step": 16720
},
{
"epoch": 8.233846153846153,
"grad_norm": 1.4121780395507812,
"learning_rate": 8.443943943943944e-05,
"loss": 0.7869,
"mean_token_accuracy": 0.7282758131623268,
"num_tokens": 471829.0,
"step": 16730
},
{
"epoch": 8.23876923076923,
"grad_norm": 0.4115709662437439,
"learning_rate": 8.438938938938939e-05,
"loss": 0.6067,
"mean_token_accuracy": 0.7922206796705723,
"num_tokens": 479732.0,
"step": 16740
},
{
"epoch": 8.243692307692308,
"grad_norm": 0.35427096486091614,
"learning_rate": 8.433933933933934e-05,
"loss": 0.731,
"mean_token_accuracy": 0.7598383821547031,
"num_tokens": 488481.0,
"step": 16750
},
{
"epoch": 8.248615384615384,
"grad_norm": 0.4847518801689148,
"learning_rate": 8.428928928928929e-05,
"loss": 0.8492,
"mean_token_accuracy": 0.7316956970840692,
"num_tokens": 498612.0,
"step": 16760
},
{
"epoch": 8.253538461538461,
"grad_norm": 0.35778024792671204,
"learning_rate": 8.423923923923924e-05,
"loss": 0.8278,
"mean_token_accuracy": 0.7407127279788256,
"num_tokens": 508317.0,
"step": 16770
},
{
"epoch": 8.258461538461539,
"grad_norm": 0.4900796413421631,
"learning_rate": 8.418918918918919e-05,
"loss": 0.7169,
"mean_token_accuracy": 0.7744144190102815,
"num_tokens": 517266.0,
"step": 16780
},
{
"epoch": 8.263384615384615,
"grad_norm": 0.7427136898040771,
"learning_rate": 8.413913913913915e-05,
"loss": 0.721,
"mean_token_accuracy": 0.7743976633995772,
"num_tokens": 526031.0,
"step": 16790
},
{
"epoch": 8.268307692307692,
"grad_norm": 0.3626040518283844,
"learning_rate": 8.40890890890891e-05,
"loss": 0.7644,
"mean_token_accuracy": 0.7453425768762827,
"num_tokens": 535101.0,
"step": 16800
},
{
"epoch": 8.27323076923077,
"grad_norm": 0.3192290961742401,
"learning_rate": 8.403903903903905e-05,
"loss": 0.7191,
"mean_token_accuracy": 0.7587394848465919,
"num_tokens": 543850.0,
"step": 16810
},
{
"epoch": 8.278153846153845,
"grad_norm": 0.29766783118247986,
"learning_rate": 8.3988988988989e-05,
"loss": 0.7147,
"mean_token_accuracy": 0.7568968750536442,
"num_tokens": 552551.0,
"step": 16820
},
{
"epoch": 8.283076923076923,
"grad_norm": 0.42623892426490784,
"learning_rate": 8.393893893893894e-05,
"loss": 0.742,
"mean_token_accuracy": 0.752581474930048,
"num_tokens": 561647.0,
"step": 16830
},
{
"epoch": 8.288,
"grad_norm": 0.5091580152511597,
"learning_rate": 8.38888888888889e-05,
"loss": 0.74,
"mean_token_accuracy": 0.7456575892865658,
"num_tokens": 570203.0,
"step": 16840
},
{
"epoch": 8.292923076923078,
"grad_norm": 0.8799173831939697,
"learning_rate": 8.383883883883884e-05,
"loss": 0.7269,
"mean_token_accuracy": 0.7564969882369041,
"num_tokens": 578387.0,
"step": 16850
},
{
"epoch": 8.297846153846153,
"grad_norm": 0.4507330060005188,
"learning_rate": 8.37887887887888e-05,
"loss": 0.6896,
"mean_token_accuracy": 0.7675476286560297,
"num_tokens": 586819.0,
"step": 16860
},
{
"epoch": 8.302769230769231,
"grad_norm": 0.5583937168121338,
"learning_rate": 8.373873873873874e-05,
"loss": 0.7218,
"mean_token_accuracy": 0.7538196977227927,
"num_tokens": 595719.0,
"step": 16870
},
{
"epoch": 8.307692307692308,
"grad_norm": 0.30985692143440247,
"learning_rate": 8.36886886886887e-05,
"loss": 0.7591,
"mean_token_accuracy": 0.7350195806473494,
"num_tokens": 605157.0,
"step": 16880
},
{
"epoch": 8.312615384615384,
"grad_norm": 0.29996439814567566,
"learning_rate": 8.363863863863865e-05,
"loss": 0.8303,
"mean_token_accuracy": 0.72854442037642,
"num_tokens": 615318.0,
"step": 16890
},
{
"epoch": 8.317538461538462,
"grad_norm": 0.29050928354263306,
"learning_rate": 8.35885885885886e-05,
"loss": 0.6958,
"mean_token_accuracy": 0.7601312138140202,
"num_tokens": 623747.0,
"step": 16900
},
{
"epoch": 8.322461538461539,
"grad_norm": 0.7444137334823608,
"learning_rate": 8.353853853853855e-05,
"loss": 0.7091,
"mean_token_accuracy": 0.7593867909163237,
"num_tokens": 632061.0,
"step": 16910
},
{
"epoch": 8.327384615384615,
"grad_norm": 0.27352163195610046,
"learning_rate": 8.34884884884885e-05,
"loss": 0.5748,
"mean_token_accuracy": 0.8001094650477171,
"num_tokens": 639693.0,
"step": 16920
},
{
"epoch": 8.332307692307692,
"grad_norm": 0.31675222516059875,
"learning_rate": 8.343843843843844e-05,
"loss": 0.7142,
"mean_token_accuracy": 0.7502024855464697,
"num_tokens": 648616.0,
"step": 16930
},
{
"epoch": 8.33723076923077,
"grad_norm": 0.24953658878803253,
"learning_rate": 8.33883883883884e-05,
"loss": 0.7179,
"mean_token_accuracy": 0.7595278985798359,
"num_tokens": 658331.0,
"step": 16940
},
{
"epoch": 8.342153846153845,
"grad_norm": 0.29154184460639954,
"learning_rate": 8.333833833833834e-05,
"loss": 0.7491,
"mean_token_accuracy": 0.7431470949202776,
"num_tokens": 666914.0,
"step": 16950
},
{
"epoch": 8.347076923076923,
"grad_norm": 0.46732550859451294,
"learning_rate": 8.32882882882883e-05,
"loss": 0.6553,
"mean_token_accuracy": 0.7763445932418108,
"num_tokens": 675189.0,
"step": 16960
},
{
"epoch": 8.352,
"grad_norm": 0.6667472720146179,
"learning_rate": 8.323823823823824e-05,
"loss": 0.7915,
"mean_token_accuracy": 0.7494681358337403,
"num_tokens": 684818.0,
"step": 16970
},
{
"epoch": 8.356923076923078,
"grad_norm": 0.7695476412773132,
"learning_rate": 8.318818818818818e-05,
"loss": 0.6424,
"mean_token_accuracy": 0.7838353902101517,
"num_tokens": 693101.0,
"step": 16980
},
{
"epoch": 8.361846153846153,
"grad_norm": 0.48481637239456177,
"learning_rate": 8.313813813813814e-05,
"loss": 0.7594,
"mean_token_accuracy": 0.7688646581023931,
"num_tokens": 702267.0,
"step": 16990
},
{
"epoch": 8.366769230769231,
"grad_norm": 0.295489102602005,
"learning_rate": 8.30880880880881e-05,
"loss": 0.7447,
"mean_token_accuracy": 0.741809818893671,
"num_tokens": 710894.0,
"step": 17000
},
{
"epoch": 8.3712,
"grad_norm": 0.34440332651138306,
"learning_rate": 9.306748584382252e-05,
"loss": 0.6617,
"mean_token_accuracy": 0.7728524345904588,
"num_tokens": 8282.0,
"step": 17010
},
{
"epoch": 8.376123076923077,
"grad_norm": 0.46622994542121887,
"learning_rate": 9.302749347659147e-05,
"loss": 0.7989,
"mean_token_accuracy": 0.7468677569180727,
"num_tokens": 18349.0,
"step": 17020
},
{
"epoch": 8.381046153846153,
"grad_norm": 0.679972231388092,
"learning_rate": 9.298739473064651e-05,
"loss": 0.774,
"mean_token_accuracy": 0.748094291985035,
"num_tokens": 28331.0,
"step": 17030
},
{
"epoch": 8.38596923076923,
"grad_norm": 0.3294457495212555,
"learning_rate": 9.294718970512545e-05,
"loss": 0.7299,
"mean_token_accuracy": 0.755809823796153,
"num_tokens": 37071.0,
"step": 17040
},
{
"epoch": 8.390892307692308,
"grad_norm": 0.3379822373390198,
"learning_rate": 9.290687849942893e-05,
"loss": 0.7451,
"mean_token_accuracy": 0.7431266129016876,
"num_tokens": 45861.0,
"step": 17050
},
{
"epoch": 8.395815384615384,
"grad_norm": 0.3134821355342865,
"learning_rate": 9.286646121322004e-05,
"loss": 0.734,
"mean_token_accuracy": 0.7659897316247225,
"num_tokens": 55778.0,
"step": 17060
},
{
"epoch": 8.400738461538461,
"grad_norm": 1.9757238626480103,
"learning_rate": 9.282593794642423e-05,
"loss": 0.741,
"mean_token_accuracy": 0.760085154697299,
"num_tokens": 64336.0,
"step": 17070
},
{
"epoch": 8.405661538461539,
"grad_norm": 0.27846866846084595,
"learning_rate": 9.278530879922882e-05,
"loss": 0.6565,
"mean_token_accuracy": 0.7792624596506357,
"num_tokens": 72489.0,
"step": 17080
},
{
"epoch": 8.410584615384616,
"grad_norm": 0.30453068017959595,
"learning_rate": 9.274457387208305e-05,
"loss": 0.6348,
"mean_token_accuracy": 0.7794241864234209,
"num_tokens": 80730.0,
"step": 17090
},
{
"epoch": 8.415507692307692,
"grad_norm": 0.4652600884437561,
"learning_rate": 9.270373326569762e-05,
"loss": 0.6505,
"mean_token_accuracy": 0.7694964144378901,
"num_tokens": 88615.0,
"step": 17100
},
{
"epoch": 8.42043076923077,
"grad_norm": 0.28283053636550903,
"learning_rate": 9.266278708104448e-05,
"loss": 0.7432,
"mean_token_accuracy": 0.7504150871187448,
"num_tokens": 98314.0,
"step": 17110
},
{
"epoch": 8.425353846153847,
"grad_norm": 0.7553939819335938,
"learning_rate": 9.262173541935663e-05,
"loss": 0.818,
"mean_token_accuracy": 0.722613125666976,
"num_tokens": 108341.0,
"step": 17120
},
{
"epoch": 8.430276923076923,
"grad_norm": 0.4199792444705963,
"learning_rate": 9.25805783821279e-05,
"loss": 0.7948,
"mean_token_accuracy": 0.728472213447094,
"num_tokens": 118063.0,
"step": 17130
},
{
"epoch": 8.4352,
"grad_norm": 0.3130192756652832,
"learning_rate": 9.253931607111256e-05,
"loss": 0.773,
"mean_token_accuracy": 0.7356539122760296,
"num_tokens": 126793.0,
"step": 17140
},
{
"epoch": 8.440123076923078,
"grad_norm": 0.3275775909423828,
"learning_rate": 9.249794858832522e-05,
"loss": 0.6469,
"mean_token_accuracy": 0.7861156791448594,
"num_tokens": 134697.0,
"step": 17150
},
{
"epoch": 8.445046153846153,
"grad_norm": 0.3588021695613861,
"learning_rate": 9.245647603604042e-05,
"loss": 0.6775,
"mean_token_accuracy": 0.7681175690144301,
"num_tokens": 142697.0,
"step": 17160
},
{
"epoch": 8.44996923076923,
"grad_norm": 1.7960641384124756,
"learning_rate": 9.241489851679256e-05,
"loss": 0.6297,
"mean_token_accuracy": 0.787098852545023,
"num_tokens": 150829.0,
"step": 17170
},
{
"epoch": 8.454892307692308,
"grad_norm": 0.361743688583374,
"learning_rate": 9.237321613337552e-05,
"loss": 0.7567,
"mean_token_accuracy": 0.7540980920195579,
"num_tokens": 160250.0,
"step": 17180
},
{
"epoch": 8.459815384615384,
"grad_norm": 0.35826051235198975,
"learning_rate": 9.233142898884245e-05,
"loss": 0.7855,
"mean_token_accuracy": 0.7355491202324629,
"num_tokens": 169094.0,
"step": 17190
},
{
"epoch": 8.464738461538461,
"grad_norm": 0.46428659558296204,
"learning_rate": 9.228953718650548e-05,
"loss": 0.6995,
"mean_token_accuracy": 0.7721015859395266,
"num_tokens": 178014.0,
"step": 17200
},
{
"epoch": 8.469661538461539,
"grad_norm": 0.39374762773513794,
"learning_rate": 9.224754082993552e-05,
"loss": 0.6739,
"mean_token_accuracy": 0.7837085586041213,
"num_tokens": 187225.0,
"step": 17210
},
{
"epoch": 8.474584615384614,
"grad_norm": 0.5131353139877319,
"learning_rate": 9.220544002296194e-05,
"loss": 0.6725,
"mean_token_accuracy": 0.7798098236322403,
"num_tokens": 195918.0,
"step": 17220
},
{
"epoch": 8.479507692307692,
"grad_norm": 0.2942337095737457,
"learning_rate": 9.216323486967238e-05,
"loss": 0.8587,
"mean_token_accuracy": 0.7184354912489652,
"num_tokens": 205933.0,
"step": 17230
},
{
"epoch": 8.48443076923077,
"grad_norm": 0.6050145030021667,
"learning_rate": 9.212092547441246e-05,
"loss": 0.7515,
"mean_token_accuracy": 0.7520445462316274,
"num_tokens": 215316.0,
"step": 17240
},
{
"epoch": 8.489353846153847,
"grad_norm": 0.34459981322288513,
"learning_rate": 9.207851194178548e-05,
"loss": 0.7056,
"mean_token_accuracy": 0.7616019807755947,
"num_tokens": 224106.0,
"step": 17250
},
{
"epoch": 8.494276923076923,
"grad_norm": 0.3424946963787079,
"learning_rate": 9.203599437665226e-05,
"loss": 0.8357,
"mean_token_accuracy": 0.7299704484641552,
"num_tokens": 234099.0,
"step": 17260
},
{
"epoch": 8.4992,
"grad_norm": 0.3472613990306854,
"learning_rate": 9.19933728841308e-05,
"loss": 0.7371,
"mean_token_accuracy": 0.739522896334529,
"num_tokens": 243567.0,
"step": 17270
},
{
"epoch": 8.504123076923078,
"grad_norm": 0.4420841634273529,
"learning_rate": 9.1950647569596e-05,
"loss": 0.7009,
"mean_token_accuracy": 0.775427482649684,
"num_tokens": 252398.0,
"step": 17280
},
{
"epoch": 8.509046153846153,
"grad_norm": 0.42884570360183716,
"learning_rate": 9.19078185386795e-05,
"loss": 0.7443,
"mean_token_accuracy": 0.7564741510897874,
"num_tokens": 262701.0,
"step": 17290
},
{
"epoch": 8.51396923076923,
"grad_norm": 0.31011125445365906,
"learning_rate": 9.186488589726937e-05,
"loss": 0.7856,
"mean_token_accuracy": 0.7426570508629083,
"num_tokens": 272557.0,
"step": 17300
},
{
"epoch": 8.518892307692308,
"grad_norm": 0.3713008463382721,
"learning_rate": 9.18218497515098e-05,
"loss": 0.7374,
"mean_token_accuracy": 0.7558617364615202,
"num_tokens": 282227.0,
"step": 17310
},
{
"epoch": 8.523815384615384,
"grad_norm": 0.3813200891017914,
"learning_rate": 9.17787102078009e-05,
"loss": 0.7099,
"mean_token_accuracy": 0.761194471269846,
"num_tokens": 291044.0,
"step": 17320
},
{
"epoch": 8.528738461538461,
"grad_norm": 0.4389830529689789,
"learning_rate": 9.17354673727984e-05,
"loss": 0.6955,
"mean_token_accuracy": 0.7634035963565111,
"num_tokens": 299331.0,
"step": 17330
},
{
"epoch": 8.533661538461539,
"grad_norm": 0.26211830973625183,
"learning_rate": 9.169212135341343e-05,
"loss": 0.8433,
"mean_token_accuracy": 0.7042404491454363,
"num_tokens": 309303.0,
"step": 17340
},
{
"epoch": 8.538584615384615,
"grad_norm": 0.3813954293727875,
"learning_rate": 9.164867225681219e-05,
"loss": 0.7424,
"mean_token_accuracy": 0.7665748696774244,
"num_tokens": 318409.0,
"step": 17350
},
{
"epoch": 8.543507692307692,
"grad_norm": 0.2562119960784912,
"learning_rate": 9.160512019041577e-05,
"loss": 0.7056,
"mean_token_accuracy": 0.7714785143733025,
"num_tokens": 326915.0,
"step": 17360
},
{
"epoch": 8.54843076923077,
"grad_norm": 0.7329946160316467,
"learning_rate": 9.156146526189975e-05,
"loss": 0.6707,
"mean_token_accuracy": 0.7758157294243574,
"num_tokens": 335886.0,
"step": 17370
},
{
"epoch": 8.553353846153847,
"grad_norm": 0.7640717029571533,
"learning_rate": 9.151770757919414e-05,
"loss": 0.6965,
"mean_token_accuracy": 0.7744528673589229,
"num_tokens": 344820.0,
"step": 17380
},
{
"epoch": 8.558276923076923,
"grad_norm": 0.6143106818199158,
"learning_rate": 9.147384725048292e-05,
"loss": 0.6567,
"mean_token_accuracy": 0.7768244970589876,
"num_tokens": 353154.0,
"step": 17390
},
{
"epoch": 8.5632,
"grad_norm": 0.7920124530792236,
"learning_rate": 9.142988438420383e-05,
"loss": 0.6259,
"mean_token_accuracy": 0.8012331046164036,
"num_tokens": 361761.0,
"step": 17400
},
{
"epoch": 8.568123076923078,
"grad_norm": 0.6481062769889832,
"learning_rate": 9.138581908904818e-05,
"loss": 0.7013,
"mean_token_accuracy": 0.7641973450779915,
"num_tokens": 371035.0,
"step": 17410
},
{
"epoch": 8.573046153846153,
"grad_norm": 1.1524738073349,
"learning_rate": 9.134165147396045e-05,
"loss": 0.7164,
"mean_token_accuracy": 0.7598775941878557,
"num_tokens": 380223.0,
"step": 17420
},
{
"epoch": 8.57796923076923,
"grad_norm": 0.4807913601398468,
"learning_rate": 9.129738164813814e-05,
"loss": 0.5843,
"mean_token_accuracy": 0.7955730833113194,
"num_tokens": 388359.0,
"step": 17430
},
{
"epoch": 8.582892307692308,
"grad_norm": 0.35724836587905884,
"learning_rate": 9.125300972103146e-05,
"loss": 0.6942,
"mean_token_accuracy": 0.7595485664904118,
"num_tokens": 396973.0,
"step": 17440
},
{
"epoch": 8.587815384615384,
"grad_norm": 0.42613887786865234,
"learning_rate": 9.120853580234299e-05,
"loss": 0.7747,
"mean_token_accuracy": 0.7444280967116356,
"num_tokens": 406644.0,
"step": 17450
},
{
"epoch": 8.592738461538461,
"grad_norm": 0.44058769941329956,
"learning_rate": 9.116396000202752e-05,
"loss": 0.6742,
"mean_token_accuracy": 0.771963307633996,
"num_tokens": 415159.0,
"step": 17460
},
{
"epoch": 8.597661538461539,
"grad_norm": 0.8332029581069946,
"learning_rate": 9.111928243029171e-05,
"loss": 0.7305,
"mean_token_accuracy": 0.7594617635011673,
"num_tokens": 423770.0,
"step": 17470
},
{
"epoch": 8.602584615384615,
"grad_norm": 0.4663828909397125,
"learning_rate": 9.107450319759382e-05,
"loss": 0.7572,
"mean_token_accuracy": 0.7509522173553705,
"num_tokens": 432396.0,
"step": 17480
},
{
"epoch": 8.607507692307692,
"grad_norm": 0.34183964133262634,
"learning_rate": 9.102962241464348e-05,
"loss": 0.7106,
"mean_token_accuracy": 0.7614024080336094,
"num_tokens": 441108.0,
"step": 17490
},
{
"epoch": 8.61243076923077,
"grad_norm": 0.42849647998809814,
"learning_rate": 9.098464019240138e-05,
"loss": 0.6806,
"mean_token_accuracy": 0.7648319080471992,
"num_tokens": 449634.0,
"step": 17500
},
{
"epoch": 8.617353846153847,
"grad_norm": 0.3230085074901581,
"learning_rate": 9.093955664207895e-05,
"loss": 0.7568,
"mean_token_accuracy": 0.7501963946968317,
"num_tokens": 459276.0,
"step": 17510
},
{
"epoch": 8.622276923076923,
"grad_norm": 0.46625813841819763,
"learning_rate": 9.089437187513821e-05,
"loss": 0.6697,
"mean_token_accuracy": 0.7733147449791431,
"num_tokens": 467354.0,
"step": 17520
},
{
"epoch": 8.6272,
"grad_norm": 0.34277865290641785,
"learning_rate": 9.08490860032914e-05,
"loss": 0.6949,
"mean_token_accuracy": 0.7638208650052547,
"num_tokens": 475322.0,
"step": 17530
},
{
"epoch": 8.632123076923078,
"grad_norm": 0.7064818739891052,
"learning_rate": 9.080369913850072e-05,
"loss": 0.7679,
"mean_token_accuracy": 0.7412798043340445,
"num_tokens": 484199.0,
"step": 17540
},
{
"epoch": 8.637046153846153,
"grad_norm": 0.3127409815788269,
"learning_rate": 9.075821139297805e-05,
"loss": 0.7316,
"mean_token_accuracy": 0.7602858003228903,
"num_tokens": 493997.0,
"step": 17550
},
{
"epoch": 8.64196923076923,
"grad_norm": 0.7649181485176086,
"learning_rate": 9.071262287918467e-05,
"loss": 0.6458,
"mean_token_accuracy": 0.7748285502195358,
"num_tokens": 502572.0,
"step": 17560
},
{
"epoch": 8.646892307692308,
"grad_norm": 0.3664408326148987,
"learning_rate": 9.066693370983105e-05,
"loss": 0.639,
"mean_token_accuracy": 0.7795850615948439,
"num_tokens": 511053.0,
"step": 17570
},
{
"epoch": 8.651815384615384,
"grad_norm": 0.8689625263214111,
"learning_rate": 9.062114399787647e-05,
"loss": 0.6433,
"mean_token_accuracy": 0.7788681592792273,
"num_tokens": 519715.0,
"step": 17580
},
{
"epoch": 8.656738461538461,
"grad_norm": 0.5213949084281921,
"learning_rate": 9.057525385652878e-05,
"loss": 0.6952,
"mean_token_accuracy": 0.7548884745687247,
"num_tokens": 529725.0,
"step": 17590
},
{
"epoch": 8.661661538461539,
"grad_norm": 0.3532446622848511,
"learning_rate": 9.052926339924413e-05,
"loss": 0.7587,
"mean_token_accuracy": 0.746376433596015,
"num_tokens": 539374.0,
"step": 17600
},
{
"epoch": 8.666584615384615,
"grad_norm": 0.36963027715682983,
"learning_rate": 9.048317273972675e-05,
"loss": 0.7293,
"mean_token_accuracy": 0.758986271545291,
"num_tokens": 548541.0,
"step": 17610
},
{
"epoch": 8.671507692307692,
"grad_norm": 0.3961709141731262,
"learning_rate": 9.043698199192849e-05,
"loss": 0.7,
"mean_token_accuracy": 0.767149792611599,
"num_tokens": 557495.0,
"step": 17620
},
{
"epoch": 8.67643076923077,
"grad_norm": 0.24584902822971344,
"learning_rate": 9.039069127004875e-05,
"loss": 0.7539,
"mean_token_accuracy": 0.7440818291157484,
"num_tokens": 566503.0,
"step": 17630
},
{
"epoch": 8.681353846153847,
"grad_norm": 0.2721676230430603,
"learning_rate": 9.034430068853405e-05,
"loss": 0.7589,
"mean_token_accuracy": 0.7534482311457396,
"num_tokens": 575670.0,
"step": 17640
},
{
"epoch": 8.686276923076923,
"grad_norm": 0.38591504096984863,
"learning_rate": 9.029781036207781e-05,
"loss": 0.6774,
"mean_token_accuracy": 0.7606659393757582,
"num_tokens": 585086.0,
"step": 17650
},
{
"epoch": 8.6912,
"grad_norm": 0.49064919352531433,
"learning_rate": 9.025122040562007e-05,
"loss": 0.7503,
"mean_token_accuracy": 0.7477210737764836,
"num_tokens": 593883.0,
"step": 17660
},
{
"epoch": 8.696123076923078,
"grad_norm": 0.36603498458862305,
"learning_rate": 9.020453093434714e-05,
"loss": 0.6822,
"mean_token_accuracy": 0.767727042734623,
"num_tokens": 602034.0,
"step": 17670
},
{
"epoch": 8.701046153846153,
"grad_norm": 0.5092780590057373,
"learning_rate": 9.015774206369143e-05,
"loss": 0.6706,
"mean_token_accuracy": 0.7680165067315101,
"num_tokens": 611455.0,
"step": 17680
},
{
"epoch": 8.705969230769231,
"grad_norm": 0.33634933829307556,
"learning_rate": 9.011085390933105e-05,
"loss": 0.6266,
"mean_token_accuracy": 0.7781741376966238,
"num_tokens": 619532.0,
"step": 17690
},
{
"epoch": 8.710892307692308,
"grad_norm": 0.35559213161468506,
"learning_rate": 9.00638665871896e-05,
"loss": 0.6879,
"mean_token_accuracy": 0.7744422752410174,
"num_tokens": 628842.0,
"step": 17700
},
{
"epoch": 8.715815384615384,
"grad_norm": 0.2505728602409363,
"learning_rate": 9.001678021343586e-05,
"loss": 0.7787,
"mean_token_accuracy": 0.7485674019902945,
"num_tokens": 638729.0,
"step": 17710
},
{
"epoch": 8.720738461538462,
"grad_norm": 0.6747182011604309,
"learning_rate": 8.996959490448346e-05,
"loss": 0.6124,
"mean_token_accuracy": 0.7967943239957094,
"num_tokens": 647220.0,
"step": 17720
},
{
"epoch": 8.725661538461539,
"grad_norm": 0.24621552228927612,
"learning_rate": 8.992231077699067e-05,
"loss": 0.6561,
"mean_token_accuracy": 0.7866641227155924,
"num_tokens": 655677.0,
"step": 17730
},
{
"epoch": 8.730584615384615,
"grad_norm": 0.26347342133522034,
"learning_rate": 8.987492794786006e-05,
"loss": 0.7491,
"mean_token_accuracy": 0.7501132309436798,
"num_tokens": 665125.0,
"step": 17740
},
{
"epoch": 8.735507692307692,
"grad_norm": 0.32913926243782043,
"learning_rate": 8.982744653423825e-05,
"loss": 0.7115,
"mean_token_accuracy": 0.7552514169365168,
"num_tokens": 674914.0,
"step": 17750
},
{
"epoch": 8.74043076923077,
"grad_norm": 0.37196052074432373,
"learning_rate": 8.977986665351552e-05,
"loss": 0.6786,
"mean_token_accuracy": 0.7645082645118236,
"num_tokens": 683568.0,
"step": 17760
},
{
"epoch": 8.745353846153845,
"grad_norm": 0.7010545134544373,
"learning_rate": 8.97321884233257e-05,
"loss": 0.7045,
"mean_token_accuracy": 0.7673761691898108,
"num_tokens": 692549.0,
"step": 17770
},
{
"epoch": 8.750276923076923,
"grad_norm": 0.39983436465263367,
"learning_rate": 8.96844119615457e-05,
"loss": 0.6805,
"mean_token_accuracy": 0.7654839035123586,
"num_tokens": 700969.0,
"step": 17780
},
{
"epoch": 8.7552,
"grad_norm": 0.3482830822467804,
"learning_rate": 8.96365373862953e-05,
"loss": 0.7268,
"mean_token_accuracy": 0.7699950773268938,
"num_tokens": 710766.0,
"step": 17790
},
{
"epoch": 8.760123076923076,
"grad_norm": 0.7171288132667542,
"learning_rate": 8.958856481593687e-05,
"loss": 0.7709,
"mean_token_accuracy": 0.7531832829117775,
"num_tokens": 720097.0,
"step": 17800
},
{
"epoch": 8.765046153846153,
"grad_norm": 0.3223002552986145,
"learning_rate": 8.954049436907506e-05,
"loss": 0.7723,
"mean_token_accuracy": 0.7442610811442136,
"num_tokens": 9539.0,
"step": 17810
},
{
"epoch": 8.769969230769231,
"grad_norm": 0.2629016935825348,
"learning_rate": 8.949232616455647e-05,
"loss": 0.7714,
"mean_token_accuracy": 0.7383145179599524,
"num_tokens": 19114.0,
"step": 17820
},
{
"epoch": 8.774892307692308,
"grad_norm": 0.6586378216743469,
"learning_rate": 8.944406032146944e-05,
"loss": 0.642,
"mean_token_accuracy": 0.7885478623211384,
"num_tokens": 27435.0,
"step": 17830
},
{
"epoch": 8.779815384615384,
"grad_norm": 0.2907133400440216,
"learning_rate": 8.939569695914367e-05,
"loss": 0.7391,
"mean_token_accuracy": 0.748593881353736,
"num_tokens": 35485.0,
"step": 17840
},
{
"epoch": 8.784738461538462,
"grad_norm": 0.28475967049598694,
"learning_rate": 8.934723619714996e-05,
"loss": 0.7719,
"mean_token_accuracy": 0.7494703732430935,
"num_tokens": 44542.0,
"step": 17850
},
{
"epoch": 8.789661538461539,
"grad_norm": 0.47861045598983765,
"learning_rate": 8.929867815529993e-05,
"loss": 0.6828,
"mean_token_accuracy": 0.7696808248758316,
"num_tokens": 53560.0,
"step": 17860
},
{
"epoch": 8.794584615384615,
"grad_norm": 0.25639232993125916,
"learning_rate": 8.925002295364571e-05,
"loss": 0.7176,
"mean_token_accuracy": 0.7549582026898861,
"num_tokens": 62836.0,
"step": 17870
},
{
"epoch": 8.799507692307692,
"grad_norm": 0.27395716309547424,
"learning_rate": 8.920127071247963e-05,
"loss": 0.7167,
"mean_token_accuracy": 0.7506252504885197,
"num_tokens": 71377.0,
"step": 17880
},
{
"epoch": 8.80443076923077,
"grad_norm": 0.26782044768333435,
"learning_rate": 8.915242155233396e-05,
"loss": 0.7433,
"mean_token_accuracy": 0.7559556499123573,
"num_tokens": 80539.0,
"step": 17890
},
{
"epoch": 8.809353846153845,
"grad_norm": 0.31977975368499756,
"learning_rate": 8.910347559398056e-05,
"loss": 0.7916,
"mean_token_accuracy": 0.7560835804790258,
"num_tokens": 90708.0,
"step": 17900
},
{
"epoch": 8.814276923076923,
"grad_norm": 0.492887407541275,
"learning_rate": 8.905443295843061e-05,
"loss": 0.6752,
"mean_token_accuracy": 0.7661668874323369,
"num_tokens": 99271.0,
"step": 17910
},
{
"epoch": 8.8192,
"grad_norm": 0.32388588786125183,
"learning_rate": 8.900529376693434e-05,
"loss": 0.7657,
"mean_token_accuracy": 0.7424514323472977,
"num_tokens": 107993.0,
"step": 17920
},
{
"epoch": 8.824123076923076,
"grad_norm": 0.4508485496044159,
"learning_rate": 8.895605814098064e-05,
"loss": 0.8702,
"mean_token_accuracy": 0.7194077134132385,
"num_tokens": 118063.0,
"step": 17930
},
{
"epoch": 8.829046153846154,
"grad_norm": 0.3053992986679077,
"learning_rate": 8.89067262022969e-05,
"loss": 0.6951,
"mean_token_accuracy": 0.7752657104283571,
"num_tokens": 126208.0,
"step": 17940
},
{
"epoch": 8.833969230769231,
"grad_norm": 0.3835429251194,
"learning_rate": 8.885729807284856e-05,
"loss": 0.7114,
"mean_token_accuracy": 0.7549926679581404,
"num_tokens": 134733.0,
"step": 17950
},
{
"epoch": 8.838892307692308,
"grad_norm": 0.21046239137649536,
"learning_rate": 8.880777387483888e-05,
"loss": 0.7411,
"mean_token_accuracy": 0.7572793487459422,
"num_tokens": 143481.0,
"step": 17960
},
{
"epoch": 8.843815384615384,
"grad_norm": 0.2608044445514679,
"learning_rate": 8.875815373070868e-05,
"loss": 0.7923,
"mean_token_accuracy": 0.7279406886547803,
"num_tokens": 152666.0,
"step": 17970
},
{
"epoch": 8.848738461538462,
"grad_norm": 0.35140207409858704,
"learning_rate": 8.870843776313598e-05,
"loss": 0.7014,
"mean_token_accuracy": 0.7720128271728754,
"num_tokens": 161509.0,
"step": 17980
},
{
"epoch": 8.85366153846154,
"grad_norm": 0.2577463984489441,
"learning_rate": 8.865862609503566e-05,
"loss": 0.7647,
"mean_token_accuracy": 0.7505464531481266,
"num_tokens": 170704.0,
"step": 17990
},
{
"epoch": 8.858584615384615,
"grad_norm": 0.4267882704734802,
"learning_rate": 8.860871884955925e-05,
"loss": 0.694,
"mean_token_accuracy": 0.7794535614550113,
"num_tokens": 179267.0,
"step": 18000
},
{
"epoch": 8.863507692307692,
"grad_norm": 0.3592469394207001,
"learning_rate": 8.855871615009459e-05,
"loss": 0.6028,
"mean_token_accuracy": 0.8042227383702993,
"num_tokens": 187517.0,
"step": 18010
},
{
"epoch": 8.86843076923077,
"grad_norm": 0.30718037486076355,
"learning_rate": 8.850861812026548e-05,
"loss": 0.8283,
"mean_token_accuracy": 0.7271805927157402,
"num_tokens": 197279.0,
"step": 18020
},
{
"epoch": 8.873353846153845,
"grad_norm": 0.429610937833786,
"learning_rate": 8.845842488393141e-05,
"loss": 0.7049,
"mean_token_accuracy": 0.7623873326927424,
"num_tokens": 206305.0,
"step": 18030
},
{
"epoch": 8.878276923076923,
"grad_norm": 0.3396170139312744,
"learning_rate": 8.840813656518728e-05,
"loss": 0.7685,
"mean_token_accuracy": 0.7294337477535009,
"num_tokens": 215493.0,
"step": 18040
},
{
"epoch": 8.8832,
"grad_norm": 0.2978787124156952,
"learning_rate": 8.835775328836306e-05,
"loss": 0.658,
"mean_token_accuracy": 0.7802751030772924,
"num_tokens": 223108.0,
"step": 18050
},
{
"epoch": 8.888123076923076,
"grad_norm": 0.3851058781147003,
"learning_rate": 8.830727517802347e-05,
"loss": 0.7847,
"mean_token_accuracy": 0.7197605889290571,
"num_tokens": 233031.0,
"step": 18060
},
{
"epoch": 8.893046153846154,
"grad_norm": 0.8238245248794556,
"learning_rate": 8.82567023589677e-05,
"loss": 0.7422,
"mean_token_accuracy": 0.7518959946930408,
"num_tokens": 241782.0,
"step": 18070
},
{
"epoch": 8.897969230769231,
"grad_norm": 0.8435314297676086,
"learning_rate": 8.820603495622912e-05,
"loss": 0.6705,
"mean_token_accuracy": 0.7846408020704985,
"num_tokens": 250884.0,
"step": 18080
},
{
"epoch": 8.902892307692309,
"grad_norm": 0.6095793843269348,
"learning_rate": 8.81552730950749e-05,
"loss": 0.6823,
"mean_token_accuracy": 0.775208180397749,
"num_tokens": 259293.0,
"step": 18090
},
{
"epoch": 8.907815384615384,
"grad_norm": 0.27715322375297546,
"learning_rate": 8.810441690100575e-05,
"loss": 0.8093,
"mean_token_accuracy": 0.7308334667235613,
"num_tokens": 269988.0,
"step": 18100
},
{
"epoch": 8.912738461538462,
"grad_norm": 0.39033225178718567,
"learning_rate": 8.805346649975565e-05,
"loss": 0.7432,
"mean_token_accuracy": 0.7343447051942349,
"num_tokens": 279011.0,
"step": 18110
},
{
"epoch": 8.91766153846154,
"grad_norm": 0.3180443048477173,
"learning_rate": 8.800242201729141e-05,
"loss": 0.6186,
"mean_token_accuracy": 0.7984683159738779,
"num_tokens": 287209.0,
"step": 18120
},
{
"epoch": 8.922584615384615,
"grad_norm": 0.46220842003822327,
"learning_rate": 8.795128357981253e-05,
"loss": 0.7108,
"mean_token_accuracy": 0.7846879895776511,
"num_tokens": 296052.0,
"step": 18130
},
{
"epoch": 8.927507692307692,
"grad_norm": 0.5963321328163147,
"learning_rate": 8.790005131375074e-05,
"loss": 0.6761,
"mean_token_accuracy": 0.7786654643714428,
"num_tokens": 304656.0,
"step": 18140
},
{
"epoch": 8.93243076923077,
"grad_norm": 1.1265039443969727,
"learning_rate": 8.784872534576978e-05,
"loss": 0.8073,
"mean_token_accuracy": 0.7344447121024131,
"num_tokens": 314434.0,
"step": 18150
},
{
"epoch": 8.937353846153846,
"grad_norm": 0.4689125120639801,
"learning_rate": 8.779730580276501e-05,
"loss": 0.7303,
"mean_token_accuracy": 0.7458708386868238,
"num_tokens": 322607.0,
"step": 18160
},
{
"epoch": 8.942276923076923,
"grad_norm": 0.2908729612827301,
"learning_rate": 8.774579281186319e-05,
"loss": 0.6007,
"mean_token_accuracy": 0.7946537002921105,
"num_tokens": 330677.0,
"step": 18170
},
{
"epoch": 8.9472,
"grad_norm": 0.3566810190677643,
"learning_rate": 8.76941865004221e-05,
"loss": 0.7285,
"mean_token_accuracy": 0.7698224943131209,
"num_tokens": 340641.0,
"step": 18180
},
{
"epoch": 8.952123076923076,
"grad_norm": 0.35960811376571655,
"learning_rate": 8.76424869960302e-05,
"loss": 0.5989,
"mean_token_accuracy": 0.7901786677539349,
"num_tokens": 348726.0,
"step": 18190
},
{
"epoch": 8.957046153846154,
"grad_norm": 0.38176608085632324,
"learning_rate": 8.75906944265064e-05,
"loss": 0.7193,
"mean_token_accuracy": 0.7635401219129563,
"num_tokens": 357104.0,
"step": 18200
},
{
"epoch": 8.961969230769231,
"grad_norm": 0.2969922423362732,
"learning_rate": 8.753880891989972e-05,
"loss": 0.6534,
"mean_token_accuracy": 0.7754518780857325,
"num_tokens": 8926.0,
"step": 18210
},
{
"epoch": 8.966892307692309,
"grad_norm": 0.4824042022228241,
"learning_rate": 8.748683060448886e-05,
"loss": 0.6409,
"mean_token_accuracy": 0.7802204493433237,
"num_tokens": 17538.0,
"step": 18220
},
{
"epoch": 8.971815384615384,
"grad_norm": 0.28852856159210205,
"learning_rate": 8.743475960878209e-05,
"loss": 0.7454,
"mean_token_accuracy": 0.7477544978260994,
"num_tokens": 26410.0,
"step": 18230
},
{
"epoch": 8.976738461538462,
"grad_norm": 0.32177209854125977,
"learning_rate": 8.738259606151672e-05,
"loss": 0.7582,
"mean_token_accuracy": 0.7423262905329466,
"num_tokens": 35455.0,
"step": 18240
},
{
"epoch": 8.98166153846154,
"grad_norm": 0.2885516285896301,
"learning_rate": 8.733034009165894e-05,
"loss": 0.7031,
"mean_token_accuracy": 0.7598079223185777,
"num_tokens": 45252.0,
"step": 18250
},
{
"epoch": 8.986584615384615,
"grad_norm": 0.2479109764099121,
"learning_rate": 8.727799182840344e-05,
"loss": 0.7091,
"mean_token_accuracy": 0.7633897583931685,
"num_tokens": 53904.0,
"step": 18260
},
{
"epoch": 8.991507692307692,
"grad_norm": 0.4614357650279999,
"learning_rate": 8.722555140117303e-05,
"loss": 0.6807,
"mean_token_accuracy": 0.7741472873836756,
"num_tokens": 62135.0,
"step": 18270
},
{
"epoch": 8.99643076923077,
"grad_norm": 0.24129466712474823,
"learning_rate": 8.717301893961844e-05,
"loss": 0.7596,
"mean_token_accuracy": 0.7397197656333446,
"num_tokens": 70828.0,
"step": 18280
},
{
"epoch": 9.001476923076924,
"grad_norm": 0.3193853497505188,
"learning_rate": 8.712039457361795e-05,
"loss": 0.8814,
"mean_token_accuracy": 0.7301918130095412,
"num_tokens": 80962.0,
"step": 18290
},
{
"epoch": 9.0064,
"grad_norm": 0.5235174894332886,
"learning_rate": 8.7067678433277e-05,
"loss": 0.6238,
"mean_token_accuracy": 0.7859724014997482,
"num_tokens": 88659.0,
"step": 18300
},
{
"epoch": 9.011323076923077,
"grad_norm": 0.6154448986053467,
"learning_rate": 8.701487064892797e-05,
"loss": 0.6343,
"mean_token_accuracy": 0.7893827341496944,
"num_tokens": 97611.0,
"step": 18310
},
{
"epoch": 9.016246153846154,
"grad_norm": 0.4159059226512909,
"learning_rate": 8.69619713511298e-05,
"loss": 0.7506,
"mean_token_accuracy": 0.73648741543293,
"num_tokens": 107105.0,
"step": 18320
},
{
"epoch": 9.02116923076923,
"grad_norm": 0.3411385118961334,
"learning_rate": 8.690898067066771e-05,
"loss": 0.7644,
"mean_token_accuracy": 0.7501700416207313,
"num_tokens": 116559.0,
"step": 18330
},
{
"epoch": 9.026092307692307,
"grad_norm": 0.36136379837989807,
"learning_rate": 8.68558987385528e-05,
"loss": 0.8201,
"mean_token_accuracy": 0.716056851670146,
"num_tokens": 125666.0,
"step": 18340
},
{
"epoch": 9.031015384615385,
"grad_norm": 0.5617722272872925,
"learning_rate": 8.680272568602181e-05,
"loss": 0.7773,
"mean_token_accuracy": 0.7351651962846517,
"num_tokens": 135346.0,
"step": 18350
},
{
"epoch": 9.035938461538462,
"grad_norm": 0.9331738948822021,
"learning_rate": 8.674946164453677e-05,
"loss": 0.6327,
"mean_token_accuracy": 0.7781557217240334,
"num_tokens": 144168.0,
"step": 18360
},
{
"epoch": 9.040861538461538,
"grad_norm": 0.7562224268913269,
"learning_rate": 8.669610674578463e-05,
"loss": 0.699,
"mean_token_accuracy": 0.7659288670867681,
"num_tokens": 153162.0,
"step": 18370
},
{
"epoch": 9.045784615384616,
"grad_norm": 0.2653519809246063,
"learning_rate": 8.664266112167702e-05,
"loss": 0.7789,
"mean_token_accuracy": 0.7399741619825363,
"num_tokens": 162494.0,
"step": 18380
},
{
"epoch": 9.050707692307693,
"grad_norm": 0.4635138213634491,
"learning_rate": 8.658912490434981e-05,
"loss": 0.7132,
"mean_token_accuracy": 0.7637255847454071,
"num_tokens": 171751.0,
"step": 18390
},
{
"epoch": 9.055630769230769,
"grad_norm": 1.0278699398040771,
"learning_rate": 8.653549822616289e-05,
"loss": 0.6387,
"mean_token_accuracy": 0.7829495001584291,
"num_tokens": 180110.0,
"step": 18400
},
{
"epoch": 9.060553846153846,
"grad_norm": 0.44372498989105225,
"learning_rate": 8.648178121969978e-05,
"loss": 0.6717,
"mean_token_accuracy": 0.7774417765438557,
"num_tokens": 188293.0,
"step": 18410
},
{
"epoch": 9.065476923076924,
"grad_norm": 0.5009580254554749,
"learning_rate": 8.642797401776739e-05,
"loss": 0.7577,
"mean_token_accuracy": 0.7463389489799738,
"num_tokens": 197442.0,
"step": 18420
},
{
"epoch": 9.0704,
"grad_norm": 0.3624105751514435,
"learning_rate": 8.63740767533955e-05,
"loss": 0.7365,
"mean_token_accuracy": 0.7567340433597565,
"num_tokens": 206170.0,
"step": 18430
},
{
"epoch": 9.075323076923077,
"grad_norm": 0.40718990564346313,
"learning_rate": 8.632008955983667e-05,
"loss": 0.7613,
"mean_token_accuracy": 0.7605375040322542,
"num_tokens": 215198.0,
"step": 18440
},
{
"epoch": 9.080246153846154,
"grad_norm": 0.8007605075836182,
"learning_rate": 8.626601257056573e-05,
"loss": 0.6795,
"mean_token_accuracy": 0.7678759694099426,
"num_tokens": 223760.0,
"step": 18450
},
{
"epoch": 9.08516923076923,
"grad_norm": 0.6228090524673462,
"learning_rate": 8.621184591927953e-05,
"loss": 0.7174,
"mean_token_accuracy": 0.7554727476090193,
"num_tokens": 232476.0,
"step": 18460
},
{
"epoch": 9.090092307692307,
"grad_norm": 0.36368465423583984,
"learning_rate": 8.61575897398966e-05,
"loss": 0.7264,
"mean_token_accuracy": 0.7487952623516321,
"num_tokens": 242567.0,
"step": 18470
},
{
"epoch": 9.095015384615385,
"grad_norm": 0.6309615969657898,
"learning_rate": 8.610324416655684e-05,
"loss": 0.6797,
"mean_token_accuracy": 0.7755587588995695,
"num_tokens": 251964.0,
"step": 18480
},
{
"epoch": 9.09993846153846,
"grad_norm": 0.5346474647521973,
"learning_rate": 8.604880933362113e-05,
"loss": 0.6778,
"mean_token_accuracy": 0.7865086987614631,
"num_tokens": 260748.0,
"step": 18490
},
{
"epoch": 9.104861538461538,
"grad_norm": 0.7138431668281555,
"learning_rate": 8.599428537567101e-05,
"loss": 0.681,
"mean_token_accuracy": 0.7724569093436002,
"num_tokens": 269283.0,
"step": 18500
},
{
"epoch": 9.109784615384616,
"grad_norm": 0.2846992611885071,
"learning_rate": 8.593967242750843e-05,
"loss": 0.7066,
"mean_token_accuracy": 0.7593025963753461,
"num_tokens": 278494.0,
"step": 18510
},
{
"epoch": 9.114707692307693,
"grad_norm": 0.2537672817707062,
"learning_rate": 8.588497062415528e-05,
"loss": 0.7057,
"mean_token_accuracy": 0.7501687645912171,
"num_tokens": 288579.0,
"step": 18520
},
{
"epoch": 9.119630769230769,
"grad_norm": 0.3954591453075409,
"learning_rate": 8.583018010085321e-05,
"loss": 0.7496,
"mean_token_accuracy": 0.7489098712801934,
"num_tokens": 298019.0,
"step": 18530
},
{
"epoch": 9.124553846153846,
"grad_norm": 1.1767851114273071,
"learning_rate": 8.577530099306317e-05,
"loss": 0.6797,
"mean_token_accuracy": 0.7796317916363478,
"num_tokens": 307575.0,
"step": 18540
},
{
"epoch": 9.129476923076924,
"grad_norm": 0.7717283964157104,
"learning_rate": 8.57203334364651e-05,
"loss": 0.7018,
"mean_token_accuracy": 0.763766011595726,
"num_tokens": 316091.0,
"step": 18550
},
{
"epoch": 9.1344,
"grad_norm": 0.366485595703125,
"learning_rate": 8.566527756695766e-05,
"loss": 0.6554,
"mean_token_accuracy": 0.7748038172721863,
"num_tokens": 324292.0,
"step": 18560
},
{
"epoch": 9.139323076923077,
"grad_norm": 0.44988542795181274,
"learning_rate": 8.561013352065783e-05,
"loss": 0.7434,
"mean_token_accuracy": 0.7497165717184544,
"num_tokens": 332960.0,
"step": 18570
},
{
"epoch": 9.144246153846154,
"grad_norm": 0.27599868178367615,
"learning_rate": 8.555490143390062e-05,
"loss": 0.6943,
"mean_token_accuracy": 0.7611289013177156,
"num_tokens": 341446.0,
"step": 18580
},
{
"epoch": 9.14916923076923,
"grad_norm": 0.29391446709632874,
"learning_rate": 8.549958144323862e-05,
"loss": 0.6971,
"mean_token_accuracy": 0.7690398130565882,
"num_tokens": 349712.0,
"step": 18590
},
{
"epoch": 9.154092307692308,
"grad_norm": 0.30475255846977234,
"learning_rate": 8.544417368544189e-05,
"loss": 0.7287,
"mean_token_accuracy": 0.7482383538037538,
"num_tokens": 359021.0,
"step": 18600
},
{
"epoch": 9.158523076923077,
"grad_norm": 0.5902156829833984,
"learning_rate": 8.538867829749734e-05,
"loss": 0.7084,
"mean_token_accuracy": 0.7611913044005633,
"num_tokens": 9586.0,
"step": 18610
},
{
"epoch": 9.163446153846154,
"grad_norm": 0.38383767008781433,
"learning_rate": 8.533309541660863e-05,
"loss": 0.7506,
"mean_token_accuracy": 0.7360954392701388,
"num_tokens": 18015.0,
"step": 18620
},
{
"epoch": 9.168369230769231,
"grad_norm": 0.27133414149284363,
"learning_rate": 8.527742518019567e-05,
"loss": 0.6913,
"mean_token_accuracy": 0.7697105508297681,
"num_tokens": 27033.0,
"step": 18630
},
{
"epoch": 9.173292307692307,
"grad_norm": 0.3312305510044098,
"learning_rate": 8.52216677258944e-05,
"loss": 0.7035,
"mean_token_accuracy": 0.7611101619899273,
"num_tokens": 36045.0,
"step": 18640
},
{
"epoch": 9.178215384615385,
"grad_norm": 0.7461434602737427,
"learning_rate": 8.516582319155633e-05,
"loss": 0.7247,
"mean_token_accuracy": 0.7545670151710511,
"num_tokens": 44409.0,
"step": 18650
},
{
"epoch": 9.183138461538462,
"grad_norm": 0.4726799726486206,
"learning_rate": 8.51098917152483e-05,
"loss": 0.7093,
"mean_token_accuracy": 0.7689370591193437,
"num_tokens": 53497.0,
"step": 18660
},
{
"epoch": 9.188061538461538,
"grad_norm": 0.26350924372673035,
"learning_rate": 8.505387343525209e-05,
"loss": 0.7406,
"mean_token_accuracy": 0.766264171525836,
"num_tokens": 61939.0,
"step": 18670
},
{
"epoch": 9.192984615384615,
"grad_norm": 0.20984847843647003,
"learning_rate": 8.49977684900641e-05,
"loss": 0.6724,
"mean_token_accuracy": 0.7731727968901396,
"num_tokens": 70391.0,
"step": 18680
},
{
"epoch": 9.197907692307693,
"grad_norm": 0.2626084089279175,
"learning_rate": 8.4941577018395e-05,
"loss": 0.6348,
"mean_token_accuracy": 0.7817917808890342,
"num_tokens": 78312.0,
"step": 18690
},
{
"epoch": 9.202830769230768,
"grad_norm": 0.29489243030548096,
"learning_rate": 8.488529915916936e-05,
"loss": 0.746,
"mean_token_accuracy": 0.7493366193026304,
"num_tokens": 87869.0,
"step": 18700
},
{
"epoch": 9.207753846153846,
"grad_norm": 0.4739731550216675,
"learning_rate": 8.482893505152533e-05,
"loss": 0.7485,
"mean_token_accuracy": 0.751647999510169,
"num_tokens": 96885.0,
"step": 18710
},
{
"epoch": 9.212676923076923,
"grad_norm": 0.8138965368270874,
"learning_rate": 8.47724848348143e-05,
"loss": 0.6989,
"mean_token_accuracy": 0.7752762287855148,
"num_tokens": 106453.0,
"step": 18720
},
{
"epoch": 9.2176,
"grad_norm": 0.3375241160392761,
"learning_rate": 8.471594864860058e-05,
"loss": 0.7631,
"mean_token_accuracy": 0.7568930108100176,
"num_tokens": 116549.0,
"step": 18730
},
{
"epoch": 9.222523076923077,
"grad_norm": 0.2639356851577759,
"learning_rate": 8.4659326632661e-05,
"loss": 0.6026,
"mean_token_accuracy": 0.7832478541880846,
"num_tokens": 124510.0,
"step": 18740
},
{
"epoch": 9.227446153846154,
"grad_norm": 0.915088951587677,
"learning_rate": 8.460261892698457e-05,
"loss": 0.6784,
"mean_token_accuracy": 0.772139797359705,
"num_tokens": 132722.0,
"step": 18750
},
{
"epoch": 9.232369230769232,
"grad_norm": 0.45034798979759216,
"learning_rate": 8.454582567177223e-05,
"loss": 0.5967,
"mean_token_accuracy": 0.8007356438785791,
"num_tokens": 141020.0,
"step": 18760
},
{
"epoch": 9.237292307692307,
"grad_norm": 0.23716862499713898,
"learning_rate": 8.44889470074363e-05,
"loss": 0.6551,
"mean_token_accuracy": 0.7765100870281458,
"num_tokens": 149320.0,
"step": 18770
},
{
"epoch": 9.242215384615385,
"grad_norm": 0.2853967845439911,
"learning_rate": 8.443198307460041e-05,
"loss": 0.7346,
"mean_token_accuracy": 0.7537438083440066,
"num_tokens": 158038.0,
"step": 18780
},
{
"epoch": 9.247138461538462,
"grad_norm": 0.40558820962905884,
"learning_rate": 8.437493401409888e-05,
"loss": 0.6459,
"mean_token_accuracy": 0.7761574640870095,
"num_tokens": 166488.0,
"step": 18790
},
{
"epoch": 9.252061538461538,
"grad_norm": 0.3173389434814453,
"learning_rate": 8.431779996697656e-05,
"loss": 0.6979,
"mean_token_accuracy": 0.7614830315113068,
"num_tokens": 175398.0,
"step": 18800
},
{
"epoch": 9.256984615384615,
"grad_norm": 0.31110990047454834,
"learning_rate": 8.426058107448841e-05,
"loss": 0.7819,
"mean_token_accuracy": 0.7374692268669605,
"num_tokens": 184697.0,
"step": 18810
},
{
"epoch": 9.261907692307693,
"grad_norm": 0.23878473043441772,
"learning_rate": 8.420327747809913e-05,
"loss": 0.732,
"mean_token_accuracy": 0.7535562068223953,
"num_tokens": 193847.0,
"step": 18820
},
{
"epoch": 9.266830769230769,
"grad_norm": 0.28896352648735046,
"learning_rate": 8.414588931948287e-05,
"loss": 0.6439,
"mean_token_accuracy": 0.7868727888911963,
"num_tokens": 202991.0,
"step": 18830
},
{
"epoch": 9.271753846153846,
"grad_norm": 0.43781089782714844,
"learning_rate": 8.408841674052284e-05,
"loss": 0.8251,
"mean_token_accuracy": 0.7127380024641752,
"num_tokens": 212459.0,
"step": 18840
},
{
"epoch": 9.276676923076923,
"grad_norm": 0.3533414900302887,
"learning_rate": 8.403085988331092e-05,
"loss": 0.7372,
"mean_token_accuracy": 0.7454831000417471,
"num_tokens": 221274.0,
"step": 18850
},
{
"epoch": 9.2816,
"grad_norm": 0.35645604133605957,
"learning_rate": 8.397321889014743e-05,
"loss": 0.7142,
"mean_token_accuracy": 0.7632349513471126,
"num_tokens": 229612.0,
"step": 18860
},
{
"epoch": 9.286523076923077,
"grad_norm": 0.4839099645614624,
"learning_rate": 8.391549390354061e-05,
"loss": 0.685,
"mean_token_accuracy": 0.7794561486691236,
"num_tokens": 239028.0,
"step": 18870
},
{
"epoch": 9.291446153846154,
"grad_norm": 0.8532965183258057,
"learning_rate": 8.385768506620649e-05,
"loss": 0.7402,
"mean_token_accuracy": 0.7652015954256057,
"num_tokens": 248482.0,
"step": 18880
},
{
"epoch": 9.296369230769232,
"grad_norm": 0.2782347798347473,
"learning_rate": 8.379979252106829e-05,
"loss": 0.6769,
"mean_token_accuracy": 0.7735106501728296,
"num_tokens": 256626.0,
"step": 18890
},
{
"epoch": 9.301292307692307,
"grad_norm": 0.2272525280714035,
"learning_rate": 8.374181641125622e-05,
"loss": 0.7279,
"mean_token_accuracy": 0.7602387875318527,
"num_tokens": 265897.0,
"step": 18900
},
{
"epoch": 9.306215384615385,
"grad_norm": 0.3278202414512634,
"learning_rate": 8.368375688010712e-05,
"loss": 0.7268,
"mean_token_accuracy": 0.7507894467562437,
"num_tokens": 275027.0,
"step": 18910
},
{
"epoch": 9.311138461538462,
"grad_norm": 0.26526904106140137,
"learning_rate": 8.362561407116405e-05,
"loss": 0.6761,
"mean_token_accuracy": 0.7650868054479361,
"num_tokens": 284258.0,
"step": 18920
},
{
"epoch": 9.316061538461538,
"grad_norm": 0.4439482092857361,
"learning_rate": 8.356738812817596e-05,
"loss": 0.7357,
"mean_token_accuracy": 0.7472224164754152,
"num_tokens": 293076.0,
"step": 18930
},
{
"epoch": 9.320984615384615,
"grad_norm": 0.27552875876426697,
"learning_rate": 8.350907919509734e-05,
"loss": 0.6793,
"mean_token_accuracy": 0.7668122231960297,
"num_tokens": 301359.0,
"step": 18940
},
{
"epoch": 9.325907692307693,
"grad_norm": 0.268410325050354,
"learning_rate": 8.345068741608786e-05,
"loss": 0.7784,
"mean_token_accuracy": 0.7451492633670569,
"num_tokens": 310342.0,
"step": 18950
},
{
"epoch": 9.330830769230769,
"grad_norm": 0.3308853209018707,
"learning_rate": 8.339221293551203e-05,
"loss": 0.6681,
"mean_token_accuracy": 0.7704876314848661,
"num_tokens": 318452.0,
"step": 18960
},
{
"epoch": 9.335753846153846,
"grad_norm": 0.25952640175819397,
"learning_rate": 8.33336558979388e-05,
"loss": 0.7042,
"mean_token_accuracy": 0.7535649377852678,
"num_tokens": 328195.0,
"step": 18970
},
{
"epoch": 9.340676923076924,
"grad_norm": 0.2396654337644577,
"learning_rate": 8.327501644814122e-05,
"loss": 0.6969,
"mean_token_accuracy": 0.7682115890085697,
"num_tokens": 337053.0,
"step": 18980
},
{
"epoch": 9.3456,
"grad_norm": 0.32446226477622986,
"learning_rate": 8.321629473109615e-05,
"loss": 0.6289,
"mean_token_accuracy": 0.7851255543529987,
"num_tokens": 345015.0,
"step": 18990
},
{
"epoch": 9.350523076923077,
"grad_norm": 0.2200402021408081,
"learning_rate": 8.315749089198378e-05,
"loss": 0.849,
"mean_token_accuracy": 0.714352885633707,
"num_tokens": 355063.0,
"step": 19000
},
{
"epoch": 9.355446153846154,
"grad_norm": 0.40607473254203796,
"learning_rate": 8.309860507618737e-05,
"loss": 0.7187,
"mean_token_accuracy": 0.7435123972594738,
"num_tokens": 364147.0,
"step": 19010
},
{
"epoch": 9.36036923076923,
"grad_norm": 0.2307461053133011,
"learning_rate": 8.303963742929284e-05,
"loss": 0.7594,
"mean_token_accuracy": 0.7431487880647183,
"num_tokens": 373084.0,
"step": 19020
},
{
"epoch": 9.365292307692307,
"grad_norm": 0.36096903681755066,
"learning_rate": 8.298058809708842e-05,
"loss": 0.8165,
"mean_token_accuracy": 0.7338841069489718,
"num_tokens": 383053.0,
"step": 19030
},
{
"epoch": 9.370215384615385,
"grad_norm": 0.4592624604701996,
"learning_rate": 8.292145722556431e-05,
"loss": 0.6305,
"mean_token_accuracy": 0.7788219083100557,
"num_tokens": 391397.0,
"step": 19040
},
{
"epoch": 9.375138461538462,
"grad_norm": 0.24138374626636505,
"learning_rate": 8.286224496091228e-05,
"loss": 0.7563,
"mean_token_accuracy": 0.7563010204583407,
"num_tokens": 401057.0,
"step": 19050
},
{
"epoch": 9.380061538461538,
"grad_norm": 1.3317973613739014,
"learning_rate": 8.280295144952536e-05,
"loss": 0.874,
"mean_token_accuracy": 0.7216722797602415,
"num_tokens": 411806.0,
"step": 19060
},
{
"epoch": 9.384984615384615,
"grad_norm": 0.24880658090114594,
"learning_rate": 8.274357683799744e-05,
"loss": 0.6631,
"mean_token_accuracy": 0.7717092610895634,
"num_tokens": 420346.0,
"step": 19070
},
{
"epoch": 9.389907692307693,
"grad_norm": 0.28920841217041016,
"learning_rate": 8.268412127312293e-05,
"loss": 0.7502,
"mean_token_accuracy": 0.7458052407950163,
"num_tokens": 429329.0,
"step": 19080
},
{
"epoch": 9.394830769230769,
"grad_norm": 0.8361232280731201,
"learning_rate": 8.262458490189633e-05,
"loss": 0.7201,
"mean_token_accuracy": 0.7693693403154611,
"num_tokens": 438637.0,
"step": 19090
},
{
"epoch": 9.399753846153846,
"grad_norm": 1.081518292427063,
"learning_rate": 8.256496787151197e-05,
"loss": 0.73,
"mean_token_accuracy": 0.758531778678298,
"num_tokens": 447629.0,
"step": 19100
},
{
"epoch": 9.404676923076924,
"grad_norm": 0.3594396412372589,
"learning_rate": 8.250527032936359e-05,
"loss": 0.6957,
"mean_token_accuracy": 0.7701297465711832,
"num_tokens": 455960.0,
"step": 19110
},
{
"epoch": 9.4096,
"grad_norm": 0.2965713143348694,
"learning_rate": 8.244549242304399e-05,
"loss": 0.6313,
"mean_token_accuracy": 0.7778208505362272,
"num_tokens": 464243.0,
"step": 19120
},
{
"epoch": 9.414523076923077,
"grad_norm": 0.44778725504875183,
"learning_rate": 8.238563430034463e-05,
"loss": 0.6234,
"mean_token_accuracy": 0.7768757071346044,
"num_tokens": 472177.0,
"step": 19130
},
{
"epoch": 9.419446153846154,
"grad_norm": 0.4347788393497467,
"learning_rate": 8.232569610925533e-05,
"loss": 0.7254,
"mean_token_accuracy": 0.7469818860292434,
"num_tokens": 481557.0,
"step": 19140
},
{
"epoch": 9.42436923076923,
"grad_norm": 0.2500903010368347,
"learning_rate": 8.226567799796383e-05,
"loss": 0.782,
"mean_token_accuracy": 0.7323465205729007,
"num_tokens": 491491.0,
"step": 19150
},
{
"epoch": 9.429292307692307,
"grad_norm": 0.2921452820301056,
"learning_rate": 8.220558011485546e-05,
"loss": 0.7998,
"mean_token_accuracy": 0.7310435988008976,
"num_tokens": 501074.0,
"step": 19160
},
{
"epoch": 9.434215384615385,
"grad_norm": 0.2585694491863251,
"learning_rate": 8.21454026085128e-05,
"loss": 0.7696,
"mean_token_accuracy": 0.7234194416552782,
"num_tokens": 509884.0,
"step": 19170
},
{
"epoch": 9.439138461538462,
"grad_norm": 0.24239717423915863,
"learning_rate": 8.208514562771532e-05,
"loss": 0.671,
"mean_token_accuracy": 0.7750304438173771,
"num_tokens": 518146.0,
"step": 19180
},
{
"epoch": 9.444061538461538,
"grad_norm": 0.21721267700195312,
"learning_rate": 8.202480932143887e-05,
"loss": 0.6366,
"mean_token_accuracy": 0.7861224085092544,
"num_tokens": 526055.0,
"step": 19190
},
{
"epoch": 9.448984615384616,
"grad_norm": 0.8764368891716003,
"learning_rate": 8.19643938388555e-05,
"loss": 0.6603,
"mean_token_accuracy": 0.7765530787408352,
"num_tokens": 534409.0,
"step": 19200
},
{
"epoch": 9.453907692307693,
"grad_norm": 0.28634384274482727,
"learning_rate": 8.190389932933301e-05,
"loss": 0.6941,
"mean_token_accuracy": 0.7651392992585897,
"num_tokens": 543311.0,
"step": 19210
},
{
"epoch": 9.458830769230769,
"grad_norm": 0.30995234847068787,
"learning_rate": 8.184332594243455e-05,
"loss": 0.7696,
"mean_token_accuracy": 0.7483407512307167,
"num_tokens": 552293.0,
"step": 19220
},
{
"epoch": 9.463753846153846,
"grad_norm": 0.26820695400238037,
"learning_rate": 8.17826738279183e-05,
"loss": 0.7136,
"mean_token_accuracy": 0.7572588924318552,
"num_tokens": 560875.0,
"step": 19230
},
{
"epoch": 9.468676923076924,
"grad_norm": 0.3167503774166107,
"learning_rate": 8.172194313573711e-05,
"loss": 0.6687,
"mean_token_accuracy": 0.78301134519279,
"num_tokens": 570337.0,
"step": 19240
},
{
"epoch": 9.4736,
"grad_norm": 0.38579657673835754,
"learning_rate": 8.166113401603802e-05,
"loss": 0.6541,
"mean_token_accuracy": 0.7858757961541414,
"num_tokens": 578355.0,
"step": 19250
},
{
"epoch": 9.478523076923077,
"grad_norm": 0.2878792881965637,
"learning_rate": 8.160024661916204e-05,
"loss": 0.836,
"mean_token_accuracy": 0.7330463856458664,
"num_tokens": 588762.0,
"step": 19260
},
{
"epoch": 9.483446153846154,
"grad_norm": 0.3626146614551544,
"learning_rate": 8.153928109564369e-05,
"loss": 0.8072,
"mean_token_accuracy": 0.7299764085561037,
"num_tokens": 598467.0,
"step": 19270
},
{
"epoch": 9.48836923076923,
"grad_norm": 0.2230752855539322,
"learning_rate": 8.147823759621063e-05,
"loss": 0.6656,
"mean_token_accuracy": 0.7746995214372874,
"num_tokens": 607228.0,
"step": 19280
},
{
"epoch": 9.493292307692307,
"grad_norm": 0.35763120651245117,
"learning_rate": 8.141711627178335e-05,
"loss": 0.7943,
"mean_token_accuracy": 0.7384316265583039,
"num_tokens": 616620.0,
"step": 19290
},
{
"epoch": 9.498215384615385,
"grad_norm": 0.31609174609184265,
"learning_rate": 8.135591727347469e-05,
"loss": 0.7832,
"mean_token_accuracy": 0.731552030518651,
"num_tokens": 627052.0,
"step": 19300
},
{
"epoch": 9.503138461538462,
"grad_norm": 0.278390496969223,
"learning_rate": 8.129464075258956e-05,
"loss": 0.7252,
"mean_token_accuracy": 0.7618256121873855,
"num_tokens": 636036.0,
"step": 19310
},
{
"epoch": 9.508061538461538,
"grad_norm": 0.3673849105834961,
"learning_rate": 8.123328686062453e-05,
"loss": 0.6438,
"mean_token_accuracy": 0.7896918896585703,
"num_tokens": 645343.0,
"step": 19320
},
{
"epoch": 9.512984615384616,
"grad_norm": 0.33690834045410156,
"learning_rate": 8.117185574926744e-05,
"loss": 0.8169,
"mean_token_accuracy": 0.7260203436017036,
"num_tokens": 655914.0,
"step": 19330
},
{
"epoch": 9.517907692307693,
"grad_norm": 0.38085484504699707,
"learning_rate": 8.111034757039707e-05,
"loss": 0.7446,
"mean_token_accuracy": 0.7486122488975525,
"num_tokens": 665555.0,
"step": 19340
},
{
"epoch": 9.522830769230769,
"grad_norm": 0.8052934408187866,
"learning_rate": 8.10487624760827e-05,
"loss": 0.7267,
"mean_token_accuracy": 0.7574852678924799,
"num_tokens": 674482.0,
"step": 19350
},
{
"epoch": 9.527753846153846,
"grad_norm": 0.25485706329345703,
"learning_rate": 8.098710061858381e-05,
"loss": 0.6928,
"mean_token_accuracy": 0.7583078496158123,
"num_tokens": 682777.0,
"step": 19360
},
{
"epoch": 9.532676923076924,
"grad_norm": 0.2488587498664856,
"learning_rate": 8.092536215034967e-05,
"loss": 0.7838,
"mean_token_accuracy": 0.7227123014628887,
"num_tokens": 692407.0,
"step": 19370
},
{
"epoch": 9.5376,
"grad_norm": 1.3520376682281494,
"learning_rate": 8.086354722401892e-05,
"loss": 0.7324,
"mean_token_accuracy": 0.7700716838240623,
"num_tokens": 701713.0,
"step": 19380
},
{
"epoch": 9.542523076923077,
"grad_norm": 0.278576135635376,
"learning_rate": 8.080165599241924e-05,
"loss": 0.7461,
"mean_token_accuracy": 0.755218057706952,
"num_tokens": 710344.0,
"step": 19390
},
{
"epoch": 9.547446153846154,
"grad_norm": 0.3666495680809021,
"learning_rate": 8.0739688608567e-05,
"loss": 0.672,
"mean_token_accuracy": 0.7804633747786284,
"num_tokens": 719045.0,
"step": 19400
}
],
"logging_steps": 10,
"max_steps": 20000,
"num_input_tokens_seen": 0,
"num_train_epochs": 10,
"save_steps": 200,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 4.126844244746281e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}