{
"best_global_step": 1038,
"best_metric": 0.5740059,
"best_model_checkpoint": "/mnt/gpfs/shenyujiong/output/qwen3-vl-8b-int-sft-merged-nv5592-third3000-full-3epoch/v0-20251226-140741/checkpoint-1038",
"epoch": 3.0,
"eval_steps": 500,
"global_step": 1038,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.002890173410404624,
"grad_norm": 6.073309605336921,
"learning_rate": 1.923076923076923e-08,
"loss": 0.8852723240852356,
"step": 1,
"token_acc": 0.7513407750453963
},
{
"epoch": 0.005780346820809248,
"grad_norm": 5.632770918536085,
"learning_rate": 3.846153846153846e-08,
"loss": 0.8229959607124329,
"step": 2,
"token_acc": 0.7648557050426209
},
{
"epoch": 0.008670520231213872,
"grad_norm": 5.550843708913173,
"learning_rate": 5.7692307692307695e-08,
"loss": 0.8395601511001587,
"step": 3,
"token_acc": 0.7611515500814708
},
{
"epoch": 0.011560693641618497,
"grad_norm": 5.463688271600264,
"learning_rate": 7.692307692307692e-08,
"loss": 0.8262450695037842,
"step": 4,
"token_acc": 0.7617775757231346
},
{
"epoch": 0.014450867052023121,
"grad_norm": 5.208733348546384,
"learning_rate": 9.615384615384616e-08,
"loss": 0.7870609760284424,
"step": 5,
"token_acc": 0.7738227378472486
},
{
"epoch": 0.017341040462427744,
"grad_norm": 6.094089600000965,
"learning_rate": 1.1538461538461539e-07,
"loss": 0.890167236328125,
"step": 6,
"token_acc": 0.7463134620800402
},
{
"epoch": 0.02023121387283237,
"grad_norm": 5.511558073866942,
"learning_rate": 1.346153846153846e-07,
"loss": 0.8200665712356567,
"step": 7,
"token_acc": 0.7655801718674399
},
{
"epoch": 0.023121387283236993,
"grad_norm": 5.840135867020467,
"learning_rate": 1.5384615384615385e-07,
"loss": 0.8561823964118958,
"step": 8,
"token_acc": 0.7551989061787877
},
{
"epoch": 0.02601156069364162,
"grad_norm": 4.93074237263625,
"learning_rate": 1.7307692307692305e-07,
"loss": 0.7908620834350586,
"step": 9,
"token_acc": 0.7736331966727492
},
{
"epoch": 0.028901734104046242,
"grad_norm": 5.513250434452228,
"learning_rate": 1.9230769230769231e-07,
"loss": 0.8536443710327148,
"step": 10,
"token_acc": 0.7537275655775426
},
{
"epoch": 0.031791907514450865,
"grad_norm": 5.6890026898261254,
"learning_rate": 2.1153846153846152e-07,
"loss": 0.8860396146774292,
"step": 11,
"token_acc": 0.7444433233394834
},
{
"epoch": 0.03468208092485549,
"grad_norm": 5.204460891865508,
"learning_rate": 2.3076923076923078e-07,
"loss": 0.8523805141448975,
"step": 12,
"token_acc": 0.7516541745600307
},
{
"epoch": 0.03757225433526012,
"grad_norm": 5.727537830602335,
"learning_rate": 2.5e-07,
"loss": 0.8715107440948486,
"step": 13,
"token_acc": 0.7483992966857977
},
{
"epoch": 0.04046242774566474,
"grad_norm": 5.573759954820184,
"learning_rate": 2.692307692307692e-07,
"loss": 0.8587294220924377,
"step": 14,
"token_acc": 0.752293881658215
},
{
"epoch": 0.04335260115606936,
"grad_norm": 5.626217493866761,
"learning_rate": 2.884615384615384e-07,
"loss": 0.8353704810142517,
"step": 15,
"token_acc": 0.7603716874100415
},
{
"epoch": 0.046242774566473986,
"grad_norm": 5.780174641621012,
"learning_rate": 3.076923076923077e-07,
"loss": 0.8726707100868225,
"step": 16,
"token_acc": 0.750940308255944
},
{
"epoch": 0.049132947976878616,
"grad_norm": 4.3328681597964875,
"learning_rate": 3.269230769230769e-07,
"loss": 0.718013346195221,
"step": 17,
"token_acc": 0.7931623195891079
},
{
"epoch": 0.05202312138728324,
"grad_norm": 5.47302287757926,
"learning_rate": 3.461538461538461e-07,
"loss": 0.8578764200210571,
"step": 18,
"token_acc": 0.7521628365412952
},
{
"epoch": 0.05491329479768786,
"grad_norm": 5.003969625540578,
"learning_rate": 3.6538461538461534e-07,
"loss": 0.8133180737495422,
"step": 19,
"token_acc": 0.7619723575896223
},
{
"epoch": 0.057803468208092484,
"grad_norm": 5.6946171227062115,
"learning_rate": 3.8461538461538463e-07,
"loss": 0.8691498041152954,
"step": 20,
"token_acc": 0.7492446732183174
},
{
"epoch": 0.06069364161849711,
"grad_norm": 5.520197357593707,
"learning_rate": 4.0384615384615386e-07,
"loss": 0.907565712928772,
"step": 21,
"token_acc": 0.739601049536876
},
{
"epoch": 0.06358381502890173,
"grad_norm": 4.583439446754697,
"learning_rate": 4.2307692307692304e-07,
"loss": 0.8114128708839417,
"step": 22,
"token_acc": 0.7639052404881551
},
{
"epoch": 0.06647398843930635,
"grad_norm": 4.920313367321747,
"learning_rate": 4.423076923076923e-07,
"loss": 0.8422179222106934,
"step": 23,
"token_acc": 0.7567072154640894
},
{
"epoch": 0.06936416184971098,
"grad_norm": 5.263032949222765,
"learning_rate": 4.6153846153846156e-07,
"loss": 0.8715439438819885,
"step": 24,
"token_acc": 0.7464963254144591
},
{
"epoch": 0.07225433526011561,
"grad_norm": 4.870068302475069,
"learning_rate": 4.807692307692307e-07,
"loss": 0.8316457271575928,
"step": 25,
"token_acc": 0.7582423573346417
},
{
"epoch": 0.07514450867052024,
"grad_norm": 4.199216776916685,
"learning_rate": 5e-07,
"loss": 0.7344825267791748,
"step": 26,
"token_acc": 0.7839733369517283
},
{
"epoch": 0.07803468208092486,
"grad_norm": 4.588333481721223,
"learning_rate": 5.192307692307692e-07,
"loss": 0.8012775182723999,
"step": 27,
"token_acc": 0.767028959599571
},
{
"epoch": 0.08092485549132948,
"grad_norm": 4.194674553902997,
"learning_rate": 5.384615384615384e-07,
"loss": 0.712963879108429,
"step": 28,
"token_acc": 0.7923740483107238
},
{
"epoch": 0.0838150289017341,
"grad_norm": 4.053747357354017,
"learning_rate": 5.576923076923077e-07,
"loss": 0.7496437430381775,
"step": 29,
"token_acc": 0.7814042116577906
},
{
"epoch": 0.08670520231213873,
"grad_norm": 3.718069447981091,
"learning_rate": 5.769230769230768e-07,
"loss": 0.7818017601966858,
"step": 30,
"token_acc": 0.7700440596977877
},
{
"epoch": 0.08959537572254335,
"grad_norm": 3.420080175405301,
"learning_rate": 5.961538461538461e-07,
"loss": 0.7861907482147217,
"step": 31,
"token_acc": 0.7627405151738911
},
{
"epoch": 0.09248554913294797,
"grad_norm": 2.639800184791621,
"learning_rate": 6.153846153846154e-07,
"loss": 0.6684123277664185,
"step": 32,
"token_acc": 0.7977043354655295
},
{
"epoch": 0.0953757225433526,
"grad_norm": 2.9502697501210413,
"learning_rate": 6.346153846153845e-07,
"loss": 0.7446445226669312,
"step": 33,
"token_acc": 0.771793289625916
},
{
"epoch": 0.09826589595375723,
"grad_norm": 2.8110101894954345,
"learning_rate": 6.538461538461538e-07,
"loss": 0.7382901906967163,
"step": 34,
"token_acc": 0.7770551133606955
},
{
"epoch": 0.10115606936416185,
"grad_norm": 2.9797000830123226,
"learning_rate": 6.730769230769231e-07,
"loss": 0.7384837865829468,
"step": 35,
"token_acc": 0.7742859974561853
},
{
"epoch": 0.10404624277456648,
"grad_norm": 2.7709890477908177,
"learning_rate": 6.923076923076922e-07,
"loss": 0.7289628982543945,
"step": 36,
"token_acc": 0.7765123239561783
},
{
"epoch": 0.1069364161849711,
"grad_norm": 2.59015685758215,
"learning_rate": 7.115384615384616e-07,
"loss": 0.7290064096450806,
"step": 37,
"token_acc": 0.7784733624454149
},
{
"epoch": 0.10982658959537572,
"grad_norm": 2.8646835764259233,
"learning_rate": 7.307692307692307e-07,
"loss": 0.7594764828681946,
"step": 38,
"token_acc": 0.7671359481427088
},
{
"epoch": 0.11271676300578035,
"grad_norm": 2.349168631790223,
"learning_rate": 7.5e-07,
"loss": 0.72218257188797,
"step": 39,
"token_acc": 0.7804759091596026
},
{
"epoch": 0.11560693641618497,
"grad_norm": 2.511985129172397,
"learning_rate": 7.692307692307693e-07,
"loss": 0.7277328968048096,
"step": 40,
"token_acc": 0.7780124249072232
},
{
"epoch": 0.11849710982658959,
"grad_norm": 2.5792884120122235,
"learning_rate": 7.884615384615384e-07,
"loss": 0.7460165619850159,
"step": 41,
"token_acc": 0.7733394615523893
},
{
"epoch": 0.12138728323699421,
"grad_norm": 1.5451971118538999,
"learning_rate": 8.076923076923077e-07,
"loss": 0.7386133670806885,
"step": 42,
"token_acc": 0.772020024353944
},
{
"epoch": 0.12427745664739884,
"grad_norm": 1.3982437840218045,
"learning_rate": 8.269230769230768e-07,
"loss": 0.7192668914794922,
"step": 43,
"token_acc": 0.7744656594039339
},
{
"epoch": 0.12716763005780346,
"grad_norm": 1.4772019806138394,
"learning_rate": 8.461538461538461e-07,
"loss": 0.6977580189704895,
"step": 44,
"token_acc": 0.7803848372212253
},
{
"epoch": 0.13005780346820808,
"grad_norm": 1.426662829341362,
"learning_rate": 8.653846153846154e-07,
"loss": 0.6999402642250061,
"step": 45,
"token_acc": 0.778780228821366
},
{
"epoch": 0.1329479768786127,
"grad_norm": 1.4168889938692493,
"learning_rate": 8.846153846153846e-07,
"loss": 0.7392410635948181,
"step": 46,
"token_acc": 0.7691270558007607
},
{
"epoch": 0.13583815028901733,
"grad_norm": 1.4711907038839338,
"learning_rate": 9.038461538461538e-07,
"loss": 0.7351399660110474,
"step": 47,
"token_acc": 0.7670848343481196
},
{
"epoch": 0.13872832369942195,
"grad_norm": 1.2965845227191142,
"learning_rate": 9.230769230769231e-07,
"loss": 0.7003874778747559,
"step": 48,
"token_acc": 0.7787950748052811
},
{
"epoch": 0.1416184971098266,
"grad_norm": 1.292104981035939,
"learning_rate": 9.423076923076923e-07,
"loss": 0.7326341867446899,
"step": 49,
"token_acc": 0.7685059219819624
},
{
"epoch": 0.14450867052023122,
"grad_norm": 1.2291132980421766,
"learning_rate": 9.615384615384615e-07,
"loss": 0.6871765851974487,
"step": 50,
"token_acc": 0.7841781074662453
},
{
"epoch": 0.14739884393063585,
"grad_norm": 1.123170268506369,
"learning_rate": 9.807692307692306e-07,
"loss": 0.6960352659225464,
"step": 51,
"token_acc": 0.7801758979708864
},
{
"epoch": 0.15028901734104047,
"grad_norm": 1.00691295990528,
"learning_rate": 1e-06,
"loss": 0.6956222653388977,
"step": 52,
"token_acc": 0.7829201628190622
},
{
"epoch": 0.1531791907514451,
"grad_norm": 0.9370942178938112,
"learning_rate": 9.999974620354198e-07,
"loss": 0.6958713531494141,
"step": 53,
"token_acc": 0.7809317408675194
},
{
"epoch": 0.15606936416184972,
"grad_norm": 1.1057401423493767,
"learning_rate": 9.999898481674446e-07,
"loss": 0.7062472105026245,
"step": 54,
"token_acc": 0.7756643140884724
},
{
"epoch": 0.15895953757225434,
"grad_norm": 0.8619542832761329,
"learning_rate": 9.999771584733693e-07,
"loss": 0.6577130556106567,
"step": 55,
"token_acc": 0.7922278867707445
},
{
"epoch": 0.16184971098265896,
"grad_norm": 0.9166807116221914,
"learning_rate": 9.999593930820181e-07,
"loss": 0.6945655941963196,
"step": 56,
"token_acc": 0.77725851438142
},
{
"epoch": 0.16473988439306358,
"grad_norm": 0.939862155697591,
"learning_rate": 9.999365521737421e-07,
"loss": 0.6921431422233582,
"step": 57,
"token_acc": 0.7773106126184057
},
{
"epoch": 0.1676300578034682,
"grad_norm": 0.9756834016584089,
"learning_rate": 9.999086359804195e-07,
"loss": 0.7256878018379211,
"step": 58,
"token_acc": 0.7686141412007078
},
{
"epoch": 0.17052023121387283,
"grad_norm": 0.8557348808489443,
"learning_rate": 9.99875644785451e-07,
"loss": 0.6813135147094727,
"step": 59,
"token_acc": 0.7843321803650282
},
{
"epoch": 0.17341040462427745,
"grad_norm": 0.8266352802865822,
"learning_rate": 9.998375789237592e-07,
"loss": 0.6513127088546753,
"step": 60,
"token_acc": 0.7914724403689247
},
{
"epoch": 0.17630057803468208,
"grad_norm": 0.8497866635296994,
"learning_rate": 9.99794438781783e-07,
"loss": 0.6605720520019531,
"step": 61,
"token_acc": 0.78915683493063
},
{
"epoch": 0.1791907514450867,
"grad_norm": 0.8351298607584619,
"learning_rate": 9.99746224797475e-07,
"loss": 0.6266233325004578,
"step": 62,
"token_acc": 0.7963586246917163
},
{
"epoch": 0.18208092485549132,
"grad_norm": 0.9019491097127296,
"learning_rate": 9.996929374602968e-07,
"loss": 0.6673212647438049,
"step": 63,
"token_acc": 0.7844323603274962
},
{
"epoch": 0.18497109826589594,
"grad_norm": 0.8813921264261143,
"learning_rate": 9.996345773112138e-07,
"loss": 0.7036587595939636,
"step": 64,
"token_acc": 0.7740703997187025
},
{
"epoch": 0.18786127167630057,
"grad_norm": 0.8869002415681166,
"learning_rate": 9.995711449426901e-07,
"loss": 0.6981368064880371,
"step": 65,
"token_acc": 0.7753412151954072
},
{
"epoch": 0.1907514450867052,
"grad_norm": 0.7752119383387671,
"learning_rate": 9.99502640998682e-07,
"loss": 0.6600744724273682,
"step": 66,
"token_acc": 0.788013646851561
},
{
"epoch": 0.1936416184971098,
"grad_norm": 0.8616071421748983,
"learning_rate": 9.99429066174632e-07,
"loss": 0.6547806262969971,
"step": 67,
"token_acc": 0.7894853017554794
},
{
"epoch": 0.19653179190751446,
"grad_norm": 0.8018562843868764,
"learning_rate": 9.993504212174613e-07,
"loss": 0.6278072595596313,
"step": 68,
"token_acc": 0.7972202882855006
},
{
"epoch": 0.1994219653179191,
"grad_norm": 0.7473736558335493,
"learning_rate": 9.992667069255618e-07,
"loss": 0.6237850785255432,
"step": 69,
"token_acc": 0.7982735792533637
},
{
"epoch": 0.2023121387283237,
"grad_norm": 0.6999587458869299,
"learning_rate": 9.991779241487899e-07,
"loss": 0.6401976346969604,
"step": 70,
"token_acc": 0.7928364264997928
},
{
"epoch": 0.20520231213872833,
"grad_norm": 0.6924984079683673,
"learning_rate": 9.990840737884554e-07,
"loss": 0.6805769205093384,
"step": 71,
"token_acc": 0.7801177818172763
},
{
"epoch": 0.20809248554913296,
"grad_norm": 0.7111004746445246,
"learning_rate": 9.989851567973138e-07,
"loss": 0.697790801525116,
"step": 72,
"token_acc": 0.7760267430754537
},
{
"epoch": 0.21098265895953758,
"grad_norm": 0.6869871346194354,
"learning_rate": 9.988811741795566e-07,
"loss": 0.6186888217926025,
"step": 73,
"token_acc": 0.7994626021789282
},
{
"epoch": 0.2138728323699422,
"grad_norm": 0.6177183453130074,
"learning_rate": 9.987721269908005e-07,
"loss": 0.5868158340454102,
"step": 74,
"token_acc": 0.8114196656276566
},
{
"epoch": 0.21676300578034682,
"grad_norm": 0.6307801092890282,
"learning_rate": 9.98658016338077e-07,
"loss": 0.6723257303237915,
"step": 75,
"token_acc": 0.7827200467097494
},
{
"epoch": 0.21965317919075145,
"grad_norm": 0.6150476355618669,
"learning_rate": 9.985388433798215e-07,
"loss": 0.6530448198318481,
"step": 76,
"token_acc": 0.7907922080887895
},
{
"epoch": 0.22254335260115607,
"grad_norm": 0.5940300278296939,
"learning_rate": 9.984146093258608e-07,
"loss": 0.6855973601341248,
"step": 77,
"token_acc": 0.7784828714678302
},
{
"epoch": 0.2254335260115607,
"grad_norm": 0.9497443806056196,
"learning_rate": 9.982853154374013e-07,
"loss": 0.6745576858520508,
"step": 78,
"token_acc": 0.7854156213413614
},
{
"epoch": 0.22832369942196531,
"grad_norm": 0.6791196750467849,
"learning_rate": 9.981509630270167e-07,
"loss": 0.6383039951324463,
"step": 79,
"token_acc": 0.7940166430627679
},
{
"epoch": 0.23121387283236994,
"grad_norm": 0.6194193913683183,
"learning_rate": 9.980115534586333e-07,
"loss": 0.6046701669692993,
"step": 80,
"token_acc": 0.8031263032947291
},
{
"epoch": 0.23410404624277456,
"grad_norm": 0.584941512318404,
"learning_rate": 9.978670881475172e-07,
"loss": 0.6113057136535645,
"step": 81,
"token_acc": 0.8002890249696458
},
{
"epoch": 0.23699421965317918,
"grad_norm": 0.576070429321087,
"learning_rate": 9.9771756856026e-07,
"loss": 0.6508547067642212,
"step": 82,
"token_acc": 0.7917054316809551
},
{
"epoch": 0.2398843930635838,
"grad_norm": 0.5782915674069733,
"learning_rate": 9.975629962147633e-07,
"loss": 0.6592724323272705,
"step": 83,
"token_acc": 0.7841273280945267
},
{
"epoch": 0.24277456647398843,
"grad_norm": 0.5894596908351907,
"learning_rate": 9.974033726802235e-07,
"loss": 0.5925013422966003,
"step": 84,
"token_acc": 0.8060771521769962
},
{
"epoch": 0.24566473988439305,
"grad_norm": 0.5279159216055382,
"learning_rate": 9.972386995771164e-07,
"loss": 0.6444322466850281,
"step": 85,
"token_acc": 0.7914691943127962
},
{
"epoch": 0.24855491329479767,
"grad_norm": 0.5809453095781784,
"learning_rate": 9.970689785771798e-07,
"loss": 0.6508707404136658,
"step": 86,
"token_acc": 0.7889469472867465
},
{
"epoch": 0.2514450867052023,
"grad_norm": 0.6715617527059413,
"learning_rate": 9.968942114033973e-07,
"loss": 0.5962953567504883,
"step": 87,
"token_acc": 0.8063397578524576
},
{
"epoch": 0.2543352601156069,
"grad_norm": 0.6155392081496504,
"learning_rate": 9.967143998299802e-07,
"loss": 0.6590582132339478,
"step": 88,
"token_acc": 0.786015653473848
},
{
"epoch": 0.25722543352601157,
"grad_norm": 0.6351340196244468,
"learning_rate": 9.965295456823507e-07,
"loss": 0.6178431510925293,
"step": 89,
"token_acc": 0.799615789600598
},
{
"epoch": 0.26011560693641617,
"grad_norm": 0.6389337976646079,
"learning_rate": 9.963396508371217e-07,
"loss": 0.6065088510513306,
"step": 90,
"token_acc": 0.8027006050850137
},
{
"epoch": 0.2630057803468208,
"grad_norm": 0.5682640638544528,
"learning_rate": 9.961447172220785e-07,
"loss": 0.6684330105781555,
"step": 91,
"token_acc": 0.7839487407338119
},
{
"epoch": 0.2658959537572254,
"grad_norm": 0.6029647051880634,
"learning_rate": 9.959447468161596e-07,
"loss": 0.6358112096786499,
"step": 92,
"token_acc": 0.7908192833685276
},
{
"epoch": 0.26878612716763006,
"grad_norm": 0.5632656008285092,
"learning_rate": 9.957397416494366e-07,
"loss": 0.6601473093032837,
"step": 93,
"token_acc": 0.7853722190438847
},
{
"epoch": 0.27167630057803466,
"grad_norm": 0.6013944385740286,
"learning_rate": 9.955297038030926e-07,
"loss": 0.668410062789917,
"step": 94,
"token_acc": 0.7828623747800797
},
{
"epoch": 0.2745664739884393,
"grad_norm": 0.5541440784608198,
"learning_rate": 9.95314635409402e-07,
"loss": 0.6117832660675049,
"step": 95,
"token_acc": 0.7995787198241185
},
{
"epoch": 0.2774566473988439,
"grad_norm": 0.6314740935897156,
"learning_rate": 9.95094538651709e-07,
"loss": 0.6261177062988281,
"step": 96,
"token_acc": 0.7962018726778723
},
{
"epoch": 0.28034682080924855,
"grad_norm": 0.7158918907846333,
"learning_rate": 9.948694157644042e-07,
"loss": 0.6556503772735596,
"step": 97,
"token_acc": 0.7869902468442614
},
{
"epoch": 0.2832369942196532,
"grad_norm": 0.5701552977234003,
"learning_rate": 9.946392690329036e-07,
"loss": 0.6187049746513367,
"step": 98,
"token_acc": 0.8010530865652874
},
{
"epoch": 0.2861271676300578,
"grad_norm": 0.5860362253461248,
"learning_rate": 9.944041007936244e-07,
"loss": 0.5410789847373962,
"step": 99,
"token_acc": 0.8207894360088595
},
{
"epoch": 0.28901734104046245,
"grad_norm": 0.6303808407906236,
"learning_rate": 9.941639134339606e-07,
"loss": 0.5768465399742126,
"step": 100,
"token_acc": 0.8087328873195813
},
{
"epoch": 0.29190751445086704,
"grad_norm": 0.616425173315349,
"learning_rate": 9.939187093922609e-07,
"loss": 0.6295806169509888,
"step": 101,
"token_acc": 0.7958193257384945
},
{
"epoch": 0.2947976878612717,
"grad_norm": 0.5753993917901922,
"learning_rate": 9.936684911578017e-07,
"loss": 0.5983704328536987,
"step": 102,
"token_acc": 0.8031383517086323
},
{
"epoch": 0.2976878612716763,
"grad_norm": 0.6140080800303133,
"learning_rate": 9.93413261270763e-07,
"loss": 0.5729444026947021,
"step": 103,
"token_acc": 0.816418031517547
},
{
"epoch": 0.30057803468208094,
"grad_norm": 0.5607455073068854,
"learning_rate": 9.931530223222026e-07,
"loss": 0.5967170596122742,
"step": 104,
"token_acc": 0.803475704051983
},
{
"epoch": 0.30346820809248554,
"grad_norm": 0.5675327028480304,
"learning_rate": 9.928877769540293e-07,
"loss": 0.6241474151611328,
"step": 105,
"token_acc": 0.7967706129971308
},
{
"epoch": 0.3063583815028902,
"grad_norm": 0.6046538978438704,
"learning_rate": 9.926175278589767e-07,
"loss": 0.6553393602371216,
"step": 106,
"token_acc": 0.7874527013411549
},
{
"epoch": 0.3092485549132948,
"grad_norm": 0.5734166676914433,
"learning_rate": 9.923422777805751e-07,
"loss": 0.6570492386817932,
"step": 107,
"token_acc": 0.7870601190355553
},
{
"epoch": 0.31213872832369943,
"grad_norm": 0.6001726322335739,
"learning_rate": 9.920620295131245e-07,
"loss": 0.6794227361679077,
"step": 108,
"token_acc": 0.7787853169709925
},
{
"epoch": 0.315028901734104,
"grad_norm": 0.6099760009068769,
"learning_rate": 9.917767859016654e-07,
"loss": 0.615708589553833,
"step": 109,
"token_acc": 0.7985643236886592
},
{
"epoch": 0.3179190751445087,
"grad_norm": 0.5778662206360861,
"learning_rate": 9.91486549841951e-07,
"loss": 0.5809392929077148,
"step": 110,
"token_acc": 0.8094654316503208
},
{
"epoch": 0.3208092485549133,
"grad_norm": 0.5704401870141648,
"learning_rate": 9.911913242804158e-07,
"loss": 0.6263046264648438,
"step": 111,
"token_acc": 0.7955055464485222
},
{
"epoch": 0.3236994219653179,
"grad_norm": 0.613652119648305,
"learning_rate": 9.908911122141486e-07,
"loss": 0.5810531377792358,
"step": 112,
"token_acc": 0.8122967000471536
},
{
"epoch": 0.3265895953757225,
"grad_norm": 0.5754148794590288,
"learning_rate": 9.905859166908594e-07,
"loss": 0.6450198888778687,
"step": 113,
"token_acc": 0.787714712471994
},
{
"epoch": 0.32947976878612717,
"grad_norm": 0.8102498152797749,
"learning_rate": 9.902757408088501e-07,
"loss": 0.6492223739624023,
"step": 114,
"token_acc": 0.7880358603802299
},
{
"epoch": 0.33236994219653176,
"grad_norm": 0.525946407195948,
"learning_rate": 9.899605877169824e-07,
"loss": 0.5984295606613159,
"step": 115,
"token_acc": 0.8024764689756009
},
{
"epoch": 0.3352601156069364,
"grad_norm": 0.5751169418426346,
"learning_rate": 9.896404606146455e-07,
"loss": 0.6295244097709656,
"step": 116,
"token_acc": 0.7922646493276646
},
{
"epoch": 0.33815028901734107,
"grad_norm": 0.5079153092397871,
"learning_rate": 9.893153627517248e-07,
"loss": 0.5976470112800598,
"step": 117,
"token_acc": 0.8038826857227929
},
{
"epoch": 0.34104046242774566,
"grad_norm": 0.5841459704013869,
"learning_rate": 9.889852974285672e-07,
"loss": 0.6472890973091125,
"step": 118,
"token_acc": 0.789158388689134
},
{
"epoch": 0.3439306358381503,
"grad_norm": 0.6150844233030651,
"learning_rate": 9.886502679959497e-07,
"loss": 0.5413444638252258,
"step": 119,
"token_acc": 0.8222654666342334
},
{
"epoch": 0.3468208092485549,
"grad_norm": 0.5935208615034318,
"learning_rate": 9.883102778550434e-07,
"loss": 0.663335919380188,
"step": 120,
"token_acc": 0.7862711064419373
},
{
"epoch": 0.34971098265895956,
"grad_norm": 0.6268736075123943,
"learning_rate": 9.879653304573797e-07,
"loss": 0.6072404384613037,
"step": 121,
"token_acc": 0.8010549723328334
},
{
"epoch": 0.35260115606936415,
"grad_norm": 0.5583642618257684,
"learning_rate": 9.876154293048163e-07,
"loss": 0.6144070029258728,
"step": 122,
"token_acc": 0.796381277924315
},
{
"epoch": 0.3554913294797688,
"grad_norm": 0.5410450297039057,
"learning_rate": 9.872605779494997e-07,
"loss": 0.5954463481903076,
"step": 123,
"token_acc": 0.8055216585201416
},
{
"epoch": 0.3583815028901734,
"grad_norm": 0.6425891449290073,
"learning_rate": 9.869007799938305e-07,
"loss": 0.6611199378967285,
"step": 124,
"token_acc": 0.786190934231093
},
{
"epoch": 0.36127167630057805,
"grad_norm": 0.5146021782369569,
"learning_rate": 9.865360390904269e-07,
"loss": 0.6081857085227966,
"step": 125,
"token_acc": 0.8017568952922327
},
{
"epoch": 0.36416184971098264,
"grad_norm": 0.5766433781688939,
"learning_rate": 9.86166358942087e-07,
"loss": 0.609286904335022,
"step": 126,
"token_acc": 0.8002619382070126
},
{
"epoch": 0.3670520231213873,
"grad_norm": 0.5450128204125277,
"learning_rate": 9.857917433017508e-07,
"loss": 0.5991868376731873,
"step": 127,
"token_acc": 0.8008499444919779
},
{
"epoch": 0.3699421965317919,
"grad_norm": 0.5810734133360594,
"learning_rate": 9.854121959724635e-07,
"loss": 0.607757568359375,
"step": 128,
"token_acc": 0.7998384333607254
},
{
"epoch": 0.37283236994219654,
"grad_norm": 0.5770182474218292,
"learning_rate": 9.85027720807336e-07,
"loss": 0.5918303728103638,
"step": 129,
"token_acc": 0.8040288846142103
},
{
"epoch": 0.37572254335260113,
"grad_norm": 0.5360179518405197,
"learning_rate": 9.846383217095051e-07,
"loss": 0.646679162979126,
"step": 130,
"token_acc": 0.7929178624953734
},
{
"epoch": 0.3786127167630058,
"grad_norm": 0.5278251178995469,
"learning_rate": 9.842440026320958e-07,
"loss": 0.6081724166870117,
"step": 131,
"token_acc": 0.7979095393804223
},
{
"epoch": 0.3815028901734104,
"grad_norm": 0.5857831669587502,
"learning_rate": 9.838447675781793e-07,
"loss": 0.5776185989379883,
"step": 132,
"token_acc": 0.8089180214756997
},
{
"epoch": 0.38439306358381503,
"grad_norm": 0.49786698791997097,
"learning_rate": 9.834406206007335e-07,
"loss": 0.6665687561035156,
"step": 133,
"token_acc": 0.7817376207568673
},
{
"epoch": 0.3872832369942196,
"grad_norm": 0.5272403389699103,
"learning_rate": 9.83031565802601e-07,
"loss": 0.607385516166687,
"step": 134,
"token_acc": 0.8027202321406094
},
{
"epoch": 0.3901734104046243,
"grad_norm": 0.5881996711071641,
"learning_rate": 9.826176073364482e-07,
"loss": 0.6304242014884949,
"step": 135,
"token_acc": 0.7967265117890893
},
{
"epoch": 0.3930635838150289,
"grad_norm": 0.5540108888142588,
"learning_rate": 9.821987494047228e-07,
"loss": 0.6314468383789062,
"step": 136,
"token_acc": 0.7919692387557874
},
{
"epoch": 0.3959537572254335,
"grad_norm": 0.5722154073047628,
"learning_rate": 9.817749962596114e-07,
"loss": 0.602054238319397,
"step": 137,
"token_acc": 0.802066245506265
},
{
"epoch": 0.3988439306358382,
"grad_norm": 0.5596376441219622,
"learning_rate": 9.813463522029957e-07,
"loss": 0.640647292137146,
"step": 138,
"token_acc": 0.7918518615352437
},
{
"epoch": 0.40173410404624277,
"grad_norm": 0.5545182797573466,
"learning_rate": 9.809128215864096e-07,
"loss": 0.6066859364509583,
"step": 139,
"token_acc": 0.801196721208976
},
{
"epoch": 0.4046242774566474,
"grad_norm": 0.5784484895204948,
"learning_rate": 9.804744088109941e-07,
"loss": 0.5408949851989746,
"step": 140,
"token_acc": 0.8248328121430766
},
{
"epoch": 0.407514450867052,
"grad_norm": 0.5637555298781167,
"learning_rate": 9.80031118327454e-07,
"loss": 0.6107698678970337,
"step": 141,
"token_acc": 0.7982127620772081
},
{
"epoch": 0.41040462427745666,
"grad_norm": 0.603110232763829,
"learning_rate": 9.795829546360113e-07,
"loss": 0.5912826061248779,
"step": 142,
"token_acc": 0.8041540066906055
},
{
"epoch": 0.41329479768786126,
"grad_norm": 0.5873555056914542,
"learning_rate": 9.791299222863602e-07,
"loss": 0.6161830425262451,
"step": 143,
"token_acc": 0.799708864508567
},
{
"epoch": 0.4161849710982659,
"grad_norm": 0.6843944560990027,
"learning_rate": 9.786720258776213e-07,
"loss": 0.5474255681037903,
"step": 144,
"token_acc": 0.8186930860033726
},
{
"epoch": 0.4190751445086705,
"grad_norm": 0.51545250769897,
"learning_rate": 9.782092700582936e-07,
"loss": 0.6216602325439453,
"step": 145,
"token_acc": 0.7965911940150556
},
{
"epoch": 0.42196531791907516,
"grad_norm": 0.5937549088482647,
"learning_rate": 9.77741659526209e-07,
"loss": 0.6248494386672974,
"step": 146,
"token_acc": 0.7956684720442111
},
{
"epoch": 0.42485549132947975,
"grad_norm": 0.5399979093459059,
"learning_rate": 9.77269199028483e-07,
"loss": 0.6089432239532471,
"step": 147,
"token_acc": 0.796826403459652
},
{
"epoch": 0.4277456647398844,
"grad_norm": 0.5564248028198713,
"learning_rate": 9.76791893361468e-07,
"loss": 0.6312023401260376,
"step": 148,
"token_acc": 0.7918012705466769
},
{
"epoch": 0.430635838150289,
"grad_norm": 0.559936805840691,
"learning_rate": 9.763097473707035e-07,
"loss": 0.619454026222229,
"step": 149,
"token_acc": 0.7984878886834271
},
{
"epoch": 0.43352601156069365,
"grad_norm": 0.6044059322614584,
"learning_rate": 9.758227659508668e-07,
"loss": 0.5221510529518127,
"step": 150,
"token_acc": 0.8266117865021535
},
{
"epoch": 0.43641618497109824,
"grad_norm": 0.5692770162596946,
"learning_rate": 9.753309540457248e-07,
"loss": 0.6139217615127563,
"step": 151,
"token_acc": 0.7982664696096701
},
{
"epoch": 0.4393063583815029,
"grad_norm": 0.5330985388783729,
"learning_rate": 9.748343166480822e-07,
"loss": 0.6154735088348389,
"step": 152,
"token_acc": 0.7984871546515382
},
{
"epoch": 0.4421965317919075,
"grad_norm": 0.6065632918781179,
"learning_rate": 9.743328587997314e-07,
"loss": 0.5449005365371704,
"step": 153,
"token_acc": 0.8221805561096261
},
{
"epoch": 0.44508670520231214,
"grad_norm": 0.6274255114547471,
"learning_rate": 9.738265855914012e-07,
"loss": 0.6112866401672363,
"step": 154,
"token_acc": 0.7997394616484714
},
{
"epoch": 0.4479768786127168,
"grad_norm": 0.6000527996102515,
"learning_rate": 9.733155021627057e-07,
"loss": 0.6302502155303955,
"step": 155,
"token_acc": 0.7939255615270142
},
{
"epoch": 0.4508670520231214,
"grad_norm": 0.5716424963426585,
"learning_rate": 9.727996137020916e-07,
"loss": 0.5589959621429443,
"step": 156,
"token_acc": 0.8167590708119868
},
{
"epoch": 0.45375722543352603,
"grad_norm": 0.5793130145184638,
"learning_rate": 9.722789254467854e-07,
"loss": 0.5811511874198914,
"step": 157,
"token_acc": 0.8068220017796527
},
{
"epoch": 0.45664739884393063,
"grad_norm": 0.6447386736666927,
"learning_rate": 9.717534426827404e-07,
"loss": 0.6125731468200684,
"step": 158,
"token_acc": 0.7982601354147698
},
{
"epoch": 0.4595375722543353,
"grad_norm": 0.5583551050757221,
"learning_rate": 9.712231707445831e-07,
"loss": 0.5681207180023193,
"step": 159,
"token_acc": 0.812138891502776
},
{
"epoch": 0.4624277456647399,
"grad_norm": 0.6227411154474924,
"learning_rate": 9.70688115015559e-07,
"loss": 0.5606650114059448,
"step": 160,
"token_acc": 0.8128119485280195
},
{
"epoch": 0.4653179190751445,
"grad_norm": 0.5637826519102942,
"learning_rate": 9.701482809274787e-07,
"loss": 0.584591269493103,
"step": 161,
"token_acc": 0.809975090499813
},
{
"epoch": 0.4682080924855491,
"grad_norm": 0.5527836562945804,
"learning_rate": 9.696036739606606e-07,
"loss": 0.6178029775619507,
"step": 162,
"token_acc": 0.7982424352237725
},
{
"epoch": 0.47109826589595377,
"grad_norm": 0.5261451706415371,
"learning_rate": 9.690542996438777e-07,
"loss": 0.5772680640220642,
"step": 163,
"token_acc": 0.8055154702213526
},
{
"epoch": 0.47398843930635837,
"grad_norm": 0.598598068991984,
"learning_rate": 9.685001635543005e-07,
"loss": 0.5761500597000122,
"step": 164,
"token_acc": 0.8095295422689632
},
{
"epoch": 0.476878612716763,
"grad_norm": 0.5603114991623558,
"learning_rate": 9.679412713174398e-07,
"loss": 0.6070771217346191,
"step": 165,
"token_acc": 0.7988323213451658
},
{
"epoch": 0.4797687861271676,
"grad_norm": 0.5909619017551228,
"learning_rate": 9.673776286070905e-07,
"loss": 0.5829952955245972,
"step": 166,
"token_acc": 0.8056856359399237
},
{
"epoch": 0.48265895953757226,
"grad_norm": 0.7664205949048083,
"learning_rate": 9.668092411452735e-07,
"loss": 0.591526985168457,
"step": 167,
"token_acc": 0.805959940764539
},
{
"epoch": 0.48554913294797686,
"grad_norm": 0.5816382553386844,
"learning_rate": 9.66236114702178e-07,
"loss": 0.6718764901161194,
"step": 168,
"token_acc": 0.7819054715177417
},
{
"epoch": 0.4884393063583815,
"grad_norm": 0.5443192837285905,
"learning_rate": 9.656582550961018e-07,
"loss": 0.5771794319152832,
"step": 169,
"token_acc": 0.8120637180483624
},
{
"epoch": 0.4913294797687861,
"grad_norm": 0.5439506241087468,
"learning_rate": 9.650756681933947e-07,
"loss": 0.5797525644302368,
"step": 170,
"token_acc": 0.8072481275670452
},
{
"epoch": 0.49421965317919075,
"grad_norm": 0.5750701292908912,
"learning_rate": 9.644883599083957e-07,
"loss": 0.616324782371521,
"step": 171,
"token_acc": 0.7961593487416124
},
{
"epoch": 0.49710982658959535,
"grad_norm": 0.5292422990653295,
"learning_rate": 9.638963362033756e-07,
"loss": 0.6252388954162598,
"step": 172,
"token_acc": 0.7945571248522018
},
{
"epoch": 0.5,
"grad_norm": 0.519900156438812,
"learning_rate": 9.632996030884748e-07,
"loss": 0.6072378158569336,
"step": 173,
"token_acc": 0.7983872825711323
},
{
"epoch": 0.5028901734104047,
"grad_norm": 2.014285868322542,
"learning_rate": 9.626981666216439e-07,
"loss": 0.5167373418807983,
"step": 174,
"token_acc": 0.8304752994472689
},
{
"epoch": 0.5057803468208093,
"grad_norm": 0.6229356072638176,
"learning_rate": 9.620920329085802e-07,
"loss": 0.5613738894462585,
"step": 175,
"token_acc": 0.8164609282841512
},
{
"epoch": 0.5086705202312138,
"grad_norm": 0.6427491754173409,
"learning_rate": 9.614812081026678e-07,
"loss": 0.6089553236961365,
"step": 176,
"token_acc": 0.8013446815125724
},
{
"epoch": 0.5115606936416185,
"grad_norm": 0.4795382180524186,
"learning_rate": 9.608656984049132e-07,
"loss": 0.579177737236023,
"step": 177,
"token_acc": 0.806047379906923
},
{
"epoch": 0.5144508670520231,
"grad_norm": 0.5089663171794683,
"learning_rate": 9.602455100638835e-07,
"loss": 0.5813893675804138,
"step": 178,
"token_acc": 0.8087914556082915
},
{
"epoch": 0.5173410404624278,
"grad_norm": 0.6116010486180593,
"learning_rate": 9.596206493756432e-07,
"loss": 0.5549554824829102,
"step": 179,
"token_acc": 0.8173080502386111
},
{
"epoch": 0.5202312138728323,
"grad_norm": 0.4852226717563288,
"learning_rate": 9.589911226836895e-07,
"loss": 0.5808215737342834,
"step": 180,
"token_acc": 0.8052112098427888
},
{
"epoch": 0.523121387283237,
"grad_norm": 0.5270020853161572,
"learning_rate": 9.583569363788879e-07,
"loss": 0.6398844122886658,
"step": 181,
"token_acc": 0.7898708976833977
},
{
"epoch": 0.5260115606936416,
"grad_norm": 0.5073350335042175,
"learning_rate": 9.577180968994081e-07,
"loss": 0.6154753565788269,
"step": 182,
"token_acc": 0.7993068610377478
},
{
"epoch": 0.5289017341040463,
"grad_norm": 0.5631567506627345,
"learning_rate": 9.57074610730658e-07,
"loss": 0.5920361876487732,
"step": 183,
"token_acc": 0.8048126355828951
},
{
"epoch": 0.5317919075144508,
"grad_norm": 0.4995115799741094,
"learning_rate": 9.56426484405218e-07,
"loss": 0.5912809371948242,
"step": 184,
"token_acc": 0.8075411124942672
},
{
"epoch": 0.5346820809248555,
"grad_norm": 0.560250197890468,
"learning_rate": 9.557737245027746e-07,
"loss": 0.6125437021255493,
"step": 185,
"token_acc": 0.7972027972027972
},
{
"epoch": 0.5375722543352601,
"grad_norm": 0.5819218618969146,
"learning_rate": 9.551163376500542e-07,
"loss": 0.5732159614562988,
"step": 186,
"token_acc": 0.8115202124085258
},
{
"epoch": 0.5404624277456648,
"grad_norm": 0.6129732835255256,
"learning_rate": 9.544543305207546e-07,
"loss": 0.6079097986221313,
"step": 187,
"token_acc": 0.7997229197333102
},
{
"epoch": 0.5433526011560693,
"grad_norm": 0.5263001528585832,
"learning_rate": 9.537877098354784e-07,
"loss": 0.5925722718238831,
"step": 188,
"token_acc": 0.8029342210305924
},
{
"epoch": 0.546242774566474,
"grad_norm": 0.583594997315983,
"learning_rate": 9.531164823616646e-07,
"loss": 0.5865395069122314,
"step": 189,
"token_acc": 0.8063752604903651
},
{
"epoch": 0.5491329479768786,
"grad_norm": 0.5781895560822031,
"learning_rate": 9.524406549135193e-07,
"loss": 0.6117700338363647,
"step": 190,
"token_acc": 0.7980149336253496
},
{
"epoch": 0.5520231213872833,
"grad_norm": 0.4893230139872087,
"learning_rate": 9.517602343519471e-07,
"loss": 0.5652576684951782,
"step": 191,
"token_acc": 0.8107140229095636
},
{
"epoch": 0.5549132947976878,
"grad_norm": 0.5760419810427979,
"learning_rate": 9.510752275844809e-07,
"loss": 0.579891562461853,
"step": 192,
"token_acc": 0.805735200834105
},
{
"epoch": 0.5578034682080925,
"grad_norm": 0.5102671355626198,
"learning_rate": 9.503856415652125e-07,
"loss": 0.5964775681495667,
"step": 193,
"token_acc": 0.8034283288223744
},
{
"epoch": 0.5606936416184971,
"grad_norm": 0.4894002019430091,
"learning_rate": 9.496914832947214e-07,
"loss": 0.6064220666885376,
"step": 194,
"token_acc": 0.799232275930387
},
{
"epoch": 0.5635838150289018,
"grad_norm": 0.5939844831348525,
"learning_rate": 9.489927598200043e-07,
"loss": 0.6116449236869812,
"step": 195,
"token_acc": 0.797429447731885
},
{
"epoch": 0.5664739884393064,
"grad_norm": 0.4783949579372596,
"learning_rate": 9.482894782344024e-07,
"loss": 0.6082786321640015,
"step": 196,
"token_acc": 0.796939850416096
},
{
"epoch": 0.569364161849711,
"grad_norm": 0.5532830089434996,
"learning_rate": 9.475816456775312e-07,
"loss": 0.5998172760009766,
"step": 197,
"token_acc": 0.8034065270191963
},
{
"epoch": 0.5722543352601156,
"grad_norm": 0.5660410481873773,
"learning_rate": 9.468692693352062e-07,
"loss": 0.5715000629425049,
"step": 198,
"token_acc": 0.8105325892615268
},
{
"epoch": 0.5751445086705202,
"grad_norm": 0.5454360730485784,
"learning_rate": 9.461523564393714e-07,
"loss": 0.5121803283691406,
"step": 199,
"token_acc": 0.8285392705145792
},
{
"epoch": 0.5780346820809249,
"grad_norm": 0.5378535866046305,
"learning_rate": 9.454309142680246e-07,
"loss": 0.5945334434509277,
"step": 200,
"token_acc": 0.8058855053489177
},
{
"epoch": 0.5809248554913294,
"grad_norm": 0.569376306217556,
"learning_rate": 9.447049501451447e-07,
"loss": 0.5850614905357361,
"step": 201,
"token_acc": 0.8075420015918657
},
{
"epoch": 0.5838150289017341,
"grad_norm": 0.5596293780541032,
"learning_rate": 9.439744714406166e-07,
"loss": 0.5594047904014587,
"step": 202,
"token_acc": 0.8121667287250859
},
{
"epoch": 0.5867052023121387,
"grad_norm": 0.5138636330605458,
"learning_rate": 9.432394855701568e-07,
"loss": 0.5849941372871399,
"step": 203,
"token_acc": 0.8073615179939259
},
{
"epoch": 0.5895953757225434,
"grad_norm": 0.5804821715876541,
"learning_rate": 9.424999999952374e-07,
"loss": 0.5801274180412292,
"step": 204,
"token_acc": 0.8069783212978903
},
{
"epoch": 0.5924855491329479,
"grad_norm": 0.5724417549737069,
"learning_rate": 9.417560222230114e-07,
"loss": 0.549828827381134,
"step": 205,
"token_acc": 0.8177920383625401
},
{
"epoch": 0.5953757225433526,
"grad_norm": 0.5635873362301451,
"learning_rate": 9.410075598062357e-07,
"loss": 0.6004040241241455,
"step": 206,
"token_acc": 0.8004078427231751
},
{
"epoch": 0.5982658959537572,
"grad_norm": 0.5235901257461258,
"learning_rate": 9.402546203431947e-07,
"loss": 0.5270985960960388,
"step": 207,
"token_acc": 0.8231543624161074
},
{
"epoch": 0.6011560693641619,
"grad_norm": 0.5532559810628388,
"learning_rate": 9.394972114776229e-07,
"loss": 0.574277937412262,
"step": 208,
"token_acc": 0.8074010315538029
},
{
"epoch": 0.6040462427745664,
"grad_norm": 0.5812311718782175,
"learning_rate": 9.387353408986282e-07,
"loss": 0.595463216304779,
"step": 209,
"token_acc": 0.8024861291665605
},
{
"epoch": 0.6069364161849711,
"grad_norm": 0.5142938651985898,
"learning_rate": 9.379690163406128e-07,
"loss": 0.5852739214897156,
"step": 210,
"token_acc": 0.8058286827885552
},
{
"epoch": 0.6098265895953757,
"grad_norm": 0.5954842210532877,
"learning_rate": 9.371982455831946e-07,
"loss": 0.5914256572723389,
"step": 211,
"token_acc": 0.8022748583309552
},
{
"epoch": 0.6127167630057804,
"grad_norm": 0.5993748062356747,
"learning_rate": 9.364230364511295e-07,
"loss": 0.5815471410751343,
"step": 212,
"token_acc": 0.8078214734227942
},
{
"epoch": 0.615606936416185,
"grad_norm": 0.5946619701512068,
"learning_rate": 9.356433968142305e-07,
"loss": 0.5513661503791809,
"step": 213,
"token_acc": 0.8162251537633719
},
{
"epoch": 0.6184971098265896,
"grad_norm": 0.6203774782127278,
"learning_rate": 9.34859334587289e-07,
"loss": 0.5972813367843628,
"step": 214,
"token_acc": 0.8014712230836974
},
{
"epoch": 0.6213872832369942,
"grad_norm": 0.551145459721042,
"learning_rate": 9.340708577299936e-07,
"loss": 0.6008709669113159,
"step": 215,
"token_acc": 0.8010602678571429
},
{
"epoch": 0.6242774566473989,
"grad_norm": 0.5965436915708601,
"learning_rate": 9.332779742468495e-07,
"loss": 0.6075496673583984,
"step": 216,
"token_acc": 0.7974854091642866
},
{
"epoch": 0.6271676300578035,
"grad_norm": 0.5460165665763135,
"learning_rate": 9.324806921870975e-07,
"loss": 0.5693843364715576,
"step": 217,
"token_acc": 0.8103969870963759
},
{
"epoch": 0.630057803468208,
"grad_norm": 0.5966690969554563,
"learning_rate": 9.316790196446323e-07,
"loss": 0.5560802221298218,
"step": 218,
"token_acc": 0.8236988940183998
},
{
"epoch": 0.6329479768786127,
"grad_norm": 0.6560441235449157,
"learning_rate": 9.308729647579199e-07,
"loss": 0.5824184417724609,
"step": 219,
"token_acc": 0.8070714583452526
},
{
"epoch": 0.6358381502890174,
"grad_norm": 0.6006127755099283,
"learning_rate": 9.30062535709915e-07,
"loss": 0.6167861819267273,
"step": 220,
"token_acc": 0.796514221545372
},
{
"epoch": 0.638728323699422,
"grad_norm": 0.5570520813344141,
"learning_rate": 9.292477407279789e-07,
"loss": 0.6107242703437805,
"step": 221,
"token_acc": 0.7990834404515732
},
{
"epoch": 0.6416184971098265,
"grad_norm": 0.5419716560460497,
"learning_rate": 9.284285880837946e-07,
"loss": 0.5959486365318298,
"step": 222,
"token_acc": 0.8022954328356064
},
{
"epoch": 0.6445086705202312,
"grad_norm": 0.6657313771062484,
"learning_rate": 9.276050860932837e-07,
"loss": 0.5727354884147644,
"step": 223,
"token_acc": 0.8082750530162884
},
{
"epoch": 0.6473988439306358,
"grad_norm": 0.512607896262416,
"learning_rate": 9.267772431165218e-07,
"loss": 0.5810614228248596,
"step": 224,
"token_acc": 0.8100355584987692
},
{
"epoch": 0.6502890173410405,
"grad_norm": 0.5208342958049974,
"learning_rate": 9.259450675576535e-07,
"loss": 0.5924381017684937,
"step": 225,
"token_acc": 0.8029396939581946
},
{
"epoch": 0.653179190751445,
"grad_norm": 0.6880250488481687,
"learning_rate": 9.251085678648071e-07,
"loss": 0.6493653059005737,
"step": 226,
"token_acc": 0.7886282137800538
},
{
"epoch": 0.6560693641618497,
"grad_norm": 0.548308907840708,
"learning_rate": 9.242677525300088e-07,
"loss": 0.570950448513031,
"step": 227,
"token_acc": 0.810275809890639
},
{
"epoch": 0.6589595375722543,
"grad_norm": 0.5340467208226745,
"learning_rate": 9.234226300890972e-07,
"loss": 0.565179169178009,
"step": 228,
"token_acc": 0.8106098958194559
},
{
"epoch": 0.661849710982659,
"grad_norm": 0.5609587429682379,
"learning_rate": 9.225732091216354e-07,
"loss": 0.6229733824729919,
"step": 229,
"token_acc": 0.7947594792619757
},
{
"epoch": 0.6647398843930635,
"grad_norm": 0.640345970021987,
"learning_rate": 9.217194982508247e-07,
"loss": 0.556702196598053,
"step": 230,
"token_acc": 0.8141483516483516
},
{
"epoch": 0.6676300578034682,
"grad_norm": 0.551511374308891,
"learning_rate": 9.208615061434166e-07,
"loss": 0.6125736236572266,
"step": 231,
"token_acc": 0.7977603246777648
},
{
"epoch": 0.6705202312138728,
"grad_norm": 0.5163364555056573,
"learning_rate": 9.199992415096259e-07,
"loss": 0.5473246574401855,
"step": 232,
"token_acc": 0.8160722450845908
},
{
"epoch": 0.6734104046242775,
"grad_norm": 0.5669711665664704,
"learning_rate": 9.191327131030406e-07,
"loss": 0.543914794921875,
"step": 233,
"token_acc": 0.8196051836235239
},
{
"epoch": 0.6763005780346821,
"grad_norm": 0.5406802703932962,
"learning_rate": 9.182619297205347e-07,
"loss": 0.5660564303398132,
"step": 234,
"token_acc": 0.8103913761289696
},
{
"epoch": 0.6791907514450867,
"grad_norm": 0.556661118525528,
"learning_rate": 9.173869002021775e-07,
"loss": 0.6406779289245605,
"step": 235,
"token_acc": 0.7926350563544501
},
{
"epoch": 0.6820809248554913,
"grad_norm": 0.5201140983806046,
"learning_rate": 9.165076334311445e-07,
"loss": 0.6177135109901428,
"step": 236,
"token_acc": 0.7982128177119112
},
{
"epoch": 0.684971098265896,
"grad_norm": 0.5850116831250167,
"learning_rate": 9.156241383336278e-07,
"loss": 0.5401256680488586,
"step": 237,
"token_acc": 0.8215590591627244
},
{
"epoch": 0.6878612716763006,
"grad_norm": 0.6403194474900529,
"learning_rate": 9.147364238787443e-07,
"loss": 0.581301212310791,
"step": 238,
"token_acc": 0.8056872398548133
},
{
"epoch": 0.6907514450867052,
"grad_norm": 0.5674551611529516,
"learning_rate": 9.138444990784453e-07,
"loss": 0.6117105484008789,
"step": 239,
"token_acc": 0.7969433519630166
},
{
"epoch": 0.6936416184971098,
"grad_norm": 0.5668476584273359,
"learning_rate": 9.12948372987425e-07,
"loss": 0.6042872071266174,
"step": 240,
"token_acc": 0.8012008915710148
},
{
"epoch": 0.6965317919075145,
"grad_norm": 0.5372423597194518,
"learning_rate": 9.120480547030285e-07,
"loss": 0.5781703591346741,
"step": 241,
"token_acc": 0.8076352705410822
},
{
"epoch": 0.6994219653179191,
"grad_norm": 0.582884431687299,
"learning_rate": 9.111435533651595e-07,
"loss": 0.594234824180603,
"step": 242,
"token_acc": 0.8027408303103587
},
{
"epoch": 0.7023121387283237,
"grad_norm": 0.5468197379764062,
"learning_rate": 9.102348781561875e-07,
"loss": 0.537114143371582,
"step": 243,
"token_acc": 0.8224276312689462
},
{
"epoch": 0.7052023121387283,
"grad_norm": 0.5799094186562964,
"learning_rate": 9.093220383008544e-07,
"loss": 0.5844765901565552,
"step": 244,
"token_acc": 0.8037892679887568
},
{
"epoch": 0.708092485549133,
"grad_norm": 0.5735743433347377,
"learning_rate": 9.084050430661813e-07,
"loss": 0.6163278818130493,
"step": 245,
"token_acc": 0.7963933546643635
},
{
"epoch": 0.7109826589595376,
"grad_norm": 0.5675339701772788,
"learning_rate": 9.074839017613736e-07,
"loss": 0.5186026692390442,
"step": 246,
"token_acc": 0.8264138256627419
},
{
"epoch": 0.7138728323699421,
"grad_norm": 0.5682213760378196,
"learning_rate": 9.065586237377274e-07,
"loss": 0.5759379267692566,
"step": 247,
"token_acc": 0.8082834141978154
},
{
"epoch": 0.7167630057803468,
"grad_norm": 0.5222160620275426,
"learning_rate": 9.056292183885341e-07,
"loss": 0.5911962985992432,
"step": 248,
"token_acc": 0.803399969606123
},
{
"epoch": 0.7196531791907514,
"grad_norm": 0.5098026312902073,
"learning_rate": 9.046956951489852e-07,
"loss": 0.5775253772735596,
"step": 249,
"token_acc": 0.8074704886249294
},
{
"epoch": 0.7225433526011561,
"grad_norm": 0.524303335092293,
"learning_rate": 9.037580634960763e-07,
"loss": 0.5572794675827026,
"step": 250,
"token_acc": 0.8146691719232317
},
{
"epoch": 0.7254335260115607,
"grad_norm": 0.6033497475819745,
"learning_rate": 9.028163329485112e-07,
"loss": 0.5832095742225647,
"step": 251,
"token_acc": 0.8073202656110331
},
{
"epoch": 0.7283236994219653,
"grad_norm": 0.5556496694710653,
"learning_rate": 9.018705130666049e-07,
"loss": 0.5459315776824951,
"step": 252,
"token_acc": 0.8191452178897479
},
{
"epoch": 0.7312138728323699,
"grad_norm": 0.7747218495040153,
"learning_rate": 9.009206134521868e-07,
"loss": 0.5795873999595642,
"step": 253,
"token_acc": 0.8071730383987341
},
{
"epoch": 0.7341040462427746,
"grad_norm": 0.5652371374587928,
"learning_rate": 8.999666437485034e-07,
"loss": 0.5758365392684937,
"step": 254,
"token_acc": 0.811742473608758
},
{
"epoch": 0.7369942196531792,
"grad_norm": 0.5206182140440342,
"learning_rate": 8.990086136401198e-07,
"loss": 0.5303860306739807,
"step": 255,
"token_acc": 0.823020148188528
},
{
"epoch": 0.7398843930635838,
"grad_norm": 0.6450852115537637,
"learning_rate": 8.980465328528218e-07,
"loss": 0.5547192096710205,
"step": 256,
"token_acc": 0.8162106882834197
},
{
"epoch": 0.7427745664739884,
"grad_norm": 0.5196181500327283,
"learning_rate": 8.970804111535175e-07,
"loss": 0.5457019209861755,
"step": 257,
"token_acc": 0.8167301624082492
},
{
"epoch": 0.7456647398843931,
"grad_norm": 0.6356725122188899,
"learning_rate": 8.961102583501375e-07,
"loss": 0.5676227807998657,
"step": 258,
"token_acc": 0.8146457172245137
},
{
"epoch": 0.7485549132947977,
"grad_norm": 0.5766749980898508,
"learning_rate": 8.951360842915355e-07,
"loss": 0.5487492084503174,
"step": 259,
"token_acc": 0.8176302961517421
},
{
"epoch": 0.7514450867052023,
"grad_norm": 0.561193367543964,
"learning_rate": 8.941578988673885e-07,
"loss": 0.5508721470832825,
"step": 260,
"token_acc": 0.8148807459638577
},
{
"epoch": 0.7543352601156069,
"grad_norm": 1.1616614497713094,
"learning_rate": 8.931757120080965e-07,
"loss": 0.5649725794792175,
"step": 261,
"token_acc": 0.8123450235984954
},
{
"epoch": 0.7572254335260116,
"grad_norm": 0.6269083895254,
"learning_rate": 8.921895336846812e-07,
"loss": 0.5234044790267944,
"step": 262,
"token_acc": 0.826336871809926
},
{
"epoch": 0.7601156069364162,
"grad_norm": 0.5491932745407809,
"learning_rate": 8.911993739086852e-07,
"loss": 0.5335085391998291,
"step": 263,
"token_acc": 0.8243787856172078
},
{
"epoch": 0.7630057803468208,
"grad_norm": 0.6001894076535953,
"learning_rate": 8.902052427320703e-07,
"loss": 0.6009457111358643,
"step": 264,
"token_acc": 0.8005332320797702
},
{
"epoch": 0.7658959537572254,
"grad_norm": 0.6105633418239023,
"learning_rate": 8.892071502471154e-07,
"loss": 0.512947678565979,
"step": 265,
"token_acc": 0.8283333333333334
},
{
"epoch": 0.7687861271676301,
"grad_norm": 0.530310690982596,
"learning_rate": 8.882051065863139e-07,
"loss": 0.5578915476799011,
"step": 266,
"token_acc": 0.8134685584406639
},
{
"epoch": 0.7716763005780347,
"grad_norm": 0.6053842724913201,
"learning_rate": 8.871991219222712e-07,
"loss": 0.5307576656341553,
"step": 267,
"token_acc": 0.8237498632235475
},
{
"epoch": 0.7745664739884393,
"grad_norm": 0.5839374903786066,
"learning_rate": 8.861892064676008e-07,
"loss": 0.4724132716655731,
"step": 268,
"token_acc": 0.8406308417366578
},
{
"epoch": 0.7774566473988439,
"grad_norm": 0.5382380436884167,
"learning_rate": 8.851753704748219e-07,
"loss": 0.5864905118942261,
"step": 269,
"token_acc": 0.805320596148614
},
{
"epoch": 0.7803468208092486,
"grad_norm": 0.536612826265518,
"learning_rate": 8.841576242362533e-07,
"loss": 0.5369473695755005,
"step": 270,
"token_acc": 0.8202307927330842
},
{
"epoch": 0.7832369942196532,
"grad_norm": 0.48433135594375987,
"learning_rate": 8.831359780839107e-07,
"loss": 0.5745148062705994,
"step": 271,
"token_acc": 0.8114247865236928
},
{
"epoch": 0.7861271676300579,
"grad_norm": 0.565668286608129,
"learning_rate": 8.821104423894014e-07,
"loss": 0.5306930541992188,
"step": 272,
"token_acc": 0.8240810142731839
},
{
"epoch": 0.7890173410404624,
"grad_norm": 0.5347471169063638,
"learning_rate": 8.810810275638182e-07,
"loss": 0.5508551597595215,
"step": 273,
"token_acc": 0.8150747430289043
},
{
"epoch": 0.791907514450867,
"grad_norm": 0.5872611855148089,
"learning_rate": 8.800477440576346e-07,
"loss": 0.5582222938537598,
"step": 274,
"token_acc": 0.8141057178356111
},
{
"epoch": 0.7947976878612717,
"grad_norm": 0.5930933510081743,
"learning_rate": 8.790106023605985e-07,
"loss": 0.5265220403671265,
"step": 275,
"token_acc": 0.8236343698306786
},
{
"epoch": 0.7976878612716763,
"grad_norm": 0.5326943859900286,
"learning_rate": 8.779696130016252e-07,
"loss": 0.589282751083374,
"step": 276,
"token_acc": 0.8041843462366995
},
{
"epoch": 0.8005780346820809,
"grad_norm": 0.682574668475925,
"learning_rate": 8.769247865486915e-07,
"loss": 0.5634682178497314,
"step": 277,
"token_acc": 0.8131609072741031
},
{
"epoch": 0.8034682080924855,
"grad_norm": 0.6170926445265313,
"learning_rate": 8.758761336087273e-07,
"loss": 0.5282115340232849,
"step": 278,
"token_acc": 0.8240009668063165
},
{
"epoch": 0.8063583815028902,
"grad_norm": 0.5931538447313858,
"learning_rate": 8.748236648275087e-07,
"loss": 0.4907287061214447,
"step": 279,
"token_acc": 0.838809946714032
},
{
"epoch": 0.8092485549132948,
"grad_norm": 0.567206538957563,
"learning_rate": 8.737673908895497e-07,
"loss": 0.6097589731216431,
"step": 280,
"token_acc": 0.7990020422972478
},
{
"epoch": 0.8121387283236994,
"grad_norm": 0.5887119791348107,
"learning_rate": 8.727073225179937e-07,
"loss": 0.5625665187835693,
"step": 281,
"token_acc": 0.8113687537033379
},
{
"epoch": 0.815028901734104,
"grad_norm": 0.5836331757411469,
"learning_rate": 8.716434704745046e-07,
"loss": 0.513110339641571,
"step": 282,
"token_acc": 0.8275925912738822
},
{
"epoch": 0.8179190751445087,
"grad_norm": 0.6054924912257345,
"learning_rate": 8.705758455591576e-07,
"loss": 0.602730393409729,
"step": 283,
"token_acc": 0.8022713898227125
},
{
"epoch": 0.8208092485549133,
"grad_norm": 0.6236226833744741,
"learning_rate": 8.695044586103295e-07,
"loss": 0.5747796893119812,
"step": 284,
"token_acc": 0.8079837217906031
},
{
"epoch": 0.8236994219653179,
"grad_norm": 0.5865612629064065,
"learning_rate": 8.684293205045889e-07,
"loss": 0.6070411205291748,
"step": 285,
"token_acc": 0.7988344760774713
},
{
"epoch": 0.8265895953757225,
"grad_norm": 0.5503455006576133,
"learning_rate": 8.673504421565856e-07,
"loss": 0.5685064792633057,
"step": 286,
"token_acc": 0.8102210757057314
},
{
"epoch": 0.8294797687861272,
"grad_norm": 0.5972785565939337,
"learning_rate": 8.662678345189396e-07,
"loss": 0.46608567237854004,
"step": 287,
"token_acc": 0.8438823801959227
},
{
"epoch": 0.8323699421965318,
"grad_norm": 0.5201509566608107,
"learning_rate": 8.651815085821302e-07,
"loss": 0.5298614501953125,
"step": 288,
"token_acc": 0.8236416811984237
},
{
"epoch": 0.8352601156069365,
"grad_norm": 0.49819051940062725,
"learning_rate": 8.640914753743847e-07,
"loss": 0.5882748365402222,
"step": 289,
"token_acc": 0.8065492356638473
},
{
"epoch": 0.838150289017341,
"grad_norm": 0.6397626208223341,
"learning_rate": 8.629977459615654e-07,
"loss": 0.604642927646637,
"step": 290,
"token_acc": 0.798697597059869
},
{
"epoch": 0.8410404624277457,
"grad_norm": 0.5735121088769557,
"learning_rate": 8.619003314470586e-07,
"loss": 0.5657530426979065,
"step": 291,
"token_acc": 0.8134929241446619
},
{
"epoch": 0.8439306358381503,
"grad_norm": 0.6029592728755434,
"learning_rate": 8.607992429716608e-07,
"loss": 0.5807414054870605,
"step": 292,
"token_acc": 0.8062111084672681
},
{
"epoch": 0.846820809248555,
"grad_norm": 0.5204268288621456,
"learning_rate": 8.596944917134666e-07,
"loss": 0.5696761608123779,
"step": 293,
"token_acc": 0.8102849975611456
},
{
"epoch": 0.8497109826589595,
"grad_norm": 0.570216087116967,
"learning_rate": 8.585860888877536e-07,
"loss": 0.6144391298294067,
"step": 294,
"token_acc": 0.7976966055615415
},
{
"epoch": 0.8526011560693642,
"grad_norm": 0.525009085518107,
"learning_rate": 8.574740457468708e-07,
"loss": 0.5926086902618408,
"step": 295,
"token_acc": 0.8030848268880814
},
{
"epoch": 0.8554913294797688,
"grad_norm": 0.5397367841143723,
"learning_rate": 8.563583735801223e-07,
"loss": 0.5647125244140625,
"step": 296,
"token_acc": 0.8113542939673369
},
{
"epoch": 0.8583815028901735,
"grad_norm": 0.5453044997059636,
"learning_rate": 8.55239083713654e-07,
"loss": 0.5306450128555298,
"step": 297,
"token_acc": 0.8242952898276619
},
{
"epoch": 0.861271676300578,
"grad_norm": 0.49382426600759494,
"learning_rate": 8.541161875103379e-07,
"loss": 0.5655560493469238,
"step": 298,
"token_acc": 0.81170671232068
},
{
"epoch": 0.8641618497109826,
"grad_norm": 0.5609985492228051,
"learning_rate": 8.529896963696576e-07,
"loss": 0.5431415438652039,
"step": 299,
"token_acc": 0.8162933876284661
},
{
"epoch": 0.8670520231213873,
"grad_norm": 0.5476351474370762,
"learning_rate": 8.51859621727591e-07,
"loss": 0.5872442126274109,
"step": 300,
"token_acc": 0.8065929411453266
},
{
"epoch": 0.869942196531792,
"grad_norm": 0.5282221087597836,
"learning_rate": 8.507259750564961e-07,
"loss": 0.5451909899711609,
"step": 301,
"token_acc": 0.8188552557155108
},
{
"epoch": 0.8728323699421965,
"grad_norm": 0.503389270767867,
"learning_rate": 8.495887678649932e-07,
"loss": 0.5154858231544495,
"step": 302,
"token_acc": 0.8274329950559459
},
{
"epoch": 0.8757225433526011,
"grad_norm": 0.518940089504941,
"learning_rate": 8.484480116978486e-07,
"loss": 0.5244746208190918,
"step": 303,
"token_acc": 0.8264815952633637
},
{
"epoch": 0.8786127167630058,
"grad_norm": 0.573024895950047,
"learning_rate": 8.473037181358573e-07,
"loss": 0.592721700668335,
"step": 304,
"token_acc": 0.8035201013934049
},
{
"epoch": 0.8815028901734104,
"grad_norm": 0.5039735997055694,
"learning_rate": 8.461558987957252e-07,
"loss": 0.5656961798667908,
"step": 305,
"token_acc": 0.8130110070213994
},
{
"epoch": 0.884393063583815,
"grad_norm": 0.5476756827664239,
"learning_rate": 8.45004565329952e-07,
"loss": 0.5374190807342529,
"step": 306,
"token_acc": 0.820976424170279
},
{
"epoch": 0.8872832369942196,
"grad_norm": 0.5275746578408953,
"learning_rate": 8.438497294267116e-07,
"loss": 0.5982400178909302,
"step": 307,
"token_acc": 0.7999831918648626
},
{
"epoch": 0.8901734104046243,
"grad_norm": 0.532750300928086,
"learning_rate": 8.426914028097347e-07,
"loss": 0.584047794342041,
"step": 308,
"token_acc": 0.8066207177537092
},
{
"epoch": 0.8930635838150289,
"grad_norm": 0.5003914631256399,
"learning_rate": 8.415295972381889e-07,
"loss": 0.6089476346969604,
"step": 309,
"token_acc": 0.7978914509526754
},
{
"epoch": 0.8959537572254336,
"grad_norm": 0.6278624794022574,
"learning_rate": 8.403643245065597e-07,
"loss": 0.5697731375694275,
"step": 310,
"token_acc": 0.8108995234993658
},
{
"epoch": 0.8988439306358381,
"grad_norm": 0.6052633593556834,
"learning_rate": 8.391955964445309e-07,
"loss": 0.5913630723953247,
"step": 311,
"token_acc": 0.8023921969586315
},
{
"epoch": 0.9017341040462428,
"grad_norm": 0.5312386556419646,
"learning_rate": 8.38023424916864e-07,
"loss": 0.5818167924880981,
"step": 312,
"token_acc": 0.8053130715134147
},
{
"epoch": 0.9046242774566474,
"grad_norm": 0.5377630147019918,
"learning_rate": 8.368478218232787e-07,
"loss": 0.5994030237197876,
"step": 313,
"token_acc": 0.8010770419994847
},
{
"epoch": 0.9075144508670521,
"grad_norm": 0.6387143665462728,
"learning_rate": 8.356687990983305e-07,
"loss": 0.5747004747390747,
"step": 314,
"token_acc": 0.8103654791154791
},
{
"epoch": 0.9104046242774566,
"grad_norm": 0.5539012149779035,
"learning_rate": 8.344863687112913e-07,
"loss": 0.5109165906906128,
"step": 315,
"token_acc": 0.8275082819675849
},
{
"epoch": 0.9132947976878613,
"grad_norm": 0.5431996662851367,
"learning_rate": 8.333005426660271e-07,
"loss": 0.4984626770019531,
"step": 316,
"token_acc": 0.8326753471796506
},
{
"epoch": 0.9161849710982659,
"grad_norm": 0.5476844147731238,
"learning_rate": 8.321113330008756e-07,
"loss": 0.5582059025764465,
"step": 317,
"token_acc": 0.8131992060627932
},
{
"epoch": 0.9190751445086706,
"grad_norm": 0.5288904758826702,
"learning_rate": 8.309187517885249e-07,
"loss": 0.5965433120727539,
"step": 318,
"token_acc": 0.8015113167980331
},
{
"epoch": 0.9219653179190751,
"grad_norm": 0.5061439317002303,
"learning_rate": 8.297228111358906e-07,
"loss": 0.50608229637146,
"step": 319,
"token_acc": 0.8302445369795833
},
{
"epoch": 0.9248554913294798,
"grad_norm": 0.49043399117893216,
"learning_rate": 8.285235231839927e-07,
"loss": 0.5492719411849976,
"step": 320,
"token_acc": 0.8174581468830556
},
{
"epoch": 0.9277456647398844,
"grad_norm": 0.6174249587001943,
"learning_rate": 8.273209001078324e-07,
"loss": 0.553361177444458,
"step": 321,
"token_acc": 0.8119886458507264
},
{
"epoch": 0.930635838150289,
"grad_norm": 0.5616150428871276,
"learning_rate": 8.261149541162691e-07,
"loss": 0.6025636196136475,
"step": 322,
"token_acc": 0.8005087935801005
},
{
"epoch": 0.9335260115606936,
"grad_norm": 0.6478516612944865,
"learning_rate": 8.249056974518954e-07,
"loss": 0.5491775274276733,
"step": 323,
"token_acc": 0.8185532095041541
},
{
"epoch": 0.9364161849710982,
"grad_norm": 0.5031858383227522,
"learning_rate": 8.236931423909138e-07,
"loss": 0.6022853255271912,
"step": 324,
"token_acc": 0.8037384243419552
},
{
"epoch": 0.9393063583815029,
"grad_norm": 0.5752991697267287,
"learning_rate": 8.224773012430114e-07,
"loss": 0.5954960584640503,
"step": 325,
"token_acc": 0.8036680189317106
},
{
"epoch": 0.9421965317919075,
"grad_norm": 0.5295029516066992,
"learning_rate": 8.212581863512353e-07,
"loss": 0.5488483309745789,
"step": 326,
"token_acc": 0.8157750324575375
},
{
"epoch": 0.9450867052023122,
"grad_norm": 0.5368502799479243,
"learning_rate": 8.20035810091867e-07,
"loss": 0.5652696490287781,
"step": 327,
"token_acc": 0.8106361614705574
},
{
"epoch": 0.9479768786127167,
"grad_norm": 0.5847097314866032,
"learning_rate": 8.188101848742974e-07,
"loss": 0.544079065322876,
"step": 328,
"token_acc": 0.819971546427805
},
{
"epoch": 0.9508670520231214,
"grad_norm": 0.5255181020508993,
"learning_rate": 8.175813231408999e-07,
"loss": 0.4978986382484436,
"step": 329,
"token_acc": 0.8333199723062348
},
{
"epoch": 0.953757225433526,
"grad_norm": 0.5127048703010287,
"learning_rate": 8.163492373669047e-07,
"loss": 0.5805110931396484,
"step": 330,
"token_acc": 0.8056335113743647
},
{
"epoch": 0.9566473988439307,
"grad_norm": 0.652335019028349,
"learning_rate": 8.15113940060272e-07,
"loss": 0.5597442388534546,
"step": 331,
"token_acc": 0.8161630076551519
},
{
"epoch": 0.9595375722543352,
"grad_norm": 0.5947335075670345,
"learning_rate": 8.13875443761565e-07,
"loss": 0.5277099609375,
"step": 332,
"token_acc": 0.8274886297575488
},
{
"epoch": 0.9624277456647399,
"grad_norm": 0.5459606580402216,
"learning_rate": 8.126337610438229e-07,
"loss": 0.5635240077972412,
"step": 333,
"token_acc": 0.8108978939573075
},
{
"epoch": 0.9653179190751445,
"grad_norm": 0.5488564858287155,
"learning_rate": 8.113889045124323e-07,
"loss": 0.49523666501045227,
"step": 334,
"token_acc": 0.8329320341089853
},
{
"epoch": 0.9682080924855492,
"grad_norm": 0.5694023522198697,
"learning_rate": 8.101408868050008e-07,
"loss": 0.5316784381866455,
"step": 335,
"token_acc": 0.8213875427499967
},
{
"epoch": 0.9710982658959537,
"grad_norm": 0.5290670622343212,
"learning_rate": 8.088897205912271e-07,
"loss": 0.5768337249755859,
"step": 336,
"token_acc": 0.808409267610014
},
{
"epoch": 0.9739884393063584,
"grad_norm": 0.5630882737173935,
"learning_rate": 8.076354185727734e-07,
"loss": 0.5607028007507324,
"step": 337,
"token_acc": 0.8111738071422572
},
{
"epoch": 0.976878612716763,
"grad_norm": 0.5389758264031266,
"learning_rate": 8.06377993483136e-07,
"loss": 0.5800102949142456,
"step": 338,
"token_acc": 0.8064102564102564
},
{
"epoch": 0.9797687861271677,
"grad_norm": 0.6483925804091112,
"learning_rate": 8.051174580875163e-07,
"loss": 0.5936282873153687,
"step": 339,
"token_acc": 0.8033736003463585
},
{
"epoch": 0.9826589595375722,
"grad_norm": 0.5683588968241811,
"learning_rate": 8.038538251826912e-07,
"loss": 0.5602604150772095,
"step": 340,
"token_acc": 0.8103426182505487
},
{
"epoch": 0.9855491329479769,
"grad_norm": 0.4984007019353715,
"learning_rate": 8.025871075968826e-07,
"loss": 0.559136152267456,
"step": 341,
"token_acc": 0.8140824580290378
},
{
"epoch": 0.9884393063583815,
"grad_norm": 1.1899348194485317,
"learning_rate": 8.013173181896282e-07,
"loss": 0.5955883860588074,
"step": 342,
"token_acc": 0.8027926447988978
},
{
"epoch": 0.9913294797687862,
"grad_norm": 0.5388156404908695,
"learning_rate": 8.0004446985165e-07,
"loss": 0.5661012530326843,
"step": 343,
"token_acc": 0.8099668055056346
},
{
"epoch": 0.9942196531791907,
"grad_norm": 0.5412535831553995,
"learning_rate": 7.987685755047242e-07,
"loss": 0.6086287498474121,
"step": 344,
"token_acc": 0.7963722407145177
},
{
"epoch": 0.9971098265895953,
"grad_norm": 0.696761929081249,
"learning_rate": 7.974896481015494e-07,
"loss": 0.5823131799697876,
"step": 345,
"token_acc": 0.8073882514689755
},
{
"epoch": 1.0,
"grad_norm": 0.4953947640304795,
"learning_rate": 7.962077006256153e-07,
"loss": 0.5682995319366455,
"step": 346,
"token_acc": 0.8121095151492658
},
{
"epoch": 1.0028901734104045,
"grad_norm": 0.7111654355632505,
"learning_rate": 7.94922746091071e-07,
"loss": 0.6060156226158142,
"step": 347,
"token_acc": 0.8014354938608955
},
{
"epoch": 1.0057803468208093,
"grad_norm": 0.5507935056779134,
"learning_rate": 7.93634797542593e-07,
"loss": 0.5295247435569763,
"step": 348,
"token_acc": 0.8211228506318624
},
{
"epoch": 1.0086705202312138,
"grad_norm": 0.6189562361784823,
"learning_rate": 7.923438680552525e-07,
"loss": 0.5647916197776794,
"step": 349,
"token_acc": 0.8137873547100433
},
{
"epoch": 1.0115606936416186,
"grad_norm": 0.6801159002216328,
"learning_rate": 7.910499707343828e-07,
"loss": 0.590101420879364,
"step": 350,
"token_acc": 0.803803399890662
},
{
"epoch": 1.0144508670520231,
"grad_norm": 0.6049076830653918,
"learning_rate": 7.897531187154458e-07,
"loss": 0.5088500380516052,
"step": 351,
"token_acc": 0.8279876049759735
},
{
"epoch": 1.0173410404624277,
"grad_norm": 0.5654302790773965,
"learning_rate": 7.884533251638999e-07,
"loss": 0.5929542779922485,
"step": 352,
"token_acc": 0.8047063731856507
},
{
"epoch": 1.0202312138728324,
"grad_norm": 0.5880451344105353,
"learning_rate": 7.87150603275065e-07,
"loss": 0.5749261379241943,
"step": 353,
"token_acc": 0.8056116433808085
},
{
"epoch": 1.023121387283237,
"grad_norm": 0.5426830225682386,
"learning_rate": 7.85844966273989e-07,
"loss": 0.5945314168930054,
"step": 354,
"token_acc": 0.800486217737808
},
{
"epoch": 1.0260115606936415,
"grad_norm": 0.49678361176775165,
"learning_rate": 7.845364274153139e-07,
"loss": 0.4898013472557068,
"step": 355,
"token_acc": 0.8352619622320034
},
{
"epoch": 1.0289017341040463,
"grad_norm": 0.6954304853085829,
"learning_rate": 7.832249999831406e-07,
"loss": 0.5588274598121643,
"step": 356,
"token_acc": 0.8166684201080533
},
{
"epoch": 1.0317919075144508,
"grad_norm": 0.5310648615446059,
"learning_rate": 7.819106972908949e-07,
"loss": 0.5819897651672363,
"step": 357,
"token_acc": 0.8045070775826193
},
{
"epoch": 1.0346820809248556,
"grad_norm": 0.5923922817451516,
"learning_rate": 7.805935326811912e-07,
"loss": 0.5737313032150269,
"step": 358,
"token_acc": 0.8051378103467133
},
{
"epoch": 1.0375722543352601,
"grad_norm": 0.5178307979556245,
"learning_rate": 7.79273519525698e-07,
"loss": 0.5936248302459717,
"step": 359,
"token_acc": 0.8025767773866199
},
{
"epoch": 1.0404624277456647,
"grad_norm": 0.5286013733045867,
"learning_rate": 7.779506712250022e-07,
"loss": 0.5494135618209839,
"step": 360,
"token_acc": 0.8171926851655723
},
{
"epoch": 1.0433526011560694,
"grad_norm": 0.49585832807282065,
"learning_rate": 7.766250012084722e-07,
"loss": 0.5698336958885193,
"step": 361,
"token_acc": 0.8116101814090845
},
{
"epoch": 1.046242774566474,
"grad_norm": 0.6962712390013456,
"learning_rate": 7.752965229341219e-07,
"loss": 0.535956621170044,
"step": 362,
"token_acc": 0.822281059722762
},
{
"epoch": 1.0491329479768785,
"grad_norm": 0.5694059644679526,
"learning_rate": 7.739652498884747e-07,
"loss": 0.5675574541091919,
"step": 363,
"token_acc": 0.8093009931245225
},
{
"epoch": 1.0520231213872833,
"grad_norm": 0.5547323442483891,
"learning_rate": 7.726311955864261e-07,
"loss": 0.5611029863357544,
"step": 364,
"token_acc": 0.8125364888148433
},
{
"epoch": 1.0549132947976878,
"grad_norm": 0.5476729662614271,
"learning_rate": 7.712943735711062e-07,
"loss": 0.5374180674552917,
"step": 365,
"token_acc": 0.8212820320132261
},
{
"epoch": 1.0578034682080926,
"grad_norm": 0.5180731484879565,
"learning_rate": 7.699547974137426e-07,
"loss": 0.5433316230773926,
"step": 366,
"token_acc": 0.8200906177478174
},
{
"epoch": 1.060693641618497,
"grad_norm": 0.5798685069888638,
"learning_rate": 7.686124807135228e-07,
"loss": 0.5966153740882874,
"step": 367,
"token_acc": 0.8028633971139337
},
{
"epoch": 1.0635838150289016,
"grad_norm": 0.5594356403434023,
"learning_rate": 7.672674370974558e-07,
"loss": 0.5133764743804932,
"step": 368,
"token_acc": 0.8287475052817048
},
{
"epoch": 1.0664739884393064,
"grad_norm": 0.5414940672989453,
"learning_rate": 7.659196802202338e-07,
"loss": 0.5794786214828491,
"step": 369,
"token_acc": 0.8080960204454181
},
{
"epoch": 1.069364161849711,
"grad_norm": 0.5596146246622683,
"learning_rate": 7.645692237640937e-07,
"loss": 0.6179242134094238,
"step": 370,
"token_acc": 0.7978232829012561
},
{
"epoch": 1.0722543352601157,
"grad_norm": 0.5658616759599563,
"learning_rate": 7.632160814386779e-07,
"loss": 0.5489234924316406,
"step": 371,
"token_acc": 0.818960201793722
},
{
"epoch": 1.0751445086705202,
"grad_norm": 0.5583854062469837,
"learning_rate": 7.618602669808957e-07,
"loss": 0.5576378703117371,
"step": 372,
"token_acc": 0.8134194149383499
},
{
"epoch": 1.0780346820809248,
"grad_norm": 0.5709606663652054,
"learning_rate": 7.605017941547835e-07,
"loss": 0.5531469583511353,
"step": 373,
"token_acc": 0.8139197537682152
},
{
"epoch": 1.0809248554913296,
"grad_norm": 0.5401961587153568,
"learning_rate": 7.591406767513648e-07,
"loss": 0.5335639715194702,
"step": 374,
"token_acc": 0.8189074796640434
},
{
"epoch": 1.083815028901734,
"grad_norm": 0.5776452597256104,
"learning_rate": 7.577769285885108e-07,
"loss": 0.5792023539543152,
"step": 375,
"token_acc": 0.8059631052038535
},
{
"epoch": 1.0867052023121386,
"grad_norm": 0.6631103343737483,
"learning_rate": 7.564105635107996e-07,
"loss": 0.5358845591545105,
"step": 376,
"token_acc": 0.8186349045446866
},
{
"epoch": 1.0895953757225434,
"grad_norm": 0.49688934026931153,
"learning_rate": 7.550415953893756e-07,
"loss": 0.5017120242118835,
"step": 377,
"token_acc": 0.8296466328279073
},
{
"epoch": 1.092485549132948,
"grad_norm": 0.5499825048622536,
"learning_rate": 7.536700381218097e-07,
"loss": 0.5757490396499634,
"step": 378,
"token_acc": 0.8071212248675023
},
{
"epoch": 1.0953757225433527,
"grad_norm": 0.5724354451620394,
"learning_rate": 7.522959056319564e-07,
"loss": 0.5289810299873352,
"step": 379,
"token_acc": 0.8224057244166174
},
{
"epoch": 1.0982658959537572,
"grad_norm": 0.5295598164095123,
"learning_rate": 7.509192118698145e-07,
"loss": 0.5217394828796387,
"step": 380,
"token_acc": 0.8247749871572029
},
{
"epoch": 1.1011560693641618,
"grad_norm": 0.6732543146745934,
"learning_rate": 7.49539970811384e-07,
"loss": 0.5446665287017822,
"step": 381,
"token_acc": 0.8187780645617508
},
{
"epoch": 1.1040462427745665,
"grad_norm": 0.593141398734888,
"learning_rate": 7.481581964585244e-07,
"loss": 0.6174026131629944,
"step": 382,
"token_acc": 0.7958839535507607
},
{
"epoch": 1.106936416184971,
"grad_norm": 0.5915717748635032,
"learning_rate": 7.467739028388133e-07,
"loss": 0.5956196784973145,
"step": 383,
"token_acc": 0.8005577327975455
},
{
"epoch": 1.1098265895953756,
"grad_norm": 0.5486121690897104,
"learning_rate": 7.453871040054037e-07,
"loss": 0.602386474609375,
"step": 384,
"token_acc": 0.7985531236588805
},
{
"epoch": 1.1127167630057804,
"grad_norm": 0.6468015023115512,
"learning_rate": 7.439978140368803e-07,
"loss": 0.5264239311218262,
"step": 385,
"token_acc": 0.8247053516043534
},
{
"epoch": 1.115606936416185,
"grad_norm": 0.5396942599943407,
"learning_rate": 7.426060470371185e-07,
"loss": 0.5322436094284058,
"step": 386,
"token_acc": 0.8225644386194845
},
{
"epoch": 1.1184971098265897,
"grad_norm": 0.546318443194639,
"learning_rate": 7.412118171351395e-07,
"loss": 0.5636791586875916,
"step": 387,
"token_acc": 0.8132001591389744
},
{
"epoch": 1.1213872832369942,
"grad_norm": 0.5681580355518231,
"learning_rate": 7.398151384849679e-07,
"loss": 0.5519202351570129,
"step": 388,
"token_acc": 0.8136924046076314
},
{
"epoch": 1.1242774566473988,
"grad_norm": 0.5949989948835427,
"learning_rate": 7.384160252654873e-07,
"loss": 0.5511115789413452,
"step": 389,
"token_acc": 0.8144513354081949
},
{
"epoch": 1.1271676300578035,
"grad_norm": 0.4837423293992909,
"learning_rate": 7.370144916802969e-07,
"loss": 0.5643985867500305,
"step": 390,
"token_acc": 0.8112824957599688
},
{
"epoch": 1.130057803468208,
"grad_norm": 0.5611205998910804,
"learning_rate": 7.356105519575671e-07,
"loss": 0.5409538745880127,
"step": 391,
"token_acc": 0.8188429729320618
},
{
"epoch": 1.1329479768786128,
"grad_norm": 0.5181274015479428,
"learning_rate": 7.342042203498951e-07,
"loss": 0.5411881804466248,
"step": 392,
"token_acc": 0.8171947300974061
},
{
"epoch": 1.1358381502890174,
"grad_norm": 0.5497633972492808,
"learning_rate": 7.327955111341601e-07,
"loss": 0.5626124143600464,
"step": 393,
"token_acc": 0.8131716531422224
},
{
"epoch": 1.138728323699422,
"grad_norm": 0.569806645978514,
"learning_rate": 7.313844386113783e-07,
"loss": 0.533359169960022,
"step": 394,
"token_acc": 0.8227007051547947
},
{
"epoch": 1.1416184971098267,
"grad_norm": 0.5809695758427657,
"learning_rate": 7.299710171065584e-07,
"loss": 0.5428122282028198,
"step": 395,
"token_acc": 0.8167381946213591
},
{
"epoch": 1.1445086705202312,
"grad_norm": 0.5685994639717983,
"learning_rate": 7.28555260968555e-07,
"loss": 0.5661939382553101,
"step": 396,
"token_acc": 0.8107361575857062
},
{
"epoch": 1.147398843930636,
"grad_norm": 0.5687294924284086,
"learning_rate": 7.271371845699241e-07,
"loss": 0.4796743392944336,
"step": 397,
"token_acc": 0.8378044059980814
},
{
"epoch": 1.1502890173410405,
"grad_norm": 0.5570998116553988,
"learning_rate": 7.257168023067759e-07,
"loss": 0.5698948502540588,
"step": 398,
"token_acc": 0.8108394509164174
},
{
"epoch": 1.153179190751445,
"grad_norm": 0.5764653559793665,
"learning_rate": 7.242941285986303e-07,
"loss": 0.5216134190559387,
"step": 399,
"token_acc": 0.8264347873981053
},
{
"epoch": 1.1560693641618498,
"grad_norm": 0.5519714242613649,
"learning_rate": 7.228691778882692e-07,
"loss": 0.5965580940246582,
"step": 400,
"token_acc": 0.8008848328263255
},
{
"epoch": 1.1589595375722543,
"grad_norm": 0.5713833806622776,
"learning_rate": 7.2144196464159e-07,
"loss": 0.530504584312439,
"step": 401,
"token_acc": 0.8193537207392506
},
{
"epoch": 1.1618497109826589,
"grad_norm": 0.5112285942897958,
"learning_rate": 7.200125033474598e-07,
"loss": 0.5425513982772827,
"step": 402,
"token_acc": 0.8176038122905598
},
{
"epoch": 1.1647398843930636,
"grad_norm": 0.5891524284010872,
"learning_rate": 7.185808085175668e-07,
"loss": 0.5737115740776062,
"step": 403,
"token_acc": 0.811070949924867
},
{
"epoch": 1.1676300578034682,
"grad_norm": 0.8927491774092401,
"learning_rate": 7.171468946862743e-07,
"loss": 0.5100395083427429,
"step": 404,
"token_acc": 0.8297666772416578
},
{
"epoch": 1.1705202312138727,
"grad_norm": 0.6290027028336996,
"learning_rate": 7.157107764104723e-07,
"loss": 0.5254942178726196,
"step": 405,
"token_acc": 0.8239488461275081
},
{
"epoch": 1.1734104046242775,
"grad_norm": 0.5413566372730959,
"learning_rate": 7.142724682694299e-07,
"loss": 0.5764940977096558,
"step": 406,
"token_acc": 0.8086516073191842
},
{
"epoch": 1.176300578034682,
"grad_norm": 0.5581695811593094,
"learning_rate": 7.128319848646477e-07,
"loss": 0.5500423312187195,
"step": 407,
"token_acc": 0.8153743413040916
},
{
"epoch": 1.1791907514450868,
"grad_norm": 0.4681952163328979,
"learning_rate": 7.113893408197091e-07,
"loss": 0.5582858324050903,
"step": 408,
"token_acc": 0.8114563586911728
},
{
"epoch": 1.1820809248554913,
"grad_norm": 0.6826359609914151,
"learning_rate": 7.099445507801323e-07,
"loss": 0.49809369444847107,
"step": 409,
"token_acc": 0.8353448588307781
},
{
"epoch": 1.1849710982658959,
"grad_norm": 0.5090205197384219,
"learning_rate": 7.084976294132207e-07,
"loss": 0.6029922962188721,
"step": 410,
"token_acc": 0.7973656093105548
},
{
"epoch": 1.1878612716763006,
"grad_norm": 0.5269042882225241,
"learning_rate": 7.070485914079151e-07,
"loss": 0.5927149057388306,
"step": 411,
"token_acc": 0.8014037282759605
},
{
"epoch": 1.1907514450867052,
"grad_norm": 0.49950817881103576,
"learning_rate": 7.055974514746445e-07,
"loss": 0.5837708711624146,
"step": 412,
"token_acc": 0.8074309042384765
},
{
"epoch": 1.19364161849711,
"grad_norm": 0.5860116475494397,
"learning_rate": 7.041442243451752e-07,
"loss": 0.5210489630699158,
"step": 413,
"token_acc": 0.8244094424028096
},
{
"epoch": 1.1965317919075145,
"grad_norm": 0.5718657608384051,
"learning_rate": 7.026889247724635e-07,
"loss": 0.5820956230163574,
"step": 414,
"token_acc": 0.8042295599535557
},
{
"epoch": 1.199421965317919,
"grad_norm": 0.5054409513703455,
"learning_rate": 7.012315675305045e-07,
"loss": 0.5862281918525696,
"step": 415,
"token_acc": 0.8023793187527289
},
{
"epoch": 1.2023121387283238,
"grad_norm": 0.5766487774658408,
"learning_rate": 6.997721674141822e-07,
"loss": 0.520296037197113,
"step": 416,
"token_acc": 0.8252748600155311
},
{
"epoch": 1.2052023121387283,
"grad_norm": 0.537979220335716,
"learning_rate": 6.983107392391202e-07,
"loss": 0.5797343850135803,
"step": 417,
"token_acc": 0.80571660344046
},
{
"epoch": 1.208092485549133,
"grad_norm": 0.5396946740305607,
"learning_rate": 6.9684729784153e-07,
"loss": 0.6153110265731812,
"step": 418,
"token_acc": 0.7969049998485812
},
{
"epoch": 1.2109826589595376,
"grad_norm": 0.5642823581815699,
"learning_rate": 6.953818580780613e-07,
"loss": 0.5325438976287842,
"step": 419,
"token_acc": 0.8222246858832225
},
{
"epoch": 1.2138728323699421,
"grad_norm": 0.5535087521581403,
"learning_rate": 6.939144348256511e-07,
"loss": 0.5709867477416992,
"step": 420,
"token_acc": 0.8069591256176074
},
{
"epoch": 1.216763005780347,
"grad_norm": 0.572340555748076,
"learning_rate": 6.924450429813723e-07,
"loss": 0.5548975467681885,
"step": 421,
"token_acc": 0.8185377583894686
},
{
"epoch": 1.2196531791907514,
"grad_norm": 0.5155912490897337,
"learning_rate": 6.909736974622826e-07,
"loss": 0.5856627225875854,
"step": 422,
"token_acc": 0.8058833037013092
},
{
"epoch": 1.222543352601156,
"grad_norm": 0.5287358182605065,
"learning_rate": 6.895004132052735e-07,
"loss": 0.530200719833374,
"step": 423,
"token_acc": 0.822671307855992
},
{
"epoch": 1.2254335260115607,
"grad_norm": 0.5377464968526829,
"learning_rate": 6.88025205166918e-07,
"loss": 0.6028895974159241,
"step": 424,
"token_acc": 0.8013212984612038
},
{
"epoch": 1.2283236994219653,
"grad_norm": 0.5204405657753005,
"learning_rate": 6.865480883233189e-07,
"loss": 0.5590497851371765,
"step": 425,
"token_acc": 0.8117163218535146
},
{
"epoch": 1.2312138728323698,
"grad_norm": 0.45493496853760634,
"learning_rate": 6.850690776699573e-07,
"loss": 0.5726251602172852,
"step": 426,
"token_acc": 0.8084424978300127
},
{
"epoch": 1.2341040462427746,
"grad_norm": 0.6240376452291253,
"learning_rate": 6.835881882215395e-07,
"loss": 0.5343113541603088,
"step": 427,
"token_acc": 0.8196929353326794
},
{
"epoch": 1.2369942196531791,
"grad_norm": 0.5773298029457239,
"learning_rate": 6.821054350118458e-07,
"loss": 0.5317709445953369,
"step": 428,
"token_acc": 0.8196335435275461
},
{
"epoch": 1.239884393063584,
"grad_norm": 0.5477278016005382,
"learning_rate": 6.806208330935766e-07,
"loss": 0.5721542835235596,
"step": 429,
"token_acc": 0.8069397675429067
},
{
"epoch": 1.2427745664739884,
"grad_norm": 0.5954432022727356,
"learning_rate": 6.791343975381999e-07,
"loss": 0.59670090675354,
"step": 430,
"token_acc": 0.8028038691690053
},
{
"epoch": 1.245664739884393,
"grad_norm": 0.6299231511446614,
"learning_rate": 6.776461434357993e-07,
"loss": 0.5712985396385193,
"step": 431,
"token_acc": 0.8093430920755399
},
{
"epoch": 1.2485549132947977,
"grad_norm": 0.5405979300580379,
"learning_rate": 6.761560858949192e-07,
"loss": 0.5809611082077026,
"step": 432,
"token_acc": 0.8070006162733515
},
{
"epoch": 1.2514450867052023,
"grad_norm": 0.5516822339033575,
"learning_rate": 6.746642400424131e-07,
"loss": 0.5620344281196594,
"step": 433,
"token_acc": 0.8121798185065721
},
{
"epoch": 1.254335260115607,
"grad_norm": 0.5284837836987685,
"learning_rate": 6.731706210232882e-07,
"loss": 0.5855224132537842,
"step": 434,
"token_acc": 0.8044497743554139
},
{
"epoch": 1.2572254335260116,
"grad_norm": 0.5627730241670859,
"learning_rate": 6.716752440005537e-07,
"loss": 0.5670550465583801,
"step": 435,
"token_acc": 0.8096381386958137
},
{
"epoch": 1.260115606936416,
"grad_norm": 0.538509679886266,
"learning_rate": 6.701781241550648e-07,
"loss": 0.5526491403579712,
"step": 436,
"token_acc": 0.8155125315340866
},
{
"epoch": 1.2630057803468209,
"grad_norm": 0.4771561540026018,
"learning_rate": 6.686792766853705e-07,
"loss": 0.5505247712135315,
"step": 437,
"token_acc": 0.8138159537283621
},
{
"epoch": 1.2658959537572254,
"grad_norm": 0.5223829257694631,
"learning_rate": 6.671787168075575e-07,
"loss": 0.5447695255279541,
"step": 438,
"token_acc": 0.8178192464935741
},
{
"epoch": 1.2687861271676302,
"grad_norm": 0.5159364504277794,
"learning_rate": 6.656764597550975e-07,
"loss": 0.5982085466384888,
"step": 439,
"token_acc": 0.8001320834327017
},
{
"epoch": 1.2716763005780347,
"grad_norm": 0.5310637224775283,
"learning_rate": 6.641725207786909e-07,
"loss": 0.5778173208236694,
"step": 440,
"token_acc": 0.8066611125837846
},
{
"epoch": 1.2745664739884393,
"grad_norm": 0.56776340532874,
"learning_rate": 6.626669151461133e-07,
"loss": 0.5481947660446167,
"step": 441,
"token_acc": 0.8165455226676658
},
{
"epoch": 1.2774566473988438,
"grad_norm": 0.5289033874903101,
"learning_rate": 6.611596581420599e-07,
"loss": 0.5178524255752563,
"step": 442,
"token_acc": 0.8276837132314907
},
{
"epoch": 1.2803468208092486,
"grad_norm": 0.6054263277819003,
"learning_rate": 6.596507650679899e-07,
"loss": 0.5819660425186157,
"step": 443,
"token_acc": 0.8038088791803834
},
{
"epoch": 1.2832369942196533,
"grad_norm": 0.5487293303925478,
"learning_rate": 6.581402512419723e-07,
"loss": 0.5847280621528625,
"step": 444,
"token_acc": 0.80743134495099
},
{
"epoch": 1.2861271676300579,
"grad_norm": 0.5388475336099026,
"learning_rate": 6.566281319985295e-07,
"loss": 0.5863124132156372,
"step": 445,
"token_acc": 0.8067254504627854
},
{
"epoch": 1.2890173410404624,
"grad_norm": 0.5538452871257553,
"learning_rate": 6.551144226884815e-07,
"loss": 0.5669398307800293,
"step": 446,
"token_acc": 0.8087953975429001
},
{
"epoch": 1.291907514450867,
"grad_norm": 0.557772227891473,
"learning_rate": 6.53599138678791e-07,
"loss": 0.5209745764732361,
"step": 447,
"token_acc": 0.8239799595072235
},
{
"epoch": 1.2947976878612717,
"grad_norm": 0.6127169435529054,
"learning_rate": 6.520822953524065e-07,
"loss": 0.5106294751167297,
"step": 448,
"token_acc": 0.8277936680145971
},
{
"epoch": 1.2976878612716762,
"grad_norm": 0.5375147488907324,
"learning_rate": 6.505639081081066e-07,
"loss": 0.5071303844451904,
"step": 449,
"token_acc": 0.8268003446613994
},
{
"epoch": 1.300578034682081,
"grad_norm": 0.5553311529997369,
"learning_rate": 6.490439923603435e-07,
"loss": 0.5532734394073486,
"step": 450,
"token_acc": 0.8134406172882417
},
{
"epoch": 1.3034682080924855,
"grad_norm": 0.5998759397432016,
"learning_rate": 6.475225635390863e-07,
"loss": 0.5865392088890076,
"step": 451,
"token_acc": 0.8023424626486245
},
{
"epoch": 1.30635838150289,
"grad_norm": 0.5417420736704273,
"learning_rate": 6.459996370896652e-07,
"loss": 0.546296238899231,
"step": 452,
"token_acc": 0.8187062949013282
},
{
"epoch": 1.3092485549132948,
"grad_norm": 0.5655148261341275,
"learning_rate": 6.444752284726135e-07,
"loss": 0.5877007246017456,
"step": 453,
"token_acc": 0.8039364919354839
},
{
"epoch": 1.3121387283236994,
"grad_norm": 0.6144864679165839,
"learning_rate": 6.429493531635114e-07,
"loss": 0.5454727411270142,
"step": 454,
"token_acc": 0.8179015382597002
},
{
"epoch": 1.3150289017341041,
"grad_norm": 0.5513024274913209,
"learning_rate": 6.414220266528291e-07,
"loss": 0.553301215171814,
"step": 455,
"token_acc": 0.8119396930565884
},
{
"epoch": 1.3179190751445087,
"grad_norm": 0.5291432658218749,
"learning_rate": 6.398932644457689e-07,
"loss": 0.5474492311477661,
"step": 456,
"token_acc": 0.8148487159928808
},
{
"epoch": 1.3208092485549132,
"grad_norm": 0.5239384490420579,
"learning_rate": 6.383630820621081e-07,
"loss": 0.5769109725952148,
"step": 457,
"token_acc": 0.8075285980313913
},
{
"epoch": 1.323699421965318,
"grad_norm": 0.5372997474035569,
"learning_rate": 6.368314950360415e-07,
"loss": 0.5458542108535767,
"step": 458,
"token_acc": 0.818262614678899
},
{
"epoch": 1.3265895953757225,
"grad_norm": 0.5222784886904625,
"learning_rate": 6.352985189160234e-07,
"loss": 0.543486475944519,
"step": 459,
"token_acc": 0.8140883445049911
},
{
"epoch": 1.3294797687861273,
"grad_norm": 0.5656149822293426,
"learning_rate": 6.337641692646106e-07,
"loss": 0.5165099501609802,
"step": 460,
"token_acc": 0.8232782145649256
},
{
"epoch": 1.3323699421965318,
"grad_norm": 0.5339208409670375,
"learning_rate": 6.322284616583026e-07,
"loss": 0.568447470664978,
"step": 461,
"token_acc": 0.8107062348801407
},
{
"epoch": 1.3352601156069364,
"grad_norm": 0.534789315369846,
"learning_rate": 6.306914116873862e-07,
"loss": 0.5637167692184448,
"step": 462,
"token_acc": 0.8118799414154401
},
{
"epoch": 1.3381502890173411,
"grad_norm": 0.5013992587561265,
"learning_rate": 6.291530349557749e-07,
"loss": 0.6041359305381775,
"step": 463,
"token_acc": 0.8002847429734529
},
{
"epoch": 1.3410404624277457,
"grad_norm": 0.6327002649058038,
"learning_rate": 6.27613347080851e-07,
"loss": 0.5996913909912109,
"step": 464,
"token_acc": 0.8028000921266601
},
{
"epoch": 1.3439306358381504,
"grad_norm": 0.47925020942862323,
"learning_rate": 6.260723636933076e-07,
"loss": 0.5272285342216492,
"step": 465,
"token_acc": 0.8219443104776792
},
{
"epoch": 1.346820809248555,
"grad_norm": 0.5418997127974843,
"learning_rate": 6.2453010043699e-07,
"loss": 0.5982799530029297,
"step": 466,
"token_acc": 0.8018455748733745
},
{
"epoch": 1.3497109826589595,
"grad_norm": 0.511563505395346,
"learning_rate": 6.22986572968736e-07,
"loss": 0.5489825010299683,
"step": 467,
"token_acc": 0.8149126753184632
},
{
"epoch": 1.352601156069364,
"grad_norm": 0.6199984691110088,
"learning_rate": 6.214417969582181e-07,
"loss": 0.5509693622589111,
"step": 468,
"token_acc": 0.8135395589697864
},
{
"epoch": 1.3554913294797688,
"grad_norm": 0.9112236282410355,
"learning_rate": 6.198957880877833e-07,
"loss": 0.5764250755310059,
"step": 469,
"token_acc": 0.8059208967249633
},
{
"epoch": 1.3583815028901733,
"grad_norm": 0.5989342589849401,
"learning_rate": 6.183485620522946e-07,
"loss": 0.5593207478523254,
"step": 470,
"token_acc": 0.8130887081520711
},
{
"epoch": 1.361271676300578,
"grad_norm": 0.539630418011966,
"learning_rate": 6.168001345589715e-07,
"loss": 0.5798720121383667,
"step": 471,
"token_acc": 0.8067868478007105
},
{
"epoch": 1.3641618497109826,
"grad_norm": 0.5728505086100849,
"learning_rate": 6.152505213272307e-07,
"loss": 0.5105577707290649,
"step": 472,
"token_acc": 0.8268291947926711
},
{
"epoch": 1.3670520231213872,
"grad_norm": 0.5731864783632108,
"learning_rate": 6.136997380885259e-07,
"loss": 0.505968451499939,
"step": 473,
"token_acc": 0.8303852677489701
},
{
"epoch": 1.369942196531792,
"grad_norm": 0.5786843206230191,
"learning_rate": 6.12147800586189e-07,
"loss": 0.570541501045227,
"step": 474,
"token_acc": 0.8074693848475233
},
{
"epoch": 1.3728323699421965,
"grad_norm": 0.5300325283027945,
"learning_rate": 6.105947245752696e-07,
"loss": 0.5622447729110718,
"step": 475,
"token_acc": 0.8132824737156444
},
{
"epoch": 1.3757225433526012,
"grad_norm": 0.5128915878177316,
"learning_rate": 6.090405258223756e-07,
"loss": 0.5856798887252808,
"step": 476,
"token_acc": 0.8047204813663714
},
{
"epoch": 1.3786127167630058,
"grad_norm": 0.6515506714427548,
"learning_rate": 6.074852201055121e-07,
"loss": 0.5826733112335205,
"step": 477,
"token_acc": 0.8034700052323068
},
{
"epoch": 1.3815028901734103,
"grad_norm": 0.5411318320511171,
"learning_rate": 6.059288232139225e-07,
"loss": 0.5210794806480408,
"step": 478,
"token_acc": 0.8267671925390047
},
{
"epoch": 1.384393063583815,
"grad_norm": 0.5057150421228545,
"learning_rate": 6.043713509479277e-07,
"loss": 0.5771398544311523,
"step": 479,
"token_acc": 0.805000332629771
},
{
"epoch": 1.3872832369942196,
"grad_norm": 0.5683244072025584,
"learning_rate": 6.028128191187653e-07,
"loss": 0.5385507941246033,
"step": 480,
"token_acc": 0.8176837578528416
},
{
"epoch": 1.3901734104046244,
"grad_norm": 0.5074682411792649,
"learning_rate": 6.012532435484297e-07,
"loss": 0.5577852725982666,
"step": 481,
"token_acc": 0.8137183546223177
},
{
"epoch": 1.393063583815029,
"grad_norm": 0.5087445776495183,
"learning_rate": 5.996926400695113e-07,
"loss": 0.5707537531852722,
"step": 482,
"token_acc": 0.8081652461733929
},
{
"epoch": 1.3959537572254335,
"grad_norm": 0.6070774288583548,
"learning_rate": 5.981310245250351e-07,
"loss": 0.5291765928268433,
"step": 483,
"token_acc": 0.8198564644248993
},
{
"epoch": 1.3988439306358382,
"grad_norm": 0.6426245006677934,
"learning_rate": 5.965684127683012e-07,
"loss": 0.5093721151351929,
"step": 484,
"token_acc": 0.8283427901813247
},
{
"epoch": 1.4017341040462428,
"grad_norm": 0.5235719939982498,
"learning_rate": 5.950048206627228e-07,
"loss": 0.5404484272003174,
"step": 485,
"token_acc": 0.8198352412538783
},
{
"epoch": 1.4046242774566475,
"grad_norm": 0.5330826415435456,
"learning_rate": 5.934402640816651e-07,
"loss": 0.6019877195358276,
"step": 486,
"token_acc": 0.7991196347629723
},
{
"epoch": 1.407514450867052,
"grad_norm": 0.5402624006228682,
"learning_rate": 5.918747589082852e-07,
"loss": 0.512151300907135,
"step": 487,
"token_acc": 0.8255600510667488
},
{
"epoch": 1.4104046242774566,
"grad_norm": 0.6034074325578554,
"learning_rate": 5.903083210353695e-07,
"loss": 0.5242146253585815,
"step": 488,
"token_acc": 0.821293480679374
},
{
"epoch": 1.4132947976878611,
"grad_norm": 0.6270290133131012,
"learning_rate": 5.887409663651736e-07,
"loss": 0.5783629417419434,
"step": 489,
"token_acc": 0.8050973979809469
},
{
"epoch": 1.416184971098266,
"grad_norm": 0.5697754520754279,
"learning_rate": 5.8717271080926e-07,
"loss": 0.5560973882675171,
"step": 490,
"token_acc": 0.8151623266302166
},
{
"epoch": 1.4190751445086704,
"grad_norm": 0.5157305125572653,
"learning_rate": 5.856035702883368e-07,
"loss": 0.5741870999336243,
"step": 491,
"token_acc": 0.8082165363392618
},
{
"epoch": 1.4219653179190752,
"grad_norm": 0.552078767595136,
"learning_rate": 5.840335607320963e-07,
"loss": 0.5855275392532349,
"step": 492,
"token_acc": 0.8052095872614805
},
{
"epoch": 1.4248554913294798,
"grad_norm": 0.503224099727086,
"learning_rate": 5.824626980790532e-07,
"loss": 0.5036199688911438,
"step": 493,
"token_acc": 0.8295647769617597
},
{
"epoch": 1.4277456647398843,
"grad_norm": 0.535330314229148,
"learning_rate": 5.808909982763825e-07,
"loss": 0.5614448189735413,
"step": 494,
"token_acc": 0.8112195584194068
},
{
"epoch": 1.430635838150289,
"grad_norm": 0.5656493275743161,
"learning_rate": 5.793184772797577e-07,
"loss": 0.5648437142372131,
"step": 495,
"token_acc": 0.809333342296497
},
{
"epoch": 1.4335260115606936,
"grad_norm": 0.591964902056671,
"learning_rate": 5.777451510531894e-07,
"loss": 0.4516139626502991,
"step": 496,
"token_acc": 0.8457953488372093
},
{
"epoch": 1.4364161849710984,
"grad_norm": 0.5299075126510611,
"learning_rate": 5.761710355688627e-07,
"loss": 0.4779651165008545,
"step": 497,
"token_acc": 0.8387296285988187
},
{
"epoch": 1.439306358381503,
"grad_norm": 0.5231792243250346,
"learning_rate": 5.745961468069749e-07,
"loss": 0.5104596614837646,
"step": 498,
"token_acc": 0.8271942849713633
},
{
"epoch": 1.4421965317919074,
"grad_norm": 0.6000529888737813,
"learning_rate": 5.730205007555733e-07,
"loss": 0.6098222136497498,
"step": 499,
"token_acc": 0.797237394529817
},
{
"epoch": 1.4450867052023122,
"grad_norm": 0.5852668345047015,
"learning_rate": 5.714441134103936e-07,
"loss": 0.5637513995170593,
"step": 500,
"token_acc": 0.8103524746275665
},
{
"epoch": 1.4450867052023122,
"eval_loss": 0.5809597969055176,
"eval_runtime": 69.4729,
"eval_samples_per_second": 1.583,
"eval_steps_per_second": 0.202,
"eval_token_acc": 0.8065338513984092,
"step": 500
},
{
"epoch": 1.4479768786127167,
"grad_norm": 0.4857410434739766,
"learning_rate": 5.698670007746966e-07,
"loss": 0.5209301710128784,
"step": 501,
"token_acc": 0.8231232032245636
},
{
"epoch": 1.4508670520231215,
"grad_norm": 0.4784985253670375,
"learning_rate": 5.682891788591065e-07,
"loss": 0.5571726560592651,
"step": 502,
"token_acc": 0.8108515538539766
},
{
"epoch": 1.453757225433526,
"grad_norm": 0.5367888787021339,
"learning_rate": 5.66710663681448e-07,
"loss": 0.49731090664863586,
"step": 503,
"token_acc": 0.8312890657633916
},
{
"epoch": 1.4566473988439306,
"grad_norm": 0.610496198886357,
"learning_rate": 5.651314712665832e-07,
"loss": 0.5665647387504578,
"step": 504,
"token_acc": 0.8079466209795078
},
{
"epoch": 1.4595375722543353,
"grad_norm": 1.0378355029945652,
"learning_rate": 5.635516176462501e-07,
"loss": 0.5903141498565674,
"step": 505,
"token_acc": 0.8006371133060007
},
{
"epoch": 1.4624277456647399,
"grad_norm": 0.5540207350664488,
"learning_rate": 5.619711188588986e-07,
"loss": 0.5362493991851807,
"step": 506,
"token_acc": 0.820343725019984
},
{
"epoch": 1.4653179190751446,
"grad_norm": 0.5144874632858891,
"learning_rate": 5.603899909495283e-07,
"loss": 0.5462620258331299,
"step": 507,
"token_acc": 0.8145612480715733
},
{
"epoch": 1.4682080924855492,
"grad_norm": 0.5592031195717259,
"learning_rate": 5.58808249969526e-07,
"loss": 0.5476292371749878,
"step": 508,
"token_acc": 0.8147672146736102
},
{
"epoch": 1.4710982658959537,
"grad_norm": 0.8823564949169135,
"learning_rate": 5.57225911976502e-07,
"loss": 0.5868964195251465,
"step": 509,
"token_acc": 0.804788821591468
},
{
"epoch": 1.4739884393063583,
"grad_norm": 0.5547092232918307,
"learning_rate": 5.556429930341273e-07,
"loss": 0.5038424134254456,
"step": 510,
"token_acc": 0.8334411359013724
},
{
"epoch": 1.476878612716763,
"grad_norm": 0.5799136969979296,
"learning_rate": 5.540595092119708e-07,
"loss": 0.5707584619522095,
"step": 511,
"token_acc": 0.810527226273487
},
{
"epoch": 1.4797687861271676,
"grad_norm": 0.5034147261225864,
"learning_rate": 5.52475476585336e-07,
"loss": 0.5583351850509644,
"step": 512,
"token_acc": 0.8103952305319614
},
{
"epoch": 1.4826589595375723,
"grad_norm": 0.5160609299204681,
"learning_rate": 5.508909112350976e-07,
"loss": 0.5299844145774841,
"step": 513,
"token_acc": 0.8211946274807083
},
{
"epoch": 1.4855491329479769,
"grad_norm": 0.48690505381618093,
"learning_rate": 5.493058292475387e-07,
"loss": 0.5815989375114441,
"step": 514,
"token_acc": 0.8052997388378583
},
{
"epoch": 1.4884393063583814,
"grad_norm": 0.5497798749732475,
"learning_rate": 5.477202467141864e-07,
"loss": 0.5317429900169373,
"step": 515,
"token_acc": 0.8238470637503765
},
{
"epoch": 1.4913294797687862,
"grad_norm": 0.6297718557593524,
"learning_rate": 5.46134179731651e-07,
"loss": 0.5170228481292725,
"step": 516,
"token_acc": 0.8249339191625676
},
{
"epoch": 1.4942196531791907,
"grad_norm": 0.5879194826209626,
"learning_rate": 5.445476444014591e-07,
"loss": 0.5530685186386108,
"step": 517,
"token_acc": 0.8124287116369134
},
{
"epoch": 1.4971098265895955,
"grad_norm": 0.7172467911918745,
"learning_rate": 5.429606568298925e-07,
"loss": 0.5767130851745605,
"step": 518,
"token_acc": 0.8076758697324558
},
{
"epoch": 1.5,
"grad_norm": 0.47225756227931015,
"learning_rate": 5.413732331278248e-07,
"loss": 0.5357682704925537,
"step": 519,
"token_acc": 0.818332255376673
},
{
"epoch": 1.5028901734104045,
"grad_norm": 0.5315785549808126,
"learning_rate": 5.397853894105559e-07,
"loss": 0.5103631019592285,
"step": 520,
"token_acc": 0.8272772712126261
},
{
"epoch": 1.5057803468208093,
"grad_norm": 0.5411201442197484,
"learning_rate": 5.381971417976505e-07,
"loss": 0.6071707606315613,
"step": 521,
"token_acc": 0.7980659432441779
},
{
"epoch": 1.5086705202312138,
"grad_norm": 0.48884895821181845,
"learning_rate": 5.366085064127734e-07,
"loss": 0.5692754983901978,
"step": 522,
"token_acc": 0.8067921134275868
},
{
"epoch": 1.5115606936416186,
"grad_norm": 0.5118700142105465,
"learning_rate": 5.350194993835257e-07,
"loss": 0.5697520971298218,
"step": 523,
"token_acc": 0.8085179483452373
},
{
"epoch": 1.5144508670520231,
"grad_norm": 0.5936738113687722,
"learning_rate": 5.33430136841282e-07,
"loss": 0.5466612577438354,
"step": 524,
"token_acc": 0.8143604233276328
},
{
"epoch": 1.5173410404624277,
"grad_norm": 0.48377394646569144,
"learning_rate": 5.318404349210255e-07,
"loss": 0.5685998201370239,
"step": 525,
"token_acc": 0.808461779914424
},
{
"epoch": 1.5202312138728322,
"grad_norm": 0.5190453952524928,
"learning_rate": 5.302504097611846e-07,
"loss": 0.5479923486709595,
"step": 526,
"token_acc": 0.8150076205934166
},
{
"epoch": 1.523121387283237,
"grad_norm": 0.6234920552697755,
"learning_rate": 5.286600775034699e-07,
"loss": 0.5165071487426758,
"step": 527,
"token_acc": 0.8261736549800983
},
{
"epoch": 1.5260115606936417,
"grad_norm": 0.5881560338514248,
"learning_rate": 5.270694542927088e-07,
"loss": 0.5723020434379578,
"step": 528,
"token_acc": 0.808951938948829
},
{
"epoch": 1.5289017341040463,
"grad_norm": 0.5975961668165296,
"learning_rate": 5.254785562766829e-07,
"loss": 0.5684691667556763,
"step": 529,
"token_acc": 0.8089175396185871
},
{
"epoch": 1.5317919075144508,
"grad_norm": 0.6478162796925766,
"learning_rate": 5.238873996059637e-07,
"loss": 0.49971041083335876,
"step": 530,
"token_acc": 0.8301960912691917
},
{
"epoch": 1.5346820809248554,
"grad_norm": 0.6430164741639133,
"learning_rate": 5.222960004337476e-07,
"loss": 0.539410412311554,
"step": 531,
"token_acc": 0.8203352152694456
},
{
"epoch": 1.5375722543352601,
"grad_norm": 0.5795319284660402,
"learning_rate": 5.207043749156944e-07,
"loss": 0.5065566897392273,
"step": 532,
"token_acc": 0.8278279073124954
},
{
"epoch": 1.5404624277456649,
"grad_norm": 0.5464317098167678,
"learning_rate": 5.191125392097604e-07,
"loss": 0.5445448160171509,
"step": 533,
"token_acc": 0.8166163521084138
},
{
"epoch": 1.5433526011560694,
"grad_norm": 0.5152523695934649,
"learning_rate": 5.175205094760361e-07,
"loss": 0.5751731991767883,
"step": 534,
"token_acc": 0.8060461344386376
},
{
"epoch": 1.546242774566474,
"grad_norm": 0.5393208162828292,
"learning_rate": 5.159283018765819e-07,
"loss": 0.5777266621589661,
"step": 535,
"token_acc": 0.8069097414119084
},
{
"epoch": 1.5491329479768785,
"grad_norm": 0.5264135658228388,
"learning_rate": 5.143359325752638e-07,
"loss": 0.555731475353241,
"step": 536,
"token_acc": 0.8131375804713217
},
{
"epoch": 1.5520231213872833,
"grad_norm": 0.553539191702997,
"learning_rate": 5.127434177375893e-07,
"loss": 0.5539097189903259,
"step": 537,
"token_acc": 0.812809830006887
},
{
"epoch": 1.5549132947976878,
"grad_norm": 0.8304938276922723,
"learning_rate": 5.111507735305434e-07,
"loss": 0.535222589969635,
"step": 538,
"token_acc": 0.8182165566153093
},
{
"epoch": 1.5578034682080926,
"grad_norm": 0.5723817981155602,
"learning_rate": 5.095580161224244e-07,
"loss": 0.5616499185562134,
"step": 539,
"token_acc": 0.8143216251104015
},
{
"epoch": 1.560693641618497,
"grad_norm": 0.5555286868999088,
"learning_rate": 5.079651616826801e-07,
"loss": 0.5724209547042847,
"step": 540,
"token_acc": 0.8077700594252842
},
{
"epoch": 1.5635838150289016,
"grad_norm": 0.576829331739999,
"learning_rate": 5.063722263817427e-07,
"loss": 0.5502010583877563,
"step": 541,
"token_acc": 0.8148729355841307
},
{
"epoch": 1.5664739884393064,
"grad_norm": 0.6980607962330599,
"learning_rate": 5.047792263908659e-07,
"loss": 0.5372669696807861,
"step": 542,
"token_acc": 0.8214026830309711
},
{
"epoch": 1.569364161849711,
"grad_norm": 0.5728162578490732,
"learning_rate": 5.031861778819601e-07,
"loss": 0.5055459141731262,
"step": 543,
"token_acc": 0.8276528811478554
},
{
"epoch": 1.5722543352601157,
"grad_norm": 0.588844313912188,
"learning_rate": 5.015930970274277e-07,
"loss": 0.5107961893081665,
"step": 544,
"token_acc": 0.8256070951933737
},
{
"epoch": 1.5751445086705202,
"grad_norm": 0.49402725355257393,
"learning_rate": 5e-07,
"loss": 0.5780792236328125,
"step": 545,
"token_acc": 0.80536919727071
},
{
"epoch": 1.5780346820809248,
"grad_norm": 0.5150482904703839,
"learning_rate": 4.984069029725722e-07,
"loss": 0.5730597972869873,
"step": 546,
"token_acc": 0.8071878326447399
},
{
"epoch": 1.5809248554913293,
"grad_norm": 0.6120632562818131,
"learning_rate": 4.968138221180401e-07,
"loss": 0.48976290225982666,
"step": 547,
"token_acc": 0.8329062019477191
},
{
"epoch": 1.583815028901734,
"grad_norm": 0.5693765551777754,
"learning_rate": 4.95220773609134e-07,
"loss": 0.5690828561782837,
"step": 548,
"token_acc": 0.8103057397715957
},
{
"epoch": 1.5867052023121389,
"grad_norm": 0.5356011166477922,
"learning_rate": 4.936277736182573e-07,
"loss": 0.5775788426399231,
"step": 549,
"token_acc": 0.8077360101658677
},
{
"epoch": 1.5895953757225434,
"grad_norm": 0.5425409660783537,
"learning_rate": 4.9203483831732e-07,
"loss": 0.5838006138801575,
"step": 550,
"token_acc": 0.8051269382791122
},
{
"epoch": 1.592485549132948,
"grad_norm": 0.531381332935958,
"learning_rate": 4.904419838775755e-07,
"loss": 0.528168797492981,
"step": 551,
"token_acc": 0.8208675592063154
},
{
"epoch": 1.5953757225433525,
"grad_norm": 0.5122923018471659,
"learning_rate": 4.888492264694565e-07,
"loss": 0.5490496158599854,
"step": 552,
"token_acc": 0.8156343068498415
},
{
"epoch": 1.5982658959537572,
"grad_norm": 0.5537803937619057,
"learning_rate": 4.872565822624106e-07,
"loss": 0.5283633470535278,
"step": 553,
"token_acc": 0.8213697374264063
},
{
"epoch": 1.601156069364162,
"grad_norm": 0.533996696099063,
"learning_rate": 4.856640674247363e-07,
"loss": 0.5403317213058472,
"step": 554,
"token_acc": 0.8186500168747891
},
{
"epoch": 1.6040462427745665,
"grad_norm": 0.6968861408661483,
"learning_rate": 4.840716981234181e-07,
"loss": 0.5232794880867004,
"step": 555,
"token_acc": 0.8258206662354464
},
{
"epoch": 1.606936416184971,
"grad_norm": 0.5457170981213912,
"learning_rate": 4.82479490523964e-07,
"loss": 0.5531569123268127,
"step": 556,
"token_acc": 0.8132714653155657
},
{
"epoch": 1.6098265895953756,
"grad_norm": 0.5611664995745906,
"learning_rate": 4.808874607902397e-07,
"loss": 0.580593466758728,
"step": 557,
"token_acc": 0.8061331347873197
},
{
"epoch": 1.6127167630057804,
"grad_norm": 0.49146546445526984,
"learning_rate": 4.792956250843055e-07,
"loss": 0.5263780355453491,
"step": 558,
"token_acc": 0.8212147967727204
},
{
"epoch": 1.6156069364161851,
"grad_norm": 0.5650532769234693,
"learning_rate": 4.777039995662522e-07,
"loss": 0.535209596157074,
"step": 559,
"token_acc": 0.8197443965795302
},
{
"epoch": 1.6184971098265897,
"grad_norm": 0.6028109251795714,
"learning_rate": 4.7611260039403655e-07,
"loss": 0.5842093825340271,
"step": 560,
"token_acc": 0.8079101659544867
},
{
"epoch": 1.6213872832369942,
"grad_norm": 0.59069578828569,
"learning_rate": 4.7452144372331715e-07,
"loss": 0.49987393617630005,
"step": 561,
"token_acc": 0.8312081956170992
},
{
"epoch": 1.6242774566473988,
"grad_norm": 0.5752034924536564,
"learning_rate": 4.7293054570729126e-07,
"loss": 0.5631648302078247,
"step": 562,
"token_acc": 0.8105449311754528
},
{
"epoch": 1.6271676300578035,
"grad_norm": 0.48011026987442956,
"learning_rate": 4.7133992249653026e-07,
"loss": 0.6020775437355042,
"step": 563,
"token_acc": 0.80207682093969
},
{
"epoch": 1.630057803468208,
"grad_norm": 0.6157896994330491,
"learning_rate": 4.697495902388154e-07,
"loss": 0.5418002009391785,
"step": 564,
"token_acc": 0.8178849600782141
},
{
"epoch": 1.6329479768786128,
"grad_norm": 0.5711847053504078,
"learning_rate": 4.681595650789746e-07,
"loss": 0.5428210496902466,
"step": 565,
"token_acc": 0.815186965701749
},
{
"epoch": 1.6358381502890174,
"grad_norm": 0.5202306815183112,
"learning_rate": 4.6656986315871815e-07,
"loss": 0.5333169102668762,
"step": 566,
"token_acc": 0.8192019018509085
},
{
"epoch": 1.638728323699422,
"grad_norm": 0.5862764371195341,
"learning_rate": 4.649805006164743e-07,
"loss": 0.5256876349449158,
"step": 567,
"token_acc": 0.8224795998947091
},
{
"epoch": 1.6416184971098264,
"grad_norm": 0.5972850501922398,
"learning_rate": 4.6339149358722675e-07,
"loss": 0.4838550388813019,
"step": 568,
"token_acc": 0.8348972296693477
},
{
"epoch": 1.6445086705202312,
"grad_norm": 0.5597928387418396,
"learning_rate": 4.618028582023495e-07,
"loss": 0.5284090042114258,
"step": 569,
"token_acc": 0.8216369128482156
},
{
"epoch": 1.647398843930636,
"grad_norm": 0.6008687154199086,
"learning_rate": 4.6021461058944415e-07,
"loss": 0.5147076845169067,
"step": 570,
"token_acc": 0.8275472384008092
},
{
"epoch": 1.6502890173410405,
"grad_norm": 0.6575913400532123,
"learning_rate": 4.5862676687217526e-07,
"loss": 0.5117477178573608,
"step": 571,
"token_acc": 0.8287706152259228
},
{
"epoch": 1.653179190751445,
"grad_norm": 0.5137586329958652,
"learning_rate": 4.5703934317010727e-07,
"loss": 0.5332241058349609,
"step": 572,
"token_acc": 0.8202151610509888
},
{
"epoch": 1.6560693641618496,
"grad_norm": 0.565500132263929,
"learning_rate": 4.5545235559854105e-07,
"loss": 0.5527046918869019,
"step": 573,
"token_acc": 0.8138320979141528
},
{
"epoch": 1.6589595375722543,
"grad_norm": 0.5302962565332909,
"learning_rate": 4.5386582026834904e-07,
"loss": 0.5092106461524963,
"step": 574,
"token_acc": 0.8281128993919504
},
{
"epoch": 1.661849710982659,
"grad_norm": 0.5821742123016643,
"learning_rate": 4.5227975328581335e-07,
"loss": 0.5064735412597656,
"step": 575,
"token_acc": 0.827575659879804
},
{
"epoch": 1.6647398843930636,
"grad_norm": 0.5963479290796274,
"learning_rate": 4.5069417075246146e-07,
"loss": 0.4928985834121704,
"step": 576,
"token_acc": 0.8335413266775463
},
{
"epoch": 1.6676300578034682,
"grad_norm": 0.6048528428075496,
"learning_rate": 4.491090887649024e-07,
"loss": 0.49480709433555603,
"step": 577,
"token_acc": 0.8347347057118005
},
{
"epoch": 1.6705202312138727,
"grad_norm": 0.6285946360216301,
"learning_rate": 4.475245234146639e-07,
"loss": 0.49079689383506775,
"step": 578,
"token_acc": 0.83443186255369
},
{
"epoch": 1.6734104046242775,
"grad_norm": 0.5603272652152215,
"learning_rate": 4.459404907880292e-07,
"loss": 0.5334948897361755,
"step": 579,
"token_acc": 0.8186869024041065
},
{
"epoch": 1.6763005780346822,
"grad_norm": 0.5366750310588114,
"learning_rate": 4.443570069658727e-07,
"loss": 0.5434994101524353,
"step": 580,
"token_acc": 0.816468327847366
},
{
"epoch": 1.6791907514450868,
"grad_norm": 0.5467060355475981,
"learning_rate": 4.42774088023498e-07,
"loss": 0.5757695436477661,
"step": 581,
"token_acc": 0.8080333034841515
},
{
"epoch": 1.6820809248554913,
"grad_norm": 0.6184966009398549,
"learning_rate": 4.4119175003047407e-07,
"loss": 0.5647035241127014,
"step": 582,
"token_acc": 0.8111076384093734
},
{
"epoch": 1.6849710982658959,
"grad_norm": 0.5185867079907565,
"learning_rate": 4.396100090504717e-07,
"loss": 0.5796575546264648,
"step": 583,
"token_acc": 0.8038202807075824
},
{
"epoch": 1.6878612716763006,
"grad_norm": 0.813643580955912,
"learning_rate": 4.380288811411015e-07,
"loss": 0.4743460416793823,
"step": 584,
"token_acc": 0.8386408207372227
},
{
"epoch": 1.6907514450867052,
"grad_norm": 0.5897820309260559,
"learning_rate": 4.364483823537498e-07,
"loss": 0.5133877992630005,
"step": 585,
"token_acc": 0.8280596690740123
},
{
"epoch": 1.69364161849711,
"grad_norm": 0.5045181308055782,
"learning_rate": 4.3486852873341675e-07,
"loss": 0.4322221279144287,
"step": 586,
"token_acc": 0.8542273580630543
},
{
"epoch": 1.6965317919075145,
"grad_norm": 0.5368324019397285,
"learning_rate": 4.3328933631855195e-07,
"loss": 0.5392330884933472,
"step": 587,
"token_acc": 0.8167310479753804
},
{
"epoch": 1.699421965317919,
"grad_norm": 0.6325523087901944,
"learning_rate": 4.317108211408933e-07,
"loss": 0.5353363752365112,
"step": 588,
"token_acc": 0.8181194354468216
},
{
"epoch": 1.7023121387283235,
"grad_norm": 0.5524128184191415,
"learning_rate": 4.301329992253034e-07,
"loss": 0.49616819620132446,
"step": 589,
"token_acc": 0.8328951746002753
},
{
"epoch": 1.7052023121387283,
"grad_norm": 0.5034001899067154,
"learning_rate": 4.285558865896065e-07,
"loss": 0.60711270570755,
"step": 590,
"token_acc": 0.79853336934882
},
{
"epoch": 1.708092485549133,
"grad_norm": 0.5374954529356852,
"learning_rate": 4.2697949924442667e-07,
"loss": 0.5293912291526794,
"step": 591,
"token_acc": 0.823666171683991
},
{
"epoch": 1.7109826589595376,
"grad_norm": 0.5635901606786159,
"learning_rate": 4.2540385319302524e-07,
"loss": 0.5353492498397827,
"step": 592,
"token_acc": 0.8201790482173709
},
{
"epoch": 1.7138728323699421,
"grad_norm": 0.5253802438717141,
"learning_rate": 4.2382896443113723e-07,
"loss": 0.5334903001785278,
"step": 593,
"token_acc": 0.818032814303156
},
{
"epoch": 1.7167630057803467,
"grad_norm": 0.4950360437778214,
"learning_rate": 4.222548489468105e-07,
"loss": 0.5341077446937561,
"step": 594,
"token_acc": 0.8223698601883738
},
{
"epoch": 1.7196531791907514,
"grad_norm": 0.5514023397940045,
"learning_rate": 4.2068152272024233e-07,
"loss": 0.5363609194755554,
"step": 595,
"token_acc": 0.8196168676738834
},
{
"epoch": 1.7225433526011562,
"grad_norm": 0.5623269464968738,
"learning_rate": 4.1910900172361763e-07,
"loss": 0.5504116415977478,
"step": 596,
"token_acc": 0.8151576025420944
},
{
"epoch": 1.7254335260115607,
"grad_norm": 0.5274551240137945,
"learning_rate": 4.175373019209468e-07,
"loss": 0.5549143552780151,
"step": 597,
"token_acc": 0.8107931600579981
},
{
"epoch": 1.7283236994219653,
"grad_norm": 0.5704477484512106,
"learning_rate": 4.159664392679038e-07,
"loss": 0.5494258403778076,
"step": 598,
"token_acc": 0.8168460618486246
},
{
"epoch": 1.7312138728323698,
"grad_norm": 0.6161778636830428,
"learning_rate": 4.143964297116633e-07,
"loss": 0.5577751994132996,
"step": 599,
"token_acc": 0.8121810843728358
},
{
"epoch": 1.7341040462427746,
"grad_norm": 0.6075742333688984,
"learning_rate": 4.1282728919074005e-07,
"loss": 0.5403814315795898,
"step": 600,
"token_acc": 0.821105101452986
},
{
"epoch": 1.7369942196531793,
"grad_norm": 0.6520533036933062,
"learning_rate": 4.1125903363482634e-07,
"loss": 0.47892940044403076,
"step": 601,
"token_acc": 0.8369930163846361
},
{
"epoch": 1.739884393063584,
"grad_norm": 0.5680876440782588,
"learning_rate": 4.0969167896463046e-07,
"loss": 0.5336910486221313,
"step": 602,
"token_acc": 0.8216713342322719
},
{
"epoch": 1.7427745664739884,
"grad_norm": 0.7080634828510891,
"learning_rate": 4.0812524109171475e-07,
"loss": 0.524694561958313,
"step": 603,
"token_acc": 0.8261413383364603
},
{
"epoch": 1.745664739884393,
"grad_norm": 0.528594204710658,
"learning_rate": 4.0655973591833475e-07,
"loss": 0.5086634755134583,
"step": 604,
"token_acc": 0.8286352131054758
},
{
"epoch": 1.7485549132947977,
"grad_norm": 0.6260551904964319,
"learning_rate": 4.0499517933727727e-07,
"loss": 0.48479533195495605,
"step": 605,
"token_acc": 0.8348625638530771
},
{
"epoch": 1.7514450867052023,
"grad_norm": 0.5425421161730628,
"learning_rate": 4.034315872316987e-07,
"loss": 0.5817371606826782,
"step": 606,
"token_acc": 0.8068743095851797
},
{
"epoch": 1.754335260115607,
"grad_norm": 0.5183265889747526,
"learning_rate": 4.018689754749648e-07,
"loss": 0.508335292339325,
"step": 607,
"token_acc": 0.8271757714886951
},
{
"epoch": 1.7572254335260116,
"grad_norm": 0.5542866259664111,
"learning_rate": 4.0030735993048884e-07,
"loss": 0.5586389899253845,
"step": 608,
"token_acc": 0.8166898202884842
},
{
"epoch": 1.760115606936416,
"grad_norm": 0.5411864859640132,
"learning_rate": 3.987467564515703e-07,
"loss": 0.4601624608039856,
"step": 609,
"token_acc": 0.84508010404543
},
{
"epoch": 1.7630057803468207,
"grad_norm": 0.524886018198833,
"learning_rate": 3.971871808812347e-07,
"loss": 0.6006595492362976,
"step": 610,
"token_acc": 0.8011782786885245
},
{
"epoch": 1.7658959537572254,
"grad_norm": 0.6317327126827325,
"learning_rate": 3.956286490520724e-07,
"loss": 0.509284496307373,
"step": 611,
"token_acc": 0.8325460029684483
},
{
"epoch": 1.7687861271676302,
"grad_norm": 0.5390581631300952,
"learning_rate": 3.9407117678607756e-07,
"loss": 0.4938768744468689,
"step": 612,
"token_acc": 0.8321855607688815
},
{
"epoch": 1.7716763005780347,
"grad_norm": 0.6560783619375582,
"learning_rate": 3.9251477989448795e-07,
"loss": 0.517693042755127,
"step": 613,
"token_acc": 0.8247808891627084
},
{
"epoch": 1.7745664739884393,
"grad_norm": 0.5602632255167417,
"learning_rate": 3.909594741776246e-07,
"loss": 0.5566587448120117,
"step": 614,
"token_acc": 0.812049268832398
},
{
"epoch": 1.7774566473988438,
"grad_norm": 0.5947561408697656,
"learning_rate": 3.8940527542473033e-07,
"loss": 0.5609596967697144,
"step": 615,
"token_acc": 0.8135071333264908
},
{
"epoch": 1.7803468208092486,
"grad_norm": 0.5666442289982523,
"learning_rate": 3.8785219941381096e-07,
"loss": 0.5130019187927246,
"step": 616,
"token_acc": 0.8260872845234054
},
{
"epoch": 1.7832369942196533,
"grad_norm": 0.5455613722107414,
"learning_rate": 3.8630026191147405e-07,
"loss": 0.5589362978935242,
"step": 617,
"token_acc": 0.812414640315063
},
{
"epoch": 1.7861271676300579,
"grad_norm": 0.550217294387885,
"learning_rate": 3.8474947867276943e-07,
"loss": 0.5442770719528198,
"step": 618,
"token_acc": 0.8159889681462442
},
{
"epoch": 1.7890173410404624,
"grad_norm": 0.6147473096977814,
"learning_rate": 3.8319986544102843e-07,
"loss": 0.5019974708557129,
"step": 619,
"token_acc": 0.8287660341354818
},
{
"epoch": 1.791907514450867,
"grad_norm": 0.5247209374319454,
"learning_rate": 3.8165143794770536e-07,
"loss": 0.5381553769111633,
"step": 620,
"token_acc": 0.8177024482109227
},
{
"epoch": 1.7947976878612717,
"grad_norm": 0.5828193451002669,
"learning_rate": 3.8010421191221684e-07,
"loss": 0.523591160774231,
"step": 621,
"token_acc": 0.8240329148286393
},
{
"epoch": 1.7976878612716765,
"grad_norm": 0.6015955817395803,
"learning_rate": 3.78558203041782e-07,
"loss": 0.539184033870697,
"step": 622,
"token_acc": 0.8198696606927818
},
{
"epoch": 1.800578034682081,
"grad_norm": 0.6008612726420935,
"learning_rate": 3.7701342703126394e-07,
"loss": 0.48327842354774475,
"step": 623,
"token_acc": 0.8381134839691216
},
{
"epoch": 1.8034682080924855,
"grad_norm": 0.6147376285603221,
"learning_rate": 3.754698995630101e-07,
"loss": 0.5317155122756958,
"step": 624,
"token_acc": 0.8217411222039681
},
{
"epoch": 1.80635838150289,
"grad_norm": 0.6052477258361706,
"learning_rate": 3.7392763630669243e-07,
"loss": 0.5276878476142883,
"step": 625,
"token_acc": 0.8253162139403252
},
{
"epoch": 1.8092485549132948,
"grad_norm": 0.6010435836572232,
"learning_rate": 3.7238665291914906e-07,
"loss": 0.5263775587081909,
"step": 626,
"token_acc": 0.8255283062505889
},
{
"epoch": 1.8121387283236994,
"grad_norm": 0.46459212133429395,
"learning_rate": 3.7084696504422525e-07,
"loss": 0.547301173210144,
"step": 627,
"token_acc": 0.8155224935354174
},
{
"epoch": 1.8150289017341041,
"grad_norm": 0.567681963556663,
"learning_rate": 3.693085883126137e-07,
"loss": 0.504138708114624,
"step": 628,
"token_acc": 0.8300083822296731
},
{
"epoch": 1.8179190751445087,
"grad_norm": 0.5584446222303159,
"learning_rate": 3.6777153834169726e-07,
"loss": 0.5485329031944275,
"step": 629,
"token_acc": 0.8132374537904492
},
{
"epoch": 1.8208092485549132,
"grad_norm": 0.5610791187838037,
"learning_rate": 3.6623583073538965e-07,
"loss": 0.5641239881515503,
"step": 630,
"token_acc": 0.8092657184953543
},
{
"epoch": 1.8236994219653178,
"grad_norm": 0.5571741993799751,
"learning_rate": 3.647014810839766e-07,
"loss": 0.5435695648193359,
"step": 631,
"token_acc": 0.8177736577401747
},
{
"epoch": 1.8265895953757225,
"grad_norm": 0.49451328689884416,
"learning_rate": 3.6316850496395855e-07,
"loss": 0.5079208612442017,
"step": 632,
"token_acc": 0.8277710403419788
},
{
"epoch": 1.8294797687861273,
"grad_norm": 0.731312278004029,
"learning_rate": 3.6163691793789183e-07,
"loss": 0.5612790584564209,
"step": 633,
"token_acc": 0.8145309625996321
},
{
"epoch": 1.8323699421965318,
"grad_norm": 0.5433070122384833,
"learning_rate": 3.6010673555423116e-07,
"loss": 0.5702831149101257,
"step": 634,
"token_acc": 0.8084171358992268
},
{
"epoch": 1.8352601156069364,
"grad_norm": 0.5731111882216399,
"learning_rate": 3.585779733471709e-07,
"loss": 0.5208647847175598,
"step": 635,
"token_acc": 0.8247836812568473
},
{
"epoch": 1.838150289017341,
"grad_norm": 0.5863236667781423,
"learning_rate": 3.5705064683648855e-07,
"loss": 0.5619288682937622,
"step": 636,
"token_acc": 0.8113308744654901
},
{
"epoch": 1.8410404624277457,
"grad_norm": 0.5914772914689451,
"learning_rate": 3.555247715273867e-07,
"loss": 0.49036872386932373,
"step": 637,
"token_acc": 0.8374078180826161
},
{
"epoch": 1.8439306358381504,
"grad_norm": 0.5295217861583622,
"learning_rate": 3.5400036291033485e-07,
"loss": 0.5192829966545105,
"step": 638,
"token_acc": 0.8258416465326863
},
{
"epoch": 1.846820809248555,
"grad_norm": 0.5366095434473555,
"learning_rate": 3.5247743646091367e-07,
"loss": 0.48854076862335205,
"step": 639,
"token_acc": 0.8355026160864565
},
{
"epoch": 1.8497109826589595,
"grad_norm": 0.552265227323895,
"learning_rate": 3.509560076396567e-07,
"loss": 0.5541850924491882,
"step": 640,
"token_acc": 0.8161763703067251
},
{
"epoch": 1.852601156069364,
"grad_norm": 0.5766930712255567,
"learning_rate": 3.4943609189189345e-07,
"loss": 0.49490103125572205,
"step": 641,
"token_acc": 0.8331491368709432
},
{
"epoch": 1.8554913294797688,
"grad_norm": 0.535142297976956,
"learning_rate": 3.4791770464759347e-07,
"loss": 0.4898555278778076,
"step": 642,
"token_acc": 0.8374039851247991
},
{
"epoch": 1.8583815028901736,
"grad_norm": 0.6183254820329128,
"learning_rate": 3.4640086132120906e-07,
"loss": 0.5269954800605774,
"step": 643,
"token_acc": 0.8234169800850853
},
{
"epoch": 1.861271676300578,
"grad_norm": 0.5689322137373185,
"learning_rate": 3.4488557731151845e-07,
"loss": 0.5776628851890564,
"step": 644,
"token_acc": 0.8088350364511105
},
{
"epoch": 1.8641618497109826,
"grad_norm": 0.6658391987358445,
"learning_rate": 3.433718680014705e-07,
"loss": 0.5674536228179932,
"step": 645,
"token_acc": 0.8111267784268523
},
{
"epoch": 1.8670520231213872,
"grad_norm": 0.5702895217250429,
"learning_rate": 3.418597487580277e-07,
"loss": 0.5942685008049011,
"step": 646,
"token_acc": 0.8022179198440608
},
{
"epoch": 1.869942196531792,
"grad_norm": 0.5309534408388851,
"learning_rate": 3.4034923493201007e-07,
"loss": 0.5299490690231323,
"step": 647,
"token_acc": 0.821584668833352
},
{
"epoch": 1.8728323699421965,
"grad_norm": 0.5410494679792496,
"learning_rate": 3.388403418579401e-07,
"loss": 0.606309175491333,
"step": 648,
"token_acc": 0.798714223159906
},
{
"epoch": 1.8757225433526012,
"grad_norm": 0.5885088182247251,
"learning_rate": 3.3733308485388654e-07,
"loss": 0.5152050256729126,
"step": 649,
"token_acc": 0.8267703435171321
},
{
"epoch": 1.8786127167630058,
"grad_norm": 0.5654387308838804,
"learning_rate": 3.3582747922130903e-07,
"loss": 0.5702789425849915,
"step": 650,
"token_acc": 0.8114149857200532
},
{
"epoch": 1.8815028901734103,
"grad_norm": 0.5850200396224108,
"learning_rate": 3.343235402449025e-07,
"loss": 0.5715373754501343,
"step": 651,
"token_acc": 0.809812202628705
},
{
"epoch": 1.8843930635838149,
"grad_norm": 0.556702805056612,
"learning_rate": 3.3282128319244237e-07,
"loss": 0.5341757535934448,
"step": 652,
"token_acc": 0.8190304033783219
},
{
"epoch": 1.8872832369942196,
"grad_norm": 0.5947101357097584,
"learning_rate": 3.313207233146296e-07,
"loss": 0.5120434165000916,
"step": 653,
"token_acc": 0.8284752116658459
},
{
"epoch": 1.8901734104046244,
"grad_norm": 0.582059481324802,
"learning_rate": 3.2982187584493516e-07,
"loss": 0.55910724401474,
"step": 654,
"token_acc": 0.8136601394849785
},
{
"epoch": 1.893063583815029,
"grad_norm": 0.5455003297751219,
"learning_rate": 3.283247559994463e-07,
"loss": 0.4808557629585266,
"step": 655,
"token_acc": 0.8359401998347231
},
{
"epoch": 1.8959537572254335,
"grad_norm": 0.5917330827702398,
"learning_rate": 3.268293789767118e-07,
"loss": 0.5275037288665771,
"step": 656,
"token_acc": 0.8203649654462709
},
{
"epoch": 1.898843930635838,
"grad_norm": 0.604537834207858,
"learning_rate": 3.2533575995758694e-07,
"loss": 0.536374568939209,
"step": 657,
"token_acc": 0.8204949969817257
},
{
"epoch": 1.9017341040462428,
"grad_norm": 0.4877298329861977,
"learning_rate": 3.2384391410508066e-07,
"loss": 0.5517327785491943,
"step": 658,
"token_acc": 0.8144875608045037
},
{
"epoch": 1.9046242774566475,
"grad_norm": 0.5138107466063505,
"learning_rate": 3.223538565642009e-07,
"loss": 0.5936318635940552,
"step": 659,
"token_acc": 0.8033954818487206
},
{
"epoch": 1.907514450867052,
"grad_norm": 0.6408117816293808,
"learning_rate": 3.2086560246180016e-07,
"loss": 0.5199168920516968,
"step": 660,
"token_acc": 0.823338105590611
},
{
"epoch": 1.9104046242774566,
"grad_norm": 0.6769271622378699,
"learning_rate": 3.1937916690642355e-07,
"loss": 0.5296117067337036,
"step": 661,
"token_acc": 0.8234518795819685
},
{
"epoch": 1.9132947976878611,
"grad_norm": 0.5205148500482691,
"learning_rate": 3.178945649881543e-07,
"loss": 0.4881097674369812,
"step": 662,
"token_acc": 0.8381457544657637
},
{
"epoch": 1.916184971098266,
"grad_norm": 0.533469943639252,
"learning_rate": 3.1641181177846046e-07,
"loss": 0.5646488666534424,
"step": 663,
"token_acc": 0.8092274601183008
},
{
"epoch": 1.9190751445086707,
"grad_norm": 0.5079029266136241,
"learning_rate": 3.1493092233004277e-07,
"loss": 0.565247654914856,
"step": 664,
"token_acc": 0.8091681448977687
},
{
"epoch": 1.9219653179190752,
"grad_norm": 0.5846146749149876,
"learning_rate": 3.1345191167668106e-07,
"loss": 0.46707916259765625,
"step": 665,
"token_acc": 0.8448507638926736
},
{
"epoch": 1.9248554913294798,
"grad_norm": 0.6115493897752081,
"learning_rate": 3.119747948330821e-07,
"loss": 0.49020782113075256,
"step": 666,
"token_acc": 0.8343801519151217
},
{
"epoch": 1.9277456647398843,
"grad_norm": 0.5665579491864339,
"learning_rate": 3.1049958679472645e-07,
"loss": 0.4773547649383545,
"step": 667,
"token_acc": 0.840464602970709
},
{
"epoch": 1.930635838150289,
"grad_norm": 0.5428950150023341,
"learning_rate": 3.0902630253771725e-07,
"loss": 0.5331814885139465,
"step": 668,
"token_acc": 0.8203493165709791
},
{
"epoch": 1.9335260115606936,
"grad_norm": 0.535673154611531,
"learning_rate": 3.0755495701862785e-07,
"loss": 0.5440840125083923,
"step": 669,
"token_acc": 0.8188541358240693
},
{
"epoch": 1.9364161849710984,
"grad_norm": 0.4836434667966126,
"learning_rate": 3.06085565174349e-07,
"loss": 0.5037864446640015,
"step": 670,
"token_acc": 0.8303648820337454
},
{
"epoch": 1.939306358381503,
"grad_norm": 0.6272828775317285,
"learning_rate": 3.046181419219386e-07,
"loss": 0.5913348197937012,
"step": 671,
"token_acc": 0.804053529366086
},
{
"epoch": 1.9421965317919074,
"grad_norm": 0.47821443556435045,
"learning_rate": 3.031527021584701e-07,
"loss": 0.5496195554733276,
"step": 672,
"token_acc": 0.8131932821607896
},
{
"epoch": 1.9450867052023122,
"grad_norm": 0.5368717641927174,
"learning_rate": 3.0168926076087986e-07,
"loss": 0.5248396396636963,
"step": 673,
"token_acc": 0.8238304421235078
},
{
"epoch": 1.9479768786127167,
"grad_norm": 0.5546004209488442,
"learning_rate": 3.002278325858177e-07,
"loss": 0.5503116846084595,
"step": 674,
"token_acc": 0.8154341018265293
},
{
"epoch": 1.9508670520231215,
"grad_norm": 0.5406553961850177,
"learning_rate": 2.987684324694957e-07,
"loss": 0.5093920230865479,
"step": 675,
"token_acc": 0.8285504848168147
},
{
"epoch": 1.953757225433526,
"grad_norm": 0.5070602927484339,
"learning_rate": 2.9731107522753654e-07,
"loss": 0.6153904795646667,
"step": 676,
"token_acc": 0.7934051997463538
},
{
"epoch": 1.9566473988439306,
"grad_norm": 0.6200327187024355,
"learning_rate": 2.9585577565482484e-07,
"loss": 0.49602842330932617,
"step": 677,
"token_acc": 0.8349261185482811
},
{
"epoch": 1.9595375722543351,
"grad_norm": 0.5432813085052021,
"learning_rate": 2.944025485253557e-07,
"loss": 0.5533842444419861,
"step": 678,
"token_acc": 0.8136697934557625
},
{
"epoch": 1.9624277456647399,
"grad_norm": 0.5655183170978749,
"learning_rate": 2.929514085920848e-07,
"loss": 0.5408231019973755,
"step": 679,
"token_acc": 0.8149668765846079
},
{
"epoch": 1.9653179190751446,
"grad_norm": 0.5348380476951098,
"learning_rate": 2.915023705867793e-07,
"loss": 0.5112613439559937,
"step": 680,
"token_acc": 0.8288466633304877
},
{
"epoch": 1.9682080924855492,
"grad_norm": 0.5587948082197168,
"learning_rate": 2.900554492198677e-07,
"loss": 0.5132273435592651,
"step": 681,
"token_acc": 0.8262983388869136
},
{
"epoch": 1.9710982658959537,
"grad_norm": 0.6468264753422917,
"learning_rate": 2.886106591802908e-07,
"loss": 0.49628451466560364,
"step": 682,
"token_acc": 0.8309623989848394
},
{
"epoch": 1.9739884393063583,
"grad_norm": 0.8088000703258003,
"learning_rate": 2.871680151353523e-07,
"loss": 0.566349983215332,
"step": 683,
"token_acc": 0.813486073930626
},
{
"epoch": 1.976878612716763,
"grad_norm": 0.5639785659667156,
"learning_rate": 2.8572753173057e-07,
"loss": 0.5700632333755493,
"step": 684,
"token_acc": 0.8086862859910506
},
{
"epoch": 1.9797687861271678,
"grad_norm": 0.5543121051930197,
"learning_rate": 2.842892235895279e-07,
"loss": 0.5271592140197754,
"step": 685,
"token_acc": 0.8250378942459045
},
{
"epoch": 1.9826589595375723,
"grad_norm": 0.5567574729556525,
"learning_rate": 2.828531053137257e-07,
"loss": 0.528691828250885,
"step": 686,
"token_acc": 0.8240472063720813
},
{
"epoch": 1.9855491329479769,
"grad_norm": 0.582442051806669,
"learning_rate": 2.814191914824332e-07,
"loss": 0.5287505388259888,
"step": 687,
"token_acc": 0.821006600414202
},
{
"epoch": 1.9884393063583814,
"grad_norm": 0.5452501250540314,
"learning_rate": 2.799874966525403e-07,
"loss": 0.5334792733192444,
"step": 688,
"token_acc": 0.8213241825401043
},
{
"epoch": 1.9913294797687862,
"grad_norm": 0.5482828728372189,
"learning_rate": 2.785580353584099e-07,
"loss": 0.5632658004760742,
"step": 689,
"token_acc": 0.8116547561426986
},
{
"epoch": 1.9942196531791907,
"grad_norm": 0.633529877080459,
"learning_rate": 2.771308221117309e-07,
"loss": 0.516349196434021,
"step": 690,
"token_acc": 0.8251189141964578
},
{
"epoch": 1.9971098265895955,
"grad_norm": 0.5330351124089759,
"learning_rate": 2.757058714013697e-07,
"loss": 0.5631735324859619,
"step": 691,
"token_acc": 0.8110226467289205
},
{
"epoch": 2.0,
"grad_norm": 0.5696997466472099,
"learning_rate": 2.7428319769322415e-07,
"loss": 0.5440479516983032,
"step": 692,
"token_acc": 0.8158318122461348
},
{
"epoch": 2.0028901734104045,
"grad_norm": 0.5585685445254689,
"learning_rate": 2.7286281543007597e-07,
"loss": 0.5391400456428528,
"step": 693,
"token_acc": 0.8175343274767459
},
{
"epoch": 2.005780346820809,
"grad_norm": 0.4706256621473158,
"learning_rate": 2.714447390314449e-07,
"loss": 0.5360602140426636,
"step": 694,
"token_acc": 0.8195729923051913
},
{
"epoch": 2.008670520231214,
"grad_norm": 0.4975918712102163,
"learning_rate": 2.700289828934416e-07,
"loss": 0.5223442316055298,
"step": 695,
"token_acc": 0.8266022386843656
},
{
"epoch": 2.0115606936416186,
"grad_norm": 0.6855664652178536,
"learning_rate": 2.686155613886215e-07,
"loss": 0.5413398146629333,
"step": 696,
"token_acc": 0.8206837181461728
},
{
"epoch": 2.014450867052023,
"grad_norm": 0.48324739879314504,
"learning_rate": 2.672044888658399e-07,
"loss": 0.5646222829818726,
"step": 697,
"token_acc": 0.8079876543209876
},
{
"epoch": 2.0173410404624277,
"grad_norm": 0.5416524165161476,
"learning_rate": 2.65795779650105e-07,
"loss": 0.5677503347396851,
"step": 698,
"token_acc": 0.8107366402887164
},
{
"epoch": 2.020231213872832,
"grad_norm": 0.5180032228711846,
"learning_rate": 2.64389448042433e-07,
"loss": 0.5446953773498535,
"step": 699,
"token_acc": 0.8148853386782998
},
{
"epoch": 2.023121387283237,
"grad_norm": 0.5242926098982621,
"learning_rate": 2.6298550831970307e-07,
"loss": 0.5251763463020325,
"step": 700,
"token_acc": 0.8224519443333264
},
{
"epoch": 2.0260115606936417,
"grad_norm": 0.52590432100961,
"learning_rate": 2.615839747345127e-07,
"loss": 0.5811551809310913,
"step": 701,
"token_acc": 0.8070368200019533
},
{
"epoch": 2.0289017341040463,
"grad_norm": 0.5346477392780163,
"learning_rate": 2.6018486151503213e-07,
"loss": 0.5263258218765259,
"step": 702,
"token_acc": 0.8226229312836096
},
{
"epoch": 2.031791907514451,
"grad_norm": 0.6702369614403866,
"learning_rate": 2.5878818286486026e-07,
"loss": 0.4835773706436157,
"step": 703,
"token_acc": 0.8352293317787196
},
{
"epoch": 2.0346820809248554,
"grad_norm": 0.5810005206971598,
"learning_rate": 2.573939529628816e-07,
"loss": 0.5316369533538818,
"step": 704,
"token_acc": 0.8213102951763859
},
{
"epoch": 2.03757225433526,
"grad_norm": 0.5814408850367526,
"learning_rate": 2.560021859631196e-07,
"loss": 0.531090259552002,
"step": 705,
"token_acc": 0.8247005161281525
},
{
"epoch": 2.040462427745665,
"grad_norm": 0.5620278975131617,
"learning_rate": 2.5461289599459646e-07,
"loss": 0.4695814847946167,
"step": 706,
"token_acc": 0.8385467145834584
},
{
"epoch": 2.0433526011560694,
"grad_norm": 0.5109837854766828,
"learning_rate": 2.532260971611867e-07,
"loss": 0.5594449043273926,
"step": 707,
"token_acc": 0.8109966953664819
},
{
"epoch": 2.046242774566474,
"grad_norm": 0.5657246379091214,
"learning_rate": 2.5184180354147554e-07,
"loss": 0.520602285861969,
"step": 708,
"token_acc": 0.8247487538513655
},
{
"epoch": 2.0491329479768785,
"grad_norm": 0.4918673470663886,
"learning_rate": 2.5046002918861606e-07,
"loss": 0.5579814910888672,
"step": 709,
"token_acc": 0.8135782994649099
},
{
"epoch": 2.052023121387283,
"grad_norm": 0.48477796977022586,
"learning_rate": 2.490807881301855e-07,
"loss": 0.5919597744941711,
"step": 710,
"token_acc": 0.8019583967529172
},
{
"epoch": 2.054913294797688,
"grad_norm": 0.6496075635378676,
"learning_rate": 2.477040943680436e-07,
"loss": 0.48429036140441895,
"step": 711,
"token_acc": 0.8355824403733149
},
{
"epoch": 2.0578034682080926,
"grad_norm": 0.5519540209458493,
"learning_rate": 2.4632996187819034e-07,
"loss": 0.506065309047699,
"step": 712,
"token_acc": 0.8278258846453057
},
{
"epoch": 2.060693641618497,
"grad_norm": 0.5287310217228682,
"learning_rate": 2.4495840461062433e-07,
"loss": 0.5793042778968811,
"step": 713,
"token_acc": 0.8061971483241775
},
{
"epoch": 2.0635838150289016,
"grad_norm": 0.5904419866749646,
"learning_rate": 2.435894364892005e-07,
"loss": 0.573466420173645,
"step": 714,
"token_acc": 0.8098105997674032
},
{
"epoch": 2.066473988439306,
"grad_norm": 0.6225416912989975,
"learning_rate": 2.4222307141148906e-07,
"loss": 0.48143109679222107,
"step": 715,
"token_acc": 0.836179983151357
},
{
"epoch": 2.069364161849711,
"grad_norm": 0.5109219477999456,
"learning_rate": 2.4085932324863507e-07,
"loss": 0.544453501701355,
"step": 716,
"token_acc": 0.8168550972356652
},
{
"epoch": 2.0722543352601157,
"grad_norm": 0.544868652560984,
"learning_rate": 2.394982058452165e-07,
"loss": 0.550638735294342,
"step": 717,
"token_acc": 0.813385770281816
},
{
"epoch": 2.0751445086705202,
"grad_norm": 0.5334855839219953,
"learning_rate": 2.3813973301910427e-07,
"loss": 0.484441876411438,
"step": 718,
"token_acc": 0.8346531540424537
},
{
"epoch": 2.078034682080925,
"grad_norm": 0.5494544655057828,
"learning_rate": 2.3678391856132202e-07,
"loss": 0.5680737495422363,
"step": 719,
"token_acc": 0.8124086743334372
},
{
"epoch": 2.0809248554913293,
"grad_norm": 0.6045748429466216,
"learning_rate": 2.3543077623590635e-07,
"loss": 0.5128438472747803,
"step": 720,
"token_acc": 0.8279022575462924
},
{
"epoch": 2.0838150289017343,
"grad_norm": 0.48256069429990633,
"learning_rate": 2.3408031977976623e-07,
"loss": 0.5861136317253113,
"step": 721,
"token_acc": 0.8029797322959706
},
{
"epoch": 2.086705202312139,
"grad_norm": 0.5653447327029175,
"learning_rate": 2.3273256290254402e-07,
"loss": 0.537794828414917,
"step": 722,
"token_acc": 0.8187106929644486
},
{
"epoch": 2.0895953757225434,
"grad_norm": 0.511608140122125,
"learning_rate": 2.3138751928647727e-07,
"loss": 0.5536022782325745,
"step": 723,
"token_acc": 0.8143630972354428
},
{
"epoch": 2.092485549132948,
"grad_norm": 0.6461334504435571,
"learning_rate": 2.3004520258625737e-07,
"loss": 0.547166645526886,
"step": 724,
"token_acc": 0.8144167909990558
},
{
"epoch": 2.0953757225433525,
"grad_norm": 0.5280363246093879,
"learning_rate": 2.2870562642889392e-07,
"loss": 0.5407837629318237,
"step": 725,
"token_acc": 0.81717697615801
},
{
"epoch": 2.098265895953757,
"grad_norm": 0.5895491785859862,
"learning_rate": 2.2736880441357398e-07,
"loss": 0.5352712273597717,
"step": 726,
"token_acc": 0.8206253892344479
},
{
"epoch": 2.101156069364162,
"grad_norm": 0.510490807616544,
"learning_rate": 2.2603475011152517e-07,
"loss": 0.5849488973617554,
"step": 727,
"token_acc": 0.8032212807794704
},
{
"epoch": 2.1040462427745665,
"grad_norm": 0.5074478903676131,
"learning_rate": 2.247034770658781e-07,
"loss": 0.5740774869918823,
"step": 728,
"token_acc": 0.8094154108581142
},
{
"epoch": 2.106936416184971,
"grad_norm": 0.49465264402350506,
"learning_rate": 2.2337499879152772e-07,
"loss": 0.5517815351486206,
"step": 729,
"token_acc": 0.8150811818935997
},
{
"epoch": 2.1098265895953756,
"grad_norm": 0.5409252325098711,
"learning_rate": 2.2204932877499778e-07,
"loss": 0.5680674314498901,
"step": 730,
"token_acc": 0.8076237225087722
},
{
"epoch": 2.11271676300578,
"grad_norm": 0.5667599272734437,
"learning_rate": 2.2072648047430182e-07,
"loss": 0.546800971031189,
"step": 731,
"token_acc": 0.8193202586524828
},
{
"epoch": 2.115606936416185,
"grad_norm": 0.5820288457006244,
"learning_rate": 2.1940646731880885e-07,
"loss": 0.5512528419494629,
"step": 732,
"token_acc": 0.8157494966528321
},
{
"epoch": 2.1184971098265897,
"grad_norm": 0.4949523232866875,
"learning_rate": 2.180893027091052e-07,
"loss": 0.5347863435745239,
"step": 733,
"token_acc": 0.8186724373395966
},
{
"epoch": 2.121387283236994,
"grad_norm": 0.5570654028702667,
"learning_rate": 2.1677500001685946e-07,
"loss": 0.5904409289360046,
"step": 734,
"token_acc": 0.80330335262698
},
{
"epoch": 2.1242774566473988,
"grad_norm": 0.5169029043729536,
"learning_rate": 2.154635725846861e-07,
"loss": 0.516341507434845,
"step": 735,
"token_acc": 0.8256773697978942
},
{
"epoch": 2.1271676300578033,
"grad_norm": 0.5202271523957221,
"learning_rate": 2.1415503372601096e-07,
"loss": 0.5516679286956787,
"step": 736,
"token_acc": 0.8166926940731877
},
{
"epoch": 2.1300578034682083,
"grad_norm": 0.5270674995884185,
"learning_rate": 2.1284939672493506e-07,
"loss": 0.5113083124160767,
"step": 737,
"token_acc": 0.8254448999891605
},
{
"epoch": 2.132947976878613,
"grad_norm": 0.5738812261029933,
"learning_rate": 2.1154667483609994e-07,
"loss": 0.5508044958114624,
"step": 738,
"token_acc": 0.8145577840874766
},
{
"epoch": 2.1358381502890174,
"grad_norm": 0.5552867531342636,
"learning_rate": 2.1024688128455432e-07,
"loss": 0.5606477856636047,
"step": 739,
"token_acc": 0.8107334996977912
},
{
"epoch": 2.138728323699422,
"grad_norm": 0.6511169378075016,
"learning_rate": 2.0895002926561733e-07,
"loss": 0.5715325474739075,
"step": 740,
"token_acc": 0.808644395970687
},
{
"epoch": 2.1416184971098264,
"grad_norm": 0.5104195470816412,
"learning_rate": 2.0765613194474756e-07,
"loss": 0.5317230224609375,
"step": 741,
"token_acc": 0.8196870394179812
},
{
"epoch": 2.1445086705202314,
"grad_norm": 0.5222197914536979,
"learning_rate": 2.0636520245740708e-07,
"loss": 0.581384003162384,
"step": 742,
"token_acc": 0.8044084027512044
},
{
"epoch": 2.147398843930636,
"grad_norm": 0.5216435736648604,
"learning_rate": 2.0507725390892895e-07,
"loss": 0.5070130825042725,
"step": 743,
"token_acc": 0.8285304030472848
},
{
"epoch": 2.1502890173410405,
"grad_norm": 0.5689993002879171,
"learning_rate": 2.0379229937438475e-07,
"loss": 0.5079813599586487,
"step": 744,
"token_acc": 0.8282544832726795
},
{
"epoch": 2.153179190751445,
"grad_norm": 0.5478897581085619,
"learning_rate": 2.0251035189845045e-07,
"loss": 0.5614432692527771,
"step": 745,
"token_acc": 0.8101714880561034
},
{
"epoch": 2.1560693641618496,
"grad_norm": 0.5625549603262265,
"learning_rate": 2.012314244952758e-07,
"loss": 0.46915191411972046,
"step": 746,
"token_acc": 0.8398674842185119
},
{
"epoch": 2.1589595375722546,
"grad_norm": 0.5888007906160326,
"learning_rate": 1.9995553014834986e-07,
"loss": 0.5621305704116821,
"step": 747,
"token_acc": 0.8091583390025296
},
{
"epoch": 2.161849710982659,
"grad_norm": 0.5611702979006163,
"learning_rate": 1.9868268181037184e-07,
"loss": 0.5150927901268005,
"step": 748,
"token_acc": 0.8226671153861205
},
{
"epoch": 2.1647398843930636,
"grad_norm": 0.5111806577194473,
"learning_rate": 1.9741289240311754e-07,
"loss": 0.5273150205612183,
"step": 749,
"token_acc": 0.822871650821089
},
{
"epoch": 2.167630057803468,
"grad_norm": 0.5196873584862519,
"learning_rate": 1.9614617481730882e-07,
"loss": 0.5140695571899414,
"step": 750,
"token_acc": 0.8273383116061258
},
{
"epoch": 2.1705202312138727,
"grad_norm": 0.5735974858092083,
"learning_rate": 1.948825419124837e-07,
"loss": 0.5572013854980469,
"step": 751,
"token_acc": 0.8135551173589466
},
{
"epoch": 2.1734104046242773,
"grad_norm": 0.5173068836847717,
"learning_rate": 1.9362200651686406e-07,
"loss": 0.4991053640842438,
"step": 752,
"token_acc": 0.8299385295624275
},
{
"epoch": 2.1763005780346822,
"grad_norm": 0.5835529062955169,
"learning_rate": 1.9236458142722672e-07,
"loss": 0.4967957139015198,
"step": 753,
"token_acc": 0.8307953955965303
},
{
"epoch": 2.179190751445087,
"grad_norm": 0.5877111733686488,
"learning_rate": 1.9111027940877283e-07,
"loss": 0.5488715767860413,
"step": 754,
"token_acc": 0.8119714508486775
},
{
"epoch": 2.1820809248554913,
"grad_norm": 0.5937906866706819,
"learning_rate": 1.898591131949992e-07,
"loss": 0.5290513038635254,
"step": 755,
"token_acc": 0.8182620202911337
},
{
"epoch": 2.184971098265896,
"grad_norm": 0.5973610860546952,
"learning_rate": 1.8861109548756764e-07,
"loss": 0.5482075810432434,
"step": 756,
"token_acc": 0.8168008865903214
},
{
"epoch": 2.1878612716763004,
"grad_norm": 0.6092890006866195,
"learning_rate": 1.873662389561771e-07,
"loss": 0.5488214492797852,
"step": 757,
"token_acc": 0.8205397467749234
},
{
"epoch": 2.1907514450867054,
"grad_norm": 0.5100060557982842,
"learning_rate": 1.861245562384351e-07,
"loss": 0.5582944750785828,
"step": 758,
"token_acc": 0.8142653999590552
},
{
"epoch": 2.19364161849711,
"grad_norm": 0.5534172002173429,
"learning_rate": 1.8488605993972806e-07,
"loss": 0.5284197926521301,
"step": 759,
"token_acc": 0.8226439546852772
},
{
"epoch": 2.1965317919075145,
"grad_norm": 0.5676418034969823,
"learning_rate": 1.8365076263309542e-07,
"loss": 0.5176257491111755,
"step": 760,
"token_acc": 0.8240463351308168
},
{
"epoch": 2.199421965317919,
"grad_norm": 0.5273849733875124,
"learning_rate": 1.8241867685910007e-07,
"loss": 0.5415469408035278,
"step": 761,
"token_acc": 0.8159108203203757
},
{
"epoch": 2.2023121387283235,
"grad_norm": 0.5675178250606417,
"learning_rate": 1.8118981512570254e-07,
"loss": 0.495791494846344,
"step": 762,
"token_acc": 0.833165862256412
},
{
"epoch": 2.2052023121387285,
"grad_norm": 0.5356879254901209,
"learning_rate": 1.7996418990813293e-07,
"loss": 0.5700979828834534,
"step": 763,
"token_acc": 0.8082553122201417
},
{
"epoch": 2.208092485549133,
"grad_norm": 0.5440506283017456,
"learning_rate": 1.7874181364876462e-07,
"loss": 0.5215957164764404,
"step": 764,
"token_acc": 0.8242129054849903
},
{
"epoch": 2.2109826589595376,
"grad_norm": 0.48724727796349754,
"learning_rate": 1.7752269875698872e-07,
"loss": 0.48275503516197205,
"step": 765,
"token_acc": 0.8372185670308444
},
{
"epoch": 2.213872832369942,
"grad_norm": 0.6530933074612743,
"learning_rate": 1.763068576090862e-07,
"loss": 0.5122123956680298,
"step": 766,
"token_acc": 0.8289117165401221
},
{
"epoch": 2.2167630057803467,
"grad_norm": 0.5132130783753541,
"learning_rate": 1.750943025481046e-07,
"loss": 0.5450626611709595,
"step": 767,
"token_acc": 0.8163703808809519
},
{
"epoch": 2.2196531791907512,
"grad_norm": 0.5763340107528144,
"learning_rate": 1.73885045883731e-07,
"loss": 0.5134228467941284,
"step": 768,
"token_acc": 0.8268736586467864
},
{
"epoch": 2.222543352601156,
"grad_norm": 0.5678033281126066,
"learning_rate": 1.726790998921675e-07,
"loss": 0.5369815826416016,
"step": 769,
"token_acc": 0.8197942785502621
},
{
"epoch": 2.2254335260115607,
"grad_norm": 0.5494081888054269,
"learning_rate": 1.7147647681600735e-07,
"loss": 0.583419144153595,
"step": 770,
"token_acc": 0.8045412637492227
},
{
"epoch": 2.2283236994219653,
"grad_norm": 0.5002570926978792,
"learning_rate": 1.7027718886410948e-07,
"loss": 0.5762687921524048,
"step": 771,
"token_acc": 0.8050788141720897
},
{
"epoch": 2.23121387283237,
"grad_norm": 0.5621625282852232,
"learning_rate": 1.6908124821147517e-07,
"loss": 0.5734193325042725,
"step": 772,
"token_acc": 0.8072726721307747
},
{
"epoch": 2.2341040462427744,
"grad_norm": 0.5805542620358577,
"learning_rate": 1.6788866699912434e-07,
"loss": 0.5245779156684875,
"step": 773,
"token_acc": 0.8224566435530849
},
{
"epoch": 2.2369942196531793,
"grad_norm": 0.5784351770858037,
"learning_rate": 1.6669945733397288e-07,
"loss": 0.5163431763648987,
"step": 774,
"token_acc": 0.8234030645429656
},
{
"epoch": 2.239884393063584,
"grad_norm": 0.5443607425066719,
"learning_rate": 1.6551363128870866e-07,
"loss": 0.48509231209754944,
"step": 775,
"token_acc": 0.8364400070660744
},
{
"epoch": 2.2427745664739884,
"grad_norm": 0.5838705468342498,
"learning_rate": 1.643312009016694e-07,
"loss": 0.5485388040542603,
"step": 776,
"token_acc": 0.814316289454411
},
{
"epoch": 2.245664739884393,
"grad_norm": 0.5113123373755981,
"learning_rate": 1.631521781767214e-07,
"loss": 0.5461674928665161,
"step": 777,
"token_acc": 0.8178670064564116
},
{
"epoch": 2.2485549132947975,
"grad_norm": 0.5316036267961789,
"learning_rate": 1.6197657508313595e-07,
"loss": 0.5362288951873779,
"step": 778,
"token_acc": 0.8175199117906136
},
{
"epoch": 2.2514450867052025,
"grad_norm": 0.6922569927006882,
"learning_rate": 1.608044035554692e-07,
"loss": 0.5441286563873291,
"step": 779,
"token_acc": 0.8158920316612874
},
{
"epoch": 2.254335260115607,
"grad_norm": 0.6638081905493092,
"learning_rate": 1.5963567549344026e-07,
"loss": 0.5481600761413574,
"step": 780,
"token_acc": 0.8147708894878706
},
{
"epoch": 2.2572254335260116,
"grad_norm": 0.5594541395187226,
"learning_rate": 1.5847040276181113e-07,
"loss": 0.5381879210472107,
"step": 781,
"token_acc": 0.8191574437700821
},
{
"epoch": 2.260115606936416,
"grad_norm": 0.6007103186375023,
"learning_rate": 1.5730859719026535e-07,
"loss": 0.537074863910675,
"step": 782,
"token_acc": 0.8190765218606167
},
{
"epoch": 2.2630057803468207,
"grad_norm": 0.5565956593496582,
"learning_rate": 1.561502705732883e-07,
"loss": 0.4965110719203949,
"step": 783,
"token_acc": 0.8309357060849598
},
{
"epoch": 2.2658959537572256,
"grad_norm": 0.5642893968640419,
"learning_rate": 1.5499543467004812e-07,
"loss": 0.5519629120826721,
"step": 784,
"token_acc": 0.8145803817619548
},
{
"epoch": 2.26878612716763,
"grad_norm": 0.6562655659982366,
"learning_rate": 1.538441012042747e-07,
"loss": 0.5342061519622803,
"step": 785,
"token_acc": 0.8214097726480007
},
{
"epoch": 2.2716763005780347,
"grad_norm": 0.5502255728162866,
"learning_rate": 1.526962818641428e-07,
"loss": 0.5008838176727295,
"step": 786,
"token_acc": 0.8290141252177352
},
{
"epoch": 2.2745664739884393,
"grad_norm": 0.5549954985905744,
"learning_rate": 1.5155198830215144e-07,
"loss": 0.4954628348350525,
"step": 787,
"token_acc": 0.8334000233928208
},
{
"epoch": 2.277456647398844,
"grad_norm": 0.6131059587737819,
"learning_rate": 1.5041123213500673e-07,
"loss": 0.5419051647186279,
"step": 788,
"token_acc": 0.8164740751406938
},
{
"epoch": 2.2803468208092488,
"grad_norm": 0.6247230822104177,
"learning_rate": 1.4927402494350383e-07,
"loss": 0.5040674805641174,
"step": 789,
"token_acc": 0.8298278970337606
},
{
"epoch": 2.2832369942196533,
"grad_norm": 0.5169557886712214,
"learning_rate": 1.4814037827240894e-07,
"loss": 0.4267565608024597,
"step": 790,
"token_acc": 0.85461239288595
},
{
"epoch": 2.286127167630058,
"grad_norm": 0.5453091300597913,
"learning_rate": 1.4701030363034244e-07,
"loss": 0.5594276189804077,
"step": 791,
"token_acc": 0.8131839426158908
},
{
"epoch": 2.2890173410404624,
"grad_norm": 0.5304410532256004,
"learning_rate": 1.4588381248966185e-07,
"loss": 0.5278592109680176,
"step": 792,
"token_acc": 0.8218627568498552
},
{
"epoch": 2.291907514450867,
"grad_norm": 0.6120665191114517,
"learning_rate": 1.4476091628634597e-07,
"loss": 0.575430691242218,
"step": 793,
"token_acc": 0.807088911218437
},
{
"epoch": 2.294797687861272,
"grad_norm": 0.5799839527530729,
"learning_rate": 1.4364162641987776e-07,
"loss": 0.5156550407409668,
"step": 794,
"token_acc": 0.8260783412329787
},
{
"epoch": 2.2976878612716765,
"grad_norm": 0.5602063299660717,
"learning_rate": 1.425259542531293e-07,
"loss": 0.5343849658966064,
"step": 795,
"token_acc": 0.8199821131979047
},
{
"epoch": 2.300578034682081,
"grad_norm": 0.4887450635971321,
"learning_rate": 1.414139111122463e-07,
"loss": 0.5308408141136169,
"step": 796,
"token_acc": 0.8229694371764182
},
{
"epoch": 2.3034682080924855,
"grad_norm": 0.4993867501606219,
"learning_rate": 1.4030550828653354e-07,
"loss": 0.5518777966499329,
"step": 797,
"token_acc": 0.8136998348383776
},
{
"epoch": 2.30635838150289,
"grad_norm": 0.5067023143157817,
"learning_rate": 1.3920075702833918e-07,
"loss": 0.5633761882781982,
"step": 798,
"token_acc": 0.8110373410357782
},
{
"epoch": 2.3092485549132946,
"grad_norm": 0.49845534995334795,
"learning_rate": 1.380996685529413e-07,
"loss": 0.5841176509857178,
"step": 799,
"token_acc": 0.8055892737380623
},
{
"epoch": 2.3121387283236996,
"grad_norm": 0.5671598446889555,
"learning_rate": 1.370022540384347e-07,
"loss": 0.5178837180137634,
"step": 800,
"token_acc": 0.8236206769170149
},
{
"epoch": 2.315028901734104,
"grad_norm": 0.4945445707298972,
"learning_rate": 1.3590852462561536e-07,
"loss": 0.5855327844619751,
"step": 801,
"token_acc": 0.8038555657047487
},
{
"epoch": 2.3179190751445087,
"grad_norm": 0.5806465370535545,
"learning_rate": 1.3481849141786977e-07,
"loss": 0.5570707321166992,
"step": 802,
"token_acc": 0.8127311126755344
},
{
"epoch": 2.320809248554913,
"grad_norm": 0.6159090128169195,
"learning_rate": 1.337321654810605e-07,
"loss": 0.510475754737854,
"step": 803,
"token_acc": 0.8252182347235694
},
{
"epoch": 2.3236994219653178,
"grad_norm": 0.5376860591208902,
"learning_rate": 1.3264955784341436e-07,
"loss": 0.5326089859008789,
"step": 804,
"token_acc": 0.8201670917441944
},
{
"epoch": 2.3265895953757223,
"grad_norm": 0.673299584166168,
"learning_rate": 1.3157067949541108e-07,
"loss": 0.58345627784729,
"step": 805,
"token_acc": 0.8029432260094861
},
{
"epoch": 2.3294797687861273,
"grad_norm": 0.5206280305901979,
"learning_rate": 1.304955413896705e-07,
"loss": 0.574557900428772,
"step": 806,
"token_acc": 0.8069745418082558
},
{
"epoch": 2.332369942196532,
"grad_norm": 0.5136292360134201,
"learning_rate": 1.294241544408425e-07,
"loss": 0.5320082902908325,
"step": 807,
"token_acc": 0.8200797060551261
},
{
"epoch": 2.3352601156069364,
"grad_norm": 0.6862994942563941,
"learning_rate": 1.2835652952549535e-07,
"loss": 0.506873607635498,
"step": 808,
"token_acc": 0.8275425473721735
},
{
"epoch": 2.338150289017341,
"grad_norm": 0.512551355029386,
"learning_rate": 1.272926774820063e-07,
"loss": 0.5066085457801819,
"step": 809,
"token_acc": 0.8297983521714544
},
{
"epoch": 2.3410404624277454,
"grad_norm": 0.5604007523428769,
"learning_rate": 1.2623260911045032e-07,
"loss": 0.5025891065597534,
"step": 810,
"token_acc": 0.829209325638134
},
{
"epoch": 2.3439306358381504,
"grad_norm": 0.5268748443036352,
"learning_rate": 1.251763351724912e-07,
"loss": 0.4720842242240906,
"step": 811,
"token_acc": 0.8390679336697509
},
{
"epoch": 2.346820809248555,
"grad_norm": 0.5272184591480457,
"learning_rate": 1.241238663912727e-07,
"loss": 0.5422724485397339,
"step": 812,
"token_acc": 0.8181165262000732
},
{
"epoch": 2.3497109826589595,
"grad_norm": 0.6478156561205365,
"learning_rate": 1.2307521345130856e-07,
"loss": 0.4997095465660095,
"step": 813,
"token_acc": 0.83579220127889
},
{
"epoch": 2.352601156069364,
"grad_norm": 0.5596818812581189,
"learning_rate": 1.2203038699837482e-07,
"loss": 0.5354875326156616,
"step": 814,
"token_acc": 0.8179522864334984
},
{
"epoch": 2.3554913294797686,
"grad_norm": 0.5092123540436737,
"learning_rate": 1.2098939763940146e-07,
"loss": 0.5460278987884521,
"step": 815,
"token_acc": 0.8163918561804444
},
{
"epoch": 2.3583815028901736,
"grad_norm": 0.5800331579268285,
"learning_rate": 1.1995225594236535e-07,
"loss": 0.5022585988044739,
"step": 816,
"token_acc": 0.8274375641464249
},
{
"epoch": 2.361271676300578,
"grad_norm": 0.5756167659083334,
"learning_rate": 1.1891897243618183e-07,
"loss": 0.5118639469146729,
"step": 817,
"token_acc": 0.8277416762854647
},
{
"epoch": 2.3641618497109826,
"grad_norm": 0.7044868964257237,
"learning_rate": 1.1788955761059848e-07,
"loss": 0.5586499571800232,
"step": 818,
"token_acc": 0.8113651781794964
},
{
"epoch": 2.367052023121387,
"grad_norm": 0.5795349651059425,
"learning_rate": 1.168640219160893e-07,
"loss": 0.46478456258773804,
"step": 819,
"token_acc": 0.8425433103736172
},
{
"epoch": 2.3699421965317917,
"grad_norm": 0.5417472517233258,
"learning_rate": 1.1584237576374672e-07,
"loss": 0.5370988845825195,
"step": 820,
"token_acc": 0.8190044958253051
},
{
"epoch": 2.3728323699421967,
"grad_norm": 0.5406033227296971,
"learning_rate": 1.1482462952517819e-07,
"loss": 0.5212105512619019,
"step": 821,
"token_acc": 0.8224046418092507
},
{
"epoch": 2.3757225433526012,
"grad_norm": 0.6158759615805948,
"learning_rate": 1.1381079353239915e-07,
"loss": 0.5457302331924438,
"step": 822,
"token_acc": 0.8143862498308296
},
{
"epoch": 2.378612716763006,
"grad_norm": 0.5823036775149597,
"learning_rate": 1.1280087807772881e-07,
"loss": 0.5847820043563843,
"step": 823,
"token_acc": 0.8055109662743706
},
{
"epoch": 2.3815028901734103,
"grad_norm": 0.5934874612721635,
"learning_rate": 1.1179489341368614e-07,
"loss": 0.527098536491394,
"step": 824,
"token_acc": 0.8198975500818406
},
{
"epoch": 2.384393063583815,
"grad_norm": 0.48776844524252105,
"learning_rate": 1.1079284975288456e-07,
"loss": 0.5120328068733215,
"step": 825,
"token_acc": 0.8243783599233836
},
{
"epoch": 2.38728323699422,
"grad_norm": 0.6146965565569307,
"learning_rate": 1.097947572679298e-07,
"loss": 0.5407025814056396,
"step": 826,
"token_acc": 0.8166508538899431
},
{
"epoch": 2.3901734104046244,
"grad_norm": 0.5334859468151563,
"learning_rate": 1.0880062609131485e-07,
"loss": 0.5002784729003906,
"step": 827,
"token_acc": 0.8304765759384802
},
{
"epoch": 2.393063583815029,
"grad_norm": 0.5390442828664261,
"learning_rate": 1.0781046631531887e-07,
"loss": 0.539802074432373,
"step": 828,
"token_acc": 0.8201954263661371
},
{
"epoch": 2.3959537572254335,
"grad_norm": 0.5913404588285502,
"learning_rate": 1.0682428799190357e-07,
"loss": 0.5389546155929565,
"step": 829,
"token_acc": 0.8186631949877636
},
{
"epoch": 2.398843930635838,
"grad_norm": 0.5442985144352179,
"learning_rate": 1.0584210113261138e-07,
"loss": 0.5016453862190247,
"step": 830,
"token_acc": 0.8323601673886272
},
{
"epoch": 2.401734104046243,
"grad_norm": 0.5335838263183578,
"learning_rate": 1.0486391570846447e-07,
"loss": 0.5271462202072144,
"step": 831,
"token_acc": 0.8242358536755963
},
{
"epoch": 2.4046242774566475,
"grad_norm": 0.49716550117440406,
"learning_rate": 1.0388974164986247e-07,
"loss": 0.55882728099823,
"step": 832,
"token_acc": 0.8099962892130277
},
{
"epoch": 2.407514450867052,
"grad_norm": 0.47857456778328644,
"learning_rate": 1.0291958884648244e-07,
"loss": 0.49896830320358276,
"step": 833,
"token_acc": 0.8291924229963124
},
{
"epoch": 2.4104046242774566,
"grad_norm": 0.5097765363216997,
"learning_rate": 1.0195346714717812e-07,
"loss": 0.5477476716041565,
"step": 834,
"token_acc": 0.8156213758444858
},
{
"epoch": 2.413294797687861,
"grad_norm": 0.5235000424585246,
"learning_rate": 1.0099138635988024e-07,
"loss": 0.5449202060699463,
"step": 835,
"token_acc": 0.8174131547081592
},
{
"epoch": 2.416184971098266,
"grad_norm": 0.5918110484158251,
"learning_rate": 1.0003335625149667e-07,
"loss": 0.47566699981689453,
"step": 836,
"token_acc": 0.8377055807323248
},
{
"epoch": 2.4190751445086707,
"grad_norm": 0.5851719068244339,
"learning_rate": 9.907938654781306e-08,
"loss": 0.5465905666351318,
"step": 837,
"token_acc": 0.8147972978299083
},
{
"epoch": 2.421965317919075,
"grad_norm": 0.5682204824677508,
"learning_rate": 9.812948693339518e-08,
"loss": 0.5738434791564941,
"step": 838,
"token_acc": 0.8094719444296344
},
{
"epoch": 2.4248554913294798,
"grad_norm": 0.49007877801128724,
"learning_rate": 9.718366705148878e-08,
"loss": 0.5543205738067627,
"step": 839,
"token_acc": 0.8132528289037656
},
{
"epoch": 2.4277456647398843,
"grad_norm": 0.5842704513292558,
"learning_rate": 9.62419365039237e-08,
"loss": 0.5389681458473206,
"step": 840,
"token_acc": 0.8200700065948241
},
{
"epoch": 2.430635838150289,
"grad_norm": 0.5770762126755756,
"learning_rate": 9.530430485101477e-08,
"loss": 0.5231157541275024,
"step": 841,
"token_acc": 0.8205874308194584
},
{
"epoch": 2.433526011560694,
"grad_norm": 0.7677432650260306,
"learning_rate": 9.437078161146589e-08,
"loss": 0.48806625604629517,
"step": 842,
"token_acc": 0.8331080698798665
},
{
"epoch": 2.4364161849710984,
"grad_norm": 0.644925234497109,
"learning_rate": 9.344137626227266e-08,
"loss": 0.5736875534057617,
"step": 843,
"token_acc": 0.8089128548407091
},
{
"epoch": 2.439306358381503,
"grad_norm": 0.7396158526047033,
"learning_rate": 9.251609823862638e-08,
"loss": 0.4797173738479614,
"step": 844,
"token_acc": 0.8373787499437789
},
{
"epoch": 2.4421965317919074,
"grad_norm": 0.5468960652000051,
"learning_rate": 9.15949569338188e-08,
"loss": 0.5192615985870361,
"step": 845,
"token_acc": 0.8244522788344224
},
{
"epoch": 2.445086705202312,
"grad_norm": 0.5315006428054552,
"learning_rate": 9.067796169914549e-08,
"loss": 0.5097811222076416,
"step": 846,
"token_acc": 0.827042571766035
},
{
"epoch": 2.447976878612717,
"grad_norm": 0.7439553982785114,
"learning_rate": 8.976512184381246e-08,
"loss": 0.49079883098602295,
"step": 847,
"token_acc": 0.8330292060799148
},
{
"epoch": 2.4508670520231215,
"grad_norm": 0.6047154396535889,
"learning_rate": 8.885644663484049e-08,
"loss": 0.5638853311538696,
"step": 848,
"token_acc": 0.8139317111350264
},
{
"epoch": 2.453757225433526,
"grad_norm": 0.5113685852977929,
"learning_rate": 8.795194529697148e-08,
"loss": 0.5080073475837708,
"step": 849,
"token_acc": 0.8294516082294987
},
{
"epoch": 2.4566473988439306,
"grad_norm": 0.5784270460360631,
"learning_rate": 8.705162701257501e-08,
"loss": 0.4831171929836273,
"step": 850,
"token_acc": 0.8367839034908794
},
{
"epoch": 2.459537572254335,
"grad_norm": 0.8859232576451248,
"learning_rate": 8.615550092155477e-08,
"loss": 0.49585288763046265,
"step": 851,
"token_acc": 0.8318051901511245
},
{
"epoch": 2.4624277456647397,
"grad_norm": 0.5397198676813016,
"learning_rate": 8.526357612125573e-08,
"loss": 0.5402971506118774,
"step": 852,
"token_acc": 0.8140772038815954
},
{
"epoch": 2.4653179190751446,
"grad_norm": 0.5962698285712602,
"learning_rate": 8.437586166637206e-08,
"loss": 0.4982019066810608,
"step": 853,
"token_acc": 0.8291487495756479
},
{
"epoch": 2.468208092485549,
"grad_norm": 0.639088875669763,
"learning_rate": 8.349236656885544e-08,
"loss": 0.5227348804473877,
"step": 854,
"token_acc": 0.8234732997252996
},
{
"epoch": 2.4710982658959537,
"grad_norm": 0.5125821343592164,
"learning_rate": 8.261309979782255e-08,
"loss": 0.5540283918380737,
"step": 855,
"token_acc": 0.8137015888618007
},
{
"epoch": 2.4739884393063583,
"grad_norm": 0.6336792834178986,
"learning_rate": 8.173807027946528e-08,
"loss": 0.5213714838027954,
"step": 856,
"token_acc": 0.8260184658469347
},
{
"epoch": 2.476878612716763,
"grad_norm": 0.741297514751174,
"learning_rate": 8.086728689695921e-08,
"loss": 0.4948037564754486,
"step": 857,
"token_acc": 0.8296993252484727
},
{
"epoch": 2.479768786127168,
"grad_norm": 0.5470631077862728,
"learning_rate": 8.000075849037408e-08,
"loss": 0.5469754934310913,
"step": 858,
"token_acc": 0.8164498833341608
},
{
"epoch": 2.4826589595375723,
"grad_norm": 0.4864695217391108,
"learning_rate": 7.913849385658333e-08,
"loss": 0.5522366762161255,
"step": 859,
"token_acc": 0.8114838802706048
},
{
"epoch": 2.485549132947977,
"grad_norm": 0.6284131013971183,
"learning_rate": 7.828050174917527e-08,
"loss": 0.5867525935173035,
"step": 860,
"token_acc": 0.8053583956414843
},
{
"epoch": 2.4884393063583814,
"grad_norm": 0.6601691347825654,
"learning_rate": 7.742679087836462e-08,
"loss": 0.4591352045536041,
"step": 861,
"token_acc": 0.8464259952598495
},
{
"epoch": 2.491329479768786,
"grad_norm": 0.5223754803762156,
"learning_rate": 7.657736991090263e-08,
"loss": 0.5479453206062317,
"step": 862,
"token_acc": 0.8136173830420323
},
{
"epoch": 2.494219653179191,
"grad_norm": 0.6063178523383044,
"learning_rate": 7.573224746999107e-08,
"loss": 0.4984654486179352,
"step": 863,
"token_acc": 0.8310789771475875
},
{
"epoch": 2.4971098265895955,
"grad_norm": 0.5664401315392263,
"learning_rate": 7.4891432135193e-08,
"loss": 0.5375936031341553,
"step": 864,
"token_acc": 0.8193700891772278
},
{
"epoch": 2.5,
"grad_norm": 0.5684032151067252,
"learning_rate": 7.405493244234651e-08,
"loss": 0.5382214188575745,
"step": 865,
"token_acc": 0.8159053497942387
},
{
"epoch": 2.5028901734104045,
"grad_norm": 1.6304188232278813,
"learning_rate": 7.322275688347818e-08,
"loss": 0.5420823097229004,
"step": 866,
"token_acc": 0.8175298965740142
},
{
"epoch": 2.505780346820809,
"grad_norm": 0.5256843006054661,
"learning_rate": 7.239491390671631e-08,
"loss": 0.5603017807006836,
"step": 867,
"token_acc": 0.8130635711477354
},
{
"epoch": 2.508670520231214,
"grad_norm": 0.5290906377318529,
"learning_rate": 7.157141191620548e-08,
"loss": 0.4974015951156616,
"step": 868,
"token_acc": 0.8317996586674097
},
{
"epoch": 2.5115606936416186,
"grad_norm": 0.5009279956947961,
"learning_rate": 7.075225927202105e-08,
"loss": 0.5346574187278748,
"step": 869,
"token_acc": 0.8163790337713909
},
{
"epoch": 2.514450867052023,
"grad_norm": 0.4774847145184863,
"learning_rate": 6.993746429008496e-08,
"loss": 0.5793315768241882,
"step": 870,
"token_acc": 0.8044435794476767
},
{
"epoch": 2.5173410404624277,
"grad_norm": 0.579794607346244,
"learning_rate": 6.912703524208019e-08,
"loss": 0.4764576852321625,
"step": 871,
"token_acc": 0.8377503092002259
},
{
"epoch": 2.520231213872832,
"grad_norm": 0.5013881127258889,
"learning_rate": 6.832098035536759e-08,
"loss": 0.525843620300293,
"step": 872,
"token_acc": 0.8231466097001345
},
{
"epoch": 2.523121387283237,
"grad_norm": 0.48167613678527704,
"learning_rate": 6.751930781290238e-08,
"loss": 0.5380637049674988,
"step": 873,
"token_acc": 0.8183076636731655
},
{
"epoch": 2.5260115606936417,
"grad_norm": 0.4540447849829041,
"learning_rate": 6.672202575315044e-08,
"loss": 0.49698758125305176,
"step": 874,
"token_acc": 0.831075612916876
},
{
"epoch": 2.5289017341040463,
"grad_norm": 0.6661593346201325,
"learning_rate": 6.59291422700064e-08,
"loss": 0.4850313663482666,
"step": 875,
"token_acc": 0.8362135876193946
},
{
"epoch": 2.531791907514451,
"grad_norm": 0.505051966727968,
"learning_rate": 6.514066541271085e-08,
"loss": 0.499431312084198,
"step": 876,
"token_acc": 0.831420351210136
},
{
"epoch": 2.5346820809248554,
"grad_norm": 0.5882259006732896,
"learning_rate": 6.435660318576935e-08,
"loss": 0.5504227876663208,
"step": 877,
"token_acc": 0.8158776668803223
},
{
"epoch": 2.5375722543352603,
"grad_norm": 0.5391399587353708,
"learning_rate": 6.357696354887049e-08,
"loss": 0.5507422685623169,
"step": 878,
"token_acc": 0.8168785222461945
},
{
"epoch": 2.540462427745665,
"grad_norm": 0.5480460384925314,
"learning_rate": 6.28017544168053e-08,
"loss": 0.5473015308380127,
"step": 879,
"token_acc": 0.8178865534976365
},
{
"epoch": 2.5433526011560694,
"grad_norm": 0.5389986372049553,
"learning_rate": 6.20309836593873e-08,
"loss": 0.5189315676689148,
"step": 880,
"token_acc": 0.8252666894202909
},
{
"epoch": 2.546242774566474,
"grad_norm": 0.5707417078989917,
"learning_rate": 6.126465910137163e-08,
"loss": 0.5234180092811584,
"step": 881,
"token_acc": 0.8232250912282323
},
{
"epoch": 2.5491329479768785,
"grad_norm": 0.5632951051957191,
"learning_rate": 6.0502788522377e-08,
"loss": 0.5196454524993896,
"step": 882,
"token_acc": 0.8240517651811349
},
{
"epoch": 2.5520231213872835,
"grad_norm": 0.5312909361373286,
"learning_rate": 5.974537965680537e-08,
"loss": 0.5485826134681702,
"step": 883,
"token_acc": 0.8127245781077416
},
{
"epoch": 2.5549132947976876,
"grad_norm": 0.6429627848350591,
"learning_rate": 5.899244019376426e-08,
"loss": 0.5010867714881897,
"step": 884,
"token_acc": 0.8311800993506927
},
{
"epoch": 2.5578034682080926,
"grad_norm": 0.5223405882575716,
"learning_rate": 5.824397777698858e-08,
"loss": 0.5297751426696777,
"step": 885,
"token_acc": 0.8206137655553849
},
{
"epoch": 2.560693641618497,
"grad_norm": 0.8020502475631341,
"learning_rate": 5.7500000004762574e-08,
"loss": 0.5593537092208862,
"step": 886,
"token_acc": 0.811829619947517
},
{
"epoch": 2.5635838150289016,
"grad_norm": 0.6258112537179114,
"learning_rate": 5.676051442984325e-08,
"loss": 0.5434359908103943,
"step": 887,
"token_acc": 0.8160674580340842
},
{
"epoch": 2.5664739884393066,
"grad_norm": 0.5482233640675082,
"learning_rate": 5.602552855938325e-08,
"loss": 0.5392587780952454,
"step": 888,
"token_acc": 0.8183432292939603
},
{
"epoch": 2.5693641618497107,
"grad_norm": 0.5339167311609386,
"learning_rate": 5.529504985485528e-08,
"loss": 0.5843528509140015,
"step": 889,
"token_acc": 0.8041726059349488
},
{
"epoch": 2.5722543352601157,
"grad_norm": 0.5526129075488465,
"learning_rate": 5.456908573197544e-08,
"loss": 0.4785343408584595,
"step": 890,
"token_acc": 0.8354585097240348
},
{
"epoch": 2.5751445086705202,
"grad_norm": 0.5932930782479724,
"learning_rate": 5.384764356062865e-08,
"loss": 0.501940131187439,
"step": 891,
"token_acc": 0.8283741560885075
},
{
"epoch": 2.578034682080925,
"grad_norm": 0.5946977220929661,
"learning_rate": 5.313073066479379e-08,
"loss": 0.5379625558853149,
"step": 892,
"token_acc": 0.8177655126778356
},
{
"epoch": 2.5809248554913293,
"grad_norm": 0.5663018542099373,
"learning_rate": 5.2418354322468884e-08,
"loss": 0.4645715057849884,
"step": 893,
"token_acc": 0.8437703660317277
},
{
"epoch": 2.583815028901734,
"grad_norm": 0.5603090911019164,
"learning_rate": 5.1710521765597593e-08,
"loss": 0.5438505411148071,
"step": 894,
"token_acc": 0.8167114037179182
},
{
"epoch": 2.586705202312139,
"grad_norm": 0.5650529942357706,
"learning_rate": 5.100724017999575e-08,
"loss": 0.537551760673523,
"step": 895,
"token_acc": 0.8162509350365383
},
{
"epoch": 2.5895953757225434,
"grad_norm": 0.5946617661686765,
"learning_rate": 5.0308516705278525e-08,
"loss": 0.5363532304763794,
"step": 896,
"token_acc": 0.8188319733413082
},
{
"epoch": 2.592485549132948,
"grad_norm": 0.529447543384607,
"learning_rate": 4.961435843478751e-08,
"loss": 0.547370195388794,
"step": 897,
"token_acc": 0.8166483874998265
},
{
"epoch": 2.5953757225433525,
"grad_norm": 0.5564539974665098,
"learning_rate": 4.892477241551901e-08,
"loss": 0.5567014813423157,
"step": 898,
"token_acc": 0.8142607154390945
},
{
"epoch": 2.598265895953757,
"grad_norm": 0.6758226853294469,
"learning_rate": 4.8239765648052985e-08,
"loss": 0.5622668862342834,
"step": 899,
"token_acc": 0.8094786656801085
},
{
"epoch": 2.601156069364162,
"grad_norm": 0.6030746534353,
"learning_rate": 4.755934508648057e-08,
"loss": 0.48511946201324463,
"step": 900,
"token_acc": 0.8383746553751593
},
{
"epoch": 2.6040462427745665,
"grad_norm": 0.5291224134313559,
"learning_rate": 4.688351763833531e-08,
"loss": 0.5561063289642334,
"step": 901,
"token_acc": 0.811450131453075
},
{
"epoch": 2.606936416184971,
"grad_norm": 0.5231587422483082,
"learning_rate": 4.621229016452155e-08,
"loss": 0.585370659828186,
"step": 902,
"token_acc": 0.8056932036025608
},
{
"epoch": 2.6098265895953756,
"grad_norm": 1.1223139233293984,
"learning_rate": 4.554566947924537e-08,
"loss": 0.5447970628738403,
"step": 903,
"token_acc": 0.8164786148920761
},
{
"epoch": 2.61271676300578,
"grad_norm": 0.5225735759201205,
"learning_rate": 4.4883662349945784e-08,
"loss": 0.5505392551422119,
"step": 904,
"token_acc": 0.8164482180639134
},
{
"epoch": 2.615606936416185,
"grad_norm": 0.54473619880049,
"learning_rate": 4.422627549722519e-08,
"loss": 0.5359902381896973,
"step": 905,
"token_acc": 0.820455104729094
},
{
"epoch": 2.6184971098265897,
"grad_norm": 0.7561505246031067,
"learning_rate": 4.357351559478201e-08,
"loss": 0.47267240285873413,
"step": 906,
"token_acc": 0.8387789854590445
},
{
"epoch": 2.621387283236994,
"grad_norm": 0.5548449336113677,
"learning_rate": 4.2925389269341916e-08,
"loss": 0.5412442684173584,
"step": 907,
"token_acc": 0.8155705621117785
},
{
"epoch": 2.6242774566473988,
"grad_norm": 0.7283156817419644,
"learning_rate": 4.228190310059182e-08,
"loss": 0.5299142599105835,
"step": 908,
"token_acc": 0.8230541763009774
},
{
"epoch": 2.6271676300578033,
"grad_norm": 0.5365454152037888,
"learning_rate": 4.164306362111208e-08,
"loss": 0.5737514495849609,
"step": 909,
"token_acc": 0.8103234930175004
},
{
"epoch": 2.6300578034682083,
"grad_norm": 0.5438553812892487,
"learning_rate": 4.100887731631053e-08,
"loss": 0.5420162677764893,
"step": 910,
"token_acc": 0.8180698387235383
},
{
"epoch": 2.632947976878613,
"grad_norm": 0.64070798422041,
"learning_rate": 4.0379350624356766e-08,
"loss": 0.5189142823219299,
"step": 911,
"token_acc": 0.8237202834249387
},
{
"epoch": 2.6358381502890174,
"grad_norm": 0.47802319033882207,
"learning_rate": 3.975448993611652e-08,
"loss": 0.5308249592781067,
"step": 912,
"token_acc": 0.8203262576745515
},
{
"epoch": 2.638728323699422,
"grad_norm": 0.5724668109330596,
"learning_rate": 3.913430159508696e-08,
"loss": 0.5157672166824341,
"step": 913,
"token_acc": 0.8241608973797213
},
{
"epoch": 2.6416184971098264,
"grad_norm": 0.5470703054848514,
"learning_rate": 3.8518791897332204e-08,
"loss": 0.5976561307907104,
"step": 914,
"token_acc": 0.8007923950822223
},
{
"epoch": 2.6445086705202314,
"grad_norm": 0.5294401571240512,
"learning_rate": 3.790796709141975e-08,
"loss": 0.5527437925338745,
"step": 915,
"token_acc": 0.8132948131146666
},
{
"epoch": 2.647398843930636,
"grad_norm": 0.6321676647074376,
"learning_rate": 3.7301833378356073e-08,
"loss": 0.4902818202972412,
"step": 916,
"token_acc": 0.8343280912033046
},
{
"epoch": 2.6502890173410405,
"grad_norm": 0.6734799143444675,
"learning_rate": 3.67003969115251e-08,
"loss": 0.5476257801055908,
"step": 917,
"token_acc": 0.8164087189044648
},
{
"epoch": 2.653179190751445,
"grad_norm": 0.4933080483096889,
"learning_rate": 3.610366379662455e-08,
"loss": 0.5034703612327576,
"step": 918,
"token_acc": 0.8296526697770866
},
{
"epoch": 2.6560693641618496,
"grad_norm": 0.5701973114157253,
"learning_rate": 3.551164009160429e-08,
"loss": 0.5260199904441833,
"step": 919,
"token_acc": 0.8228647844657014
},
{
"epoch": 2.6589595375722546,
"grad_norm": 0.4606917700933646,
"learning_rate": 3.4924331806605314e-08,
"loss": 0.5847440361976624,
"step": 920,
"token_acc": 0.8036149091590186
},
{
"epoch": 2.661849710982659,
"grad_norm": 0.5312291603560868,
"learning_rate": 3.4341744903897963e-08,
"loss": 0.5280716419219971,
"step": 921,
"token_acc": 0.8217670827512655
},
{
"epoch": 2.6647398843930636,
"grad_norm": 0.5137738686874723,
"learning_rate": 3.376388529782215e-08,
"loss": 0.5434746146202087,
"step": 922,
"token_acc": 0.8166855043797683
},
{
"epoch": 2.667630057803468,
"grad_norm": 0.5112438107405131,
"learning_rate": 3.319075885472644e-08,
"loss": 0.4704023599624634,
"step": 923,
"token_acc": 0.8407168549429551
},
{
"epoch": 2.6705202312138727,
"grad_norm": 0.5633980375468464,
"learning_rate": 3.262237139290952e-08,
"loss": 0.5437241792678833,
"step": 924,
"token_acc": 0.8174555734488506
},
{
"epoch": 2.6734104046242777,
"grad_norm": 0.4789519578675391,
"learning_rate": 3.205872868256021e-08,
"loss": 0.5591274499893188,
"step": 925,
"token_acc": 0.8126648310155333
},
{
"epoch": 2.6763005780346822,
"grad_norm": 0.545383577218125,
"learning_rate": 3.149983644569948e-08,
"loss": 0.4846089482307434,
"step": 926,
"token_acc": 0.8357118170559603
},
{
"epoch": 2.679190751445087,
"grad_norm": 0.5624813066511716,
"learning_rate": 3.094570035612226e-08,
"loss": 0.5257154703140259,
"step": 927,
"token_acc": 0.8209082215813688
},
{
"epoch": 2.6820809248554913,
"grad_norm": 0.5921212603993137,
"learning_rate": 3.0396326039339507e-08,
"loss": 0.5992392897605896,
"step": 928,
"token_acc": 0.7986864607734648
},
{
"epoch": 2.684971098265896,
"grad_norm": 0.5498631051018497,
"learning_rate": 2.9851719072521487e-08,
"loss": 0.5509431958198547,
"step": 929,
"token_acc": 0.8177149696899494
},
{
"epoch": 2.687861271676301,
"grad_norm": 0.5215571767600914,
"learning_rate": 2.9311884984440873e-08,
"loss": 0.561446487903595,
"step": 930,
"token_acc": 0.8129055922352012
},
{
"epoch": 2.690751445086705,
"grad_norm": 0.559786563643402,
"learning_rate": 2.8776829255416967e-08,
"loss": 0.5166699290275574,
"step": 931,
"token_acc": 0.8237840118657938
},
{
"epoch": 2.69364161849711,
"grad_norm": 0.5753952050911679,
"learning_rate": 2.8246557317259723e-08,
"loss": 0.5357648134231567,
"step": 932,
"token_acc": 0.8212208495005039
},
{
"epoch": 2.6965317919075145,
"grad_norm": 0.5636571499534591,
"learning_rate": 2.7721074553214596e-08,
"loss": 0.5390565395355225,
"step": 933,
"token_acc": 0.8159201695282208
},
{
"epoch": 2.699421965317919,
"grad_norm": 0.5407560890645442,
"learning_rate": 2.7200386297908386e-08,
"loss": 0.541710615158081,
"step": 934,
"token_acc": 0.8174959891247107
},
{
"epoch": 2.7023121387283235,
"grad_norm": 0.48421827585155863,
"learning_rate": 2.6684497837294208e-08,
"loss": 0.5409998297691345,
"step": 935,
"token_acc": 0.8210280803345742
},
{
"epoch": 2.705202312138728,
"grad_norm": 0.49710877088501176,
"learning_rate": 2.6173414408598826e-08,
"loss": 0.5135529637336731,
"step": 936,
"token_acc": 0.8251490888501849
},
{
"epoch": 2.708092485549133,
"grad_norm": 0.6329172467067579,
"learning_rate": 2.5667141200268694e-08,
"loss": 0.5547735691070557,
"step": 937,
"token_acc": 0.8145400135743814
},
{
"epoch": 2.7109826589595376,
"grad_norm": 0.5576557557006313,
"learning_rate": 2.5165683351917765e-08,
"loss": 0.5579146146774292,
"step": 938,
"token_acc": 0.8112171853454817
},
{
"epoch": 2.713872832369942,
"grad_norm": 0.5905103597710084,
"learning_rate": 2.4669045954275046e-08,
"loss": 0.5442934632301331,
"step": 939,
"token_acc": 0.818311620283537
},
{
"epoch": 2.7167630057803467,
"grad_norm": 0.6610701567101593,
"learning_rate": 2.4177234049133023e-08,
"loss": 0.49151283502578735,
"step": 940,
"token_acc": 0.8325153415650084
},
{
"epoch": 2.7196531791907512,
"grad_norm": 0.6214821823759014,
"learning_rate": 2.369025262929658e-08,
"loss": 0.5725831389427185,
"step": 941,
"token_acc": 0.8070232229912145
},
{
"epoch": 2.722543352601156,
"grad_norm": 0.5547499629666095,
"learning_rate": 2.3208106638531842e-08,
"loss": 0.5330009460449219,
"step": 942,
"token_acc": 0.8195172027623966
},
{
"epoch": 2.7254335260115607,
"grad_norm": 0.5521438894414953,
"learning_rate": 2.2730800971516862e-08,
"loss": 0.5747419595718384,
"step": 943,
"token_acc": 0.8086665948043549
},
{
"epoch": 2.7283236994219653,
"grad_norm": 0.6317779099057246,
"learning_rate": 2.225834047379099e-08,
"loss": 0.49804458022117615,
"step": 944,
"token_acc": 0.8307906934881418
},
{
"epoch": 2.73121387283237,
"grad_norm": 0.5560572315857666,
"learning_rate": 2.1790729941706276e-08,
"loss": 0.5384119153022766,
"step": 945,
"token_acc": 0.8186016301942814
},
{
"epoch": 2.7341040462427744,
"grad_norm": 0.5706315776877087,
"learning_rate": 2.132797412237869e-08,
"loss": 0.5331531167030334,
"step": 946,
"token_acc": 0.8183284045442989
},
{
"epoch": 2.7369942196531793,
"grad_norm": 0.5767818083804982,
"learning_rate": 2.087007771363969e-08,
"loss": 0.5555546879768372,
"step": 947,
"token_acc": 0.8130259084965389
},
{
"epoch": 2.739884393063584,
"grad_norm": 0.5074851398256462,
"learning_rate": 2.041704536398875e-08,
"loss": 0.5641285181045532,
"step": 948,
"token_acc": 0.8102424125823674
},
{
"epoch": 2.7427745664739884,
"grad_norm": 0.5656737111306388,
"learning_rate": 1.9968881672545957e-08,
"loss": 0.5804109573364258,
"step": 949,
"token_acc": 0.8069046557228511
},
{
"epoch": 2.745664739884393,
"grad_norm": 0.5396023274518039,
"learning_rate": 1.9525591189005874e-08,
"loss": 0.5026800632476807,
"step": 950,
"token_acc": 0.8291645642615152
},
{
"epoch": 2.7485549132947975,
"grad_norm": 0.5545085068594241,
"learning_rate": 1.9087178413590476e-08,
"loss": 0.5121109485626221,
"step": 951,
"token_acc": 0.829365647193499
},
{
"epoch": 2.7514450867052025,
"grad_norm": 0.5744534847489216,
"learning_rate": 1.8653647797004236e-08,
"loss": 0.5073999166488647,
"step": 952,
"token_acc": 0.8286528286528286
},
{
"epoch": 2.754335260115607,
"grad_norm": 0.5473570344774414,
"learning_rate": 1.8225003740388545e-08,
"loss": 0.5411463975906372,
"step": 953,
"token_acc": 0.8197644649257553
},
{
"epoch": 2.7572254335260116,
"grad_norm": 0.5960870996950273,
"learning_rate": 1.7801250595277095e-08,
"loss": 0.45802488923072815,
"step": 954,
"token_acc": 0.8439128432584406
},
{
"epoch": 2.760115606936416,
"grad_norm": 0.5872410848204962,
"learning_rate": 1.738239266355185e-08,
"loss": 0.5364171862602234,
"step": 955,
"token_acc": 0.8192522793328644
},
{
"epoch": 2.7630057803468207,
"grad_norm": 0.5452386927866908,
"learning_rate": 1.6968434197399072e-08,
"loss": 0.5837544202804565,
"step": 956,
"token_acc": 0.8051349532888352
},
{
"epoch": 2.7658959537572256,
"grad_norm": 0.5752700596867665,
"learning_rate": 1.655937939926655e-08,
"loss": 0.5129964351654053,
"step": 957,
"token_acc": 0.8282252791972994
},
{
"epoch": 2.76878612716763,
"grad_norm": 0.5428098765109344,
"learning_rate": 1.6155232421820653e-08,
"loss": 0.5746065378189087,
"step": 958,
"token_acc": 0.8089228223154
},
{
"epoch": 2.7716763005780347,
"grad_norm": 0.5949829280630812,
"learning_rate": 1.5755997367904173e-08,
"loss": 0.4916711747646332,
"step": 959,
"token_acc": 0.8342608068069589
},
{
"epoch": 2.7745664739884393,
"grad_norm": 0.5674429218313363,
"learning_rate": 1.536167829049495e-08,
"loss": 0.5395721197128296,
"step": 960,
"token_acc": 0.8203693073096058
},
{
"epoch": 2.777456647398844,
"grad_norm": 0.561452376268135,
"learning_rate": 1.497227919266414e-08,
"loss": 0.51889967918396,
"step": 961,
"token_acc": 0.8233378239163167
},
{
"epoch": 2.7803468208092488,
"grad_norm": 0.6257227381883494,
"learning_rate": 1.4587804027536454e-08,
"loss": 0.5111842155456543,
"step": 962,
"token_acc": 0.8274028303059359
},
{
"epoch": 2.7832369942196533,
"grad_norm": 0.5900526631508034,
"learning_rate": 1.420825669824921e-08,
"loss": 0.5204794406890869,
"step": 963,
"token_acc": 0.8234049795759579
},
{
"epoch": 2.786127167630058,
"grad_norm": 0.509902068102799,
"learning_rate": 1.3833641057913015e-08,
"loss": 0.47923728823661804,
"step": 964,
"token_acc": 0.8353080111030787
},
{
"epoch": 2.7890173410404624,
"grad_norm": 0.5460825106119277,
"learning_rate": 1.346396090957297e-08,
"loss": 0.520375669002533,
"step": 965,
"token_acc": 0.8276919599125914
},
{
"epoch": 2.791907514450867,
"grad_norm": 0.5432685057122655,
"learning_rate": 1.309922000616942e-08,
"loss": 0.5795409679412842,
"step": 966,
"token_acc": 0.8071895906398279
},
{
"epoch": 2.794797687861272,
"grad_norm": 0.5657536988747344,
"learning_rate": 1.2739422050500436e-08,
"loss": 0.5345174074172974,
"step": 967,
"token_acc": 0.8179120793316155
},
{
"epoch": 2.7976878612716765,
"grad_norm": 0.521811401090051,
"learning_rate": 1.2384570695183782e-08,
"loss": 0.5313125252723694,
"step": 968,
"token_acc": 0.8208080793990667
},
{
"epoch": 2.800578034682081,
"grad_norm": 0.5951506599748814,
"learning_rate": 1.2034669542620223e-08,
"loss": 0.5154579877853394,
"step": 969,
"token_acc": 0.8274639716414208
},
{
"epoch": 2.8034682080924855,
"grad_norm": 0.7493969316675455,
"learning_rate": 1.168972214495667e-08,
"loss": 0.4610113203525543,
"step": 970,
"token_acc": 0.8410565847986298
},
{
"epoch": 2.80635838150289,
"grad_norm": 0.6158144745722535,
"learning_rate": 1.1349732004050205e-08,
"loss": 0.5308967232704163,
"step": 971,
"token_acc": 0.823366838754401
},
{
"epoch": 2.809248554913295,
"grad_norm": 0.49701991004281837,
"learning_rate": 1.101470257143261e-08,
"loss": 0.5433156490325928,
"step": 972,
"token_acc": 0.8172732427363528
},
{
"epoch": 2.812138728323699,
"grad_norm": 0.614964929129747,
"learning_rate": 1.0684637248275175e-08,
"loss": 0.4856722056865692,
"step": 973,
"token_acc": 0.8371653570989119
},
{
"epoch": 2.815028901734104,
"grad_norm": 0.5531928817079772,
"learning_rate": 1.0359539385354387e-08,
"loss": 0.5472983121871948,
"step": 974,
"token_acc": 0.8166184194819147
},
{
"epoch": 2.8179190751445087,
"grad_norm": 0.6036213061429313,
"learning_rate": 1.0039412283017523e-08,
"loss": 0.5529719591140747,
"step": 975,
"token_acc": 0.8155163061650604
},
{
"epoch": 2.820809248554913,
"grad_norm": 0.5564254532918392,
"learning_rate": 9.724259191149774e-09,
"loss": 0.4628450572490692,
"step": 976,
"token_acc": 0.8427982220798462
},
{
"epoch": 2.8236994219653178,
"grad_norm": 0.5588830748507647,
"learning_rate": 9.414083309140453e-09,
"loss": 0.5567787289619446,
"step": 977,
"token_acc": 0.8121751346288926
},
{
"epoch": 2.8265895953757223,
"grad_norm": 0.5529058564154966,
"learning_rate": 9.108887785851338e-09,
"loss": 0.5580377578735352,
"step": 978,
"token_acc": 0.8109314422108472
},
{
"epoch": 2.8294797687861273,
"grad_norm": 0.61646098239251,
"learning_rate": 8.808675719584158e-09,
"loss": 0.5375653505325317,
"step": 979,
"token_acc": 0.8192844783892899
},
{
"epoch": 2.832369942196532,
"grad_norm": 0.5248181521879705,
"learning_rate": 8.513450158049106e-09,
"loss": 0.5359894037246704,
"step": 980,
"token_acc": 0.8180794693882546
},
{
"epoch": 2.8352601156069364,
"grad_norm": 0.530766621077344,
"learning_rate": 8.22321409833443e-09,
"loss": 0.5032058358192444,
"step": 981,
"token_acc": 0.8299942928720195
},
{
"epoch": 2.838150289017341,
"grad_norm": 0.5767728092897907,
"learning_rate": 7.93797048687539e-09,
"loss": 0.555617094039917,
"step": 982,
"token_acc": 0.8127699150828953
},
{
"epoch": 2.8410404624277454,
"grad_norm": 0.5275196163844481,
"learning_rate": 7.657722219424789e-09,
"loss": 0.5177302956581116,
"step": 983,
"token_acc": 0.8254756164272545
},
{
"epoch": 2.8439306358381504,
"grad_norm": 0.7188190918164308,
"learning_rate": 7.382472141023221e-09,
"loss": 0.5488888025283813,
"step": 984,
"token_acc": 0.8139118457300275
},
{
"epoch": 2.846820809248555,
"grad_norm": 0.5053524666497287,
"learning_rate": 7.112223045970589e-09,
"loss": 0.5309122800827026,
"step": 985,
"token_acc": 0.818977587114551
},
{
"epoch": 2.8497109826589595,
"grad_norm": 0.49254982998325725,
"learning_rate": 6.8469776777973494e-09,
"loss": 0.48389381170272827,
"step": 986,
"token_acc": 0.839111193678302
},
{
"epoch": 2.852601156069364,
"grad_norm": 0.5088843284530131,
"learning_rate": 6.5867387292369295e-09,
"loss": 0.5327301025390625,
"step": 987,
"token_acc": 0.8190361305134541
},
{
"epoch": 2.8554913294797686,
"grad_norm": 0.5579589460192081,
"learning_rate": 6.331508842198296e-09,
"loss": 0.46285098791122437,
"step": 988,
"token_acc": 0.8444943903023158
},
{
"epoch": 2.8583815028901736,
"grad_norm": 0.5480219063407678,
"learning_rate": 6.081290607739042e-09,
"loss": 0.4747048616409302,
"step": 989,
"token_acc": 0.8427808981834031
},
{
"epoch": 2.861271676300578,
"grad_norm": 0.7741942154519839,
"learning_rate": 5.836086566039289e-09,
"loss": 0.5887913703918457,
"step": 990,
"token_acc": 0.8049742371893245
},
{
"epoch": 2.8641618497109826,
"grad_norm": 0.5193852803751504,
"learning_rate": 5.595899206375654e-09,
"loss": 0.5110014081001282,
"step": 991,
"token_acc": 0.8288312763590261
},
{
"epoch": 2.867052023121387,
"grad_norm": 0.5341612707698237,
"learning_rate": 5.360730967096272e-09,
"loss": 0.5477676391601562,
"step": 992,
"token_acc": 0.8129789165141573
},
{
"epoch": 2.8699421965317917,
"grad_norm": 0.7306055692439172,
"learning_rate": 5.130584235595703e-09,
"loss": 0.5541284680366516,
"step": 993,
"token_acc": 0.8145775823594559
},
{
"epoch": 2.8728323699421967,
"grad_norm": 0.5713799415951762,
"learning_rate": 4.9054613482910065e-09,
"loss": 0.44801950454711914,
"step": 994,
"token_acc": 0.845931691583633
},
{
"epoch": 2.8757225433526012,
"grad_norm": 0.5839589911780936,
"learning_rate": 4.685364590597929e-09,
"loss": 0.5638971924781799,
"step": 995,
"token_acc": 0.8107071579171281
},
{
"epoch": 2.878612716763006,
"grad_norm": 0.5287376481818248,
"learning_rate": 4.470296196907364e-09,
"loss": 0.5595090389251709,
"step": 996,
"token_acc": 0.8104899471905078
},
{
"epoch": 2.8815028901734103,
"grad_norm": 0.5379724615788479,
"learning_rate": 4.260258350563317e-09,
"loss": 0.5029683709144592,
"step": 997,
"token_acc": 0.8288261472452321
},
{
"epoch": 2.884393063583815,
"grad_norm": 0.6018325527774611,
"learning_rate": 4.055253183840257e-09,
"loss": 0.5635591149330139,
"step": 998,
"token_acc": 0.8117199938369883
},
{
"epoch": 2.88728323699422,
"grad_norm": 0.5473646076466034,
"learning_rate": 3.855282777921465e-09,
"loss": 0.44404757022857666,
"step": 999,
"token_acc": 0.8481432594156987
},
{
"epoch": 2.8901734104046244,
"grad_norm": 0.607676333795665,
"learning_rate": 3.660349162878329e-09,
"loss": 0.5595177412033081,
"step": 1000,
"token_acc": 0.8098022742758105
},
{
"epoch": 2.8901734104046244,
"eval_loss": 0.5740217566490173,
"eval_runtime": 69.5297,
"eval_samples_per_second": 1.582,
"eval_steps_per_second": 0.201,
"eval_token_acc": 0.808306147135369,
"step": 1000
},
{
"epoch": 2.893063583815029,
"grad_norm": 0.6028179153533768,
"learning_rate": 3.4704543176491407e-09,
"loss": 0.5201370716094971,
"step": 1001,
"token_acc": 0.8248979009505466
},
{
"epoch": 2.8959537572254335,
"grad_norm": 0.5618469428482809,
"learning_rate": 3.285600170019609e-09,
"loss": 0.4737909138202667,
"step": 1002,
"token_acc": 0.8380801687763713
},
{
"epoch": 2.898843930635838,
"grad_norm": 0.520670079505936,
"learning_rate": 3.10578859660271e-09,
"loss": 0.4949793815612793,
"step": 1003,
"token_acc": 0.8310451985643839
},
{
"epoch": 2.901734104046243,
"grad_norm": 0.5898385451823664,
"learning_rate": 2.9310214228202014e-09,
"loss": 0.5583693981170654,
"step": 1004,
"token_acc": 0.8109677906011918
},
{
"epoch": 2.9046242774566475,
"grad_norm": 0.5434063241260475,
"learning_rate": 2.7613004228835836e-09,
"loss": 0.5403155088424683,
"step": 1005,
"token_acc": 0.8173558831911802
},
{
"epoch": 2.907514450867052,
"grad_norm": 0.5472051803786162,
"learning_rate": 2.59662731977639e-09,
"loss": 0.5251212120056152,
"step": 1006,
"token_acc": 0.8263490698267074
},
{
"epoch": 2.9104046242774566,
"grad_norm": 0.49207250611822545,
"learning_rate": 2.437003785236702e-09,
"loss": 0.5539924502372742,
"step": 1007,
"token_acc": 0.8112695897164994
},
{
"epoch": 2.913294797687861,
"grad_norm": 0.5002736177395538,
"learning_rate": 2.2824314397399404e-09,
"loss": 0.5284777283668518,
"step": 1008,
"token_acc": 0.8207929017091751
},
{
"epoch": 2.916184971098266,
"grad_norm": 0.5322616545740584,
"learning_rate": 2.132911852482766e-09,
"loss": 0.5585949420928955,
"step": 1009,
"token_acc": 0.8104817895999946
},
{
"epoch": 2.9190751445086707,
"grad_norm": 0.5531944879626155,
"learning_rate": 1.9884465413667063e-09,
"loss": 0.5428365468978882,
"step": 1010,
"token_acc": 0.815299992762539
},
{
"epoch": 2.921965317919075,
"grad_norm": 0.5219295200504247,
"learning_rate": 1.8490369729832755e-09,
"loss": 0.5256614685058594,
"step": 1011,
"token_acc": 0.8222089510292981
},
{
"epoch": 2.9248554913294798,
"grad_norm": 0.5231759747194448,
"learning_rate": 1.714684562598545e-09,
"loss": 0.5462931990623474,
"step": 1012,
"token_acc": 0.8166555934189188
},
{
"epoch": 2.9277456647398843,
"grad_norm": 0.511178905264401,
"learning_rate": 1.5853906741392086e-09,
"loss": 0.48754703998565674,
"step": 1013,
"token_acc": 0.8340968562927913
},
{
"epoch": 2.9306358381502893,
"grad_norm": 0.49209363879670576,
"learning_rate": 1.4611566201785386e-09,
"loss": 0.6072345972061157,
"step": 1014,
"token_acc": 0.796086135633005
},
{
"epoch": 2.9335260115606934,
"grad_norm": 0.5468806874394325,
"learning_rate": 1.3419836619229519e-09,
"loss": 0.5350404381752014,
"step": 1015,
"token_acc": 0.8205611421851678
},
{
"epoch": 2.9364161849710984,
"grad_norm": 0.5545661554638134,
"learning_rate": 1.227873009199465e-09,
"loss": 0.48873502016067505,
"step": 1016,
"token_acc": 0.8335308101581073
},
{
"epoch": 2.939306358381503,
"grad_norm": 0.6117033520146128,
"learning_rate": 1.1188258204433144e-09,
"loss": 0.5223637819290161,
"step": 1017,
"token_acc": 0.8220580971784899
},
{
"epoch": 2.9421965317919074,
"grad_norm": 0.5990530756110558,
"learning_rate": 1.0148432026860775e-09,
"loss": 0.5375405550003052,
"step": 1018,
"token_acc": 0.8204211966851669
},
{
"epoch": 2.9450867052023124,
"grad_norm": 0.5179575810720268,
"learning_rate": 9.159262115445709e-10,
"loss": 0.5529065132141113,
"step": 1019,
"token_acc": 0.8146867269147271
},
{
"epoch": 2.9479768786127165,
"grad_norm": 0.4852204771957678,
"learning_rate": 8.220758512100246e-10,
"loss": 0.5473994016647339,
"step": 1020,
"token_acc": 0.8154385812017952
},
{
"epoch": 2.9508670520231215,
"grad_norm": 0.5869353604242789,
"learning_rate": 7.332930744380905e-10,
"loss": 0.5176626443862915,
"step": 1021,
"token_acc": 0.8273430939731791
},
{
"epoch": 2.953757225433526,
"grad_norm": 0.5602528809896415,
"learning_rate": 6.49578782538851e-10,
"loss": 0.5115993618965149,
"step": 1022,
"token_acc": 0.8288524482039359
},
{
"epoch": 2.9566473988439306,
"grad_norm": 0.5342085317349031,
"learning_rate": 5.709338253679363e-10,
"loss": 0.5524012446403503,
"step": 1023,
"token_acc": 0.8131655170976683
},
{
"epoch": 2.959537572254335,
"grad_norm": 0.5776521748726285,
"learning_rate": 4.973590013178652e-10,
"loss": 0.5437720417976379,
"step": 1024,
"token_acc": 0.8181899648876977
},
{
"epoch": 2.9624277456647397,
"grad_norm": 0.5915883065627155,
"learning_rate": 4.288550573098293e-10,
"loss": 0.5497083067893982,
"step": 1025,
"token_acc": 0.8166504174699635
},
{
"epoch": 2.9653179190751446,
"grad_norm": 0.519862153616305,
"learning_rate": 3.6542268878608785e-10,
"loss": 0.5397800207138062,
"step": 1026,
"token_acc": 0.8185784280824216
},
{
"epoch": 2.968208092485549,
"grad_norm": 0.6328021139986955,
"learning_rate": 3.070625397031401e-10,
"loss": 0.5588440299034119,
"step": 1027,
"token_acc": 0.8125476802049286
},
{
"epoch": 2.9710982658959537,
"grad_norm": 0.5575020860016229,
"learning_rate": 2.537752025249529e-10,
"loss": 0.5562065839767456,
"step": 1028,
"token_acc": 0.8104220354019687
},
{
"epoch": 2.9739884393063583,
"grad_norm": 0.5378061802083338,
"learning_rate": 2.0556121821696527e-10,
"loss": 0.5177541971206665,
"step": 1029,
"token_acc": 0.8242314812400594
},
{
"epoch": 2.976878612716763,
"grad_norm": 0.5832757184904683,
"learning_rate": 1.6242107624070412e-10,
"loss": 0.49845069646835327,
"step": 1030,
"token_acc": 0.8330388762567243
},
{
"epoch": 2.979768786127168,
"grad_norm": 0.7982615431706986,
"learning_rate": 1.2435521454884358e-10,
"loss": 0.5247231125831604,
"step": 1031,
"token_acc": 0.823871938586352
},
{
"epoch": 2.9826589595375723,
"grad_norm": 0.5127749961245016,
"learning_rate": 9.136401958059759e-11,
"loss": 0.5525383353233337,
"step": 1032,
"token_acc": 0.8136602187346615
},
{
"epoch": 2.985549132947977,
"grad_norm": 0.542665341113767,
"learning_rate": 6.34478262578897e-11,
"loss": 0.5264041423797607,
"step": 1033,
"token_acc": 0.8259248289322793
},
{
"epoch": 2.9884393063583814,
"grad_norm": 0.5981387552317852,
"learning_rate": 4.0606917981966804e-11,
"loss": 0.5639816522598267,
"step": 1034,
"token_acc": 0.811261064452967
},
{
"epoch": 2.991329479768786,
"grad_norm": 0.5182263398780822,
"learning_rate": 2.2841526630512642e-11,
"loss": 0.5699348449707031,
"step": 1035,
"token_acc": 0.8084916570295722
},
{
"epoch": 2.994219653179191,
"grad_norm": 0.48173987479445357,
"learning_rate": 1.0151832555205242e-11,
"loss": 0.5670179128646851,
"step": 1036,
"token_acc": 0.8119991095280499
},
{
"epoch": 2.9971098265895955,
"grad_norm": 0.5532608077856682,
"learning_rate": 2.5379645800516215e-12,
"loss": 0.5611600875854492,
"step": 1037,
"token_acc": 0.8147770004529734
},
{
"epoch": 3.0,
"grad_norm": 0.5148238785537761,
"learning_rate": 0.0,
"loss": 0.5508678555488586,
"step": 1038,
"token_acc": 0.8153577131547579
},
{
"epoch": 3.0,
"eval_loss": 0.5740059018135071,
"eval_runtime": 69.9798,
"eval_samples_per_second": 1.572,
"eval_steps_per_second": 0.2,
"eval_token_acc": 0.808306147135369,
"step": 1038
}
],
"logging_steps": 1,
"max_steps": 1038,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1140072026079232.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}