SchGen / trainer_state.json
ruichunma's picture
Upload folder using huggingface_hub
d5739f0 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.0,
"eval_steps": 500,
"global_step": 990,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0020222446916076846,
"grad_norm": 3.467946767807007,
"learning_rate": 0.0,
"loss": 3.4468,
"mean_token_accuracy": 0.4403058011084795,
"num_tokens": 69017.0,
"step": 1
},
{
"epoch": 0.004044489383215369,
"grad_norm": 3.8678574562072754,
"learning_rate": 8.88888888888889e-06,
"loss": 3.0508,
"mean_token_accuracy": 0.470831586048007,
"num_tokens": 150583.0,
"step": 2
},
{
"epoch": 0.006066734074823054,
"grad_norm": 3.7103006839752197,
"learning_rate": 1.777777777777778e-05,
"loss": 3.285,
"mean_token_accuracy": 0.45825996436178684,
"num_tokens": 221144.0,
"step": 3
},
{
"epoch": 0.008088978766430738,
"grad_norm": 3.591843605041504,
"learning_rate": 2.6666666666666667e-05,
"loss": 3.4107,
"mean_token_accuracy": 0.44140205159783363,
"num_tokens": 287737.0,
"step": 4
},
{
"epoch": 0.010111223458038422,
"grad_norm": 3.940007209777832,
"learning_rate": 3.555555555555556e-05,
"loss": 3.0975,
"mean_token_accuracy": 0.4831150006502867,
"num_tokens": 362591.0,
"step": 5
},
{
"epoch": 0.012133468149646108,
"grad_norm": 3.8853604793548584,
"learning_rate": 4.4444444444444447e-05,
"loss": 3.1354,
"mean_token_accuracy": 0.48446146585047245,
"num_tokens": 432327.0,
"step": 6
},
{
"epoch": 0.014155712841253791,
"grad_norm": 3.9134953022003174,
"learning_rate": 5.333333333333333e-05,
"loss": 3.215,
"mean_token_accuracy": 0.47610872238874435,
"num_tokens": 506671.0,
"step": 7
},
{
"epoch": 0.016177957532861477,
"grad_norm": 4.14130973815918,
"learning_rate": 6.222222222222222e-05,
"loss": 3.0424,
"mean_token_accuracy": 0.47477637231349945,
"num_tokens": 577418.0,
"step": 8
},
{
"epoch": 0.01820020222446916,
"grad_norm": 4.15872859954834,
"learning_rate": 7.111111111111112e-05,
"loss": 3.0563,
"mean_token_accuracy": 0.49431027099490166,
"num_tokens": 640014.0,
"step": 9
},
{
"epoch": 0.020222446916076844,
"grad_norm": 3.9895355701446533,
"learning_rate": 8e-05,
"loss": 2.6808,
"mean_token_accuracy": 0.5322843790054321,
"num_tokens": 704272.0,
"step": 10
},
{
"epoch": 0.022244691607684528,
"grad_norm": 4.202198028564453,
"learning_rate": 8.888888888888889e-05,
"loss": 2.6339,
"mean_token_accuracy": 0.5354921519756317,
"num_tokens": 771361.0,
"step": 11
},
{
"epoch": 0.024266936299292215,
"grad_norm": 4.070754051208496,
"learning_rate": 9.777777777777778e-05,
"loss": 2.2029,
"mean_token_accuracy": 0.5880691334605217,
"num_tokens": 846229.0,
"step": 12
},
{
"epoch": 0.0262891809908999,
"grad_norm": 3.6637940406799316,
"learning_rate": 0.00010666666666666667,
"loss": 1.7795,
"mean_token_accuracy": 0.6244243904948235,
"num_tokens": 927862.0,
"step": 13
},
{
"epoch": 0.028311425682507583,
"grad_norm": 3.9786410331726074,
"learning_rate": 0.00011555555555555555,
"loss": 1.9043,
"mean_token_accuracy": 0.6317372992634773,
"num_tokens": 988396.0,
"step": 14
},
{
"epoch": 0.030333670374115267,
"grad_norm": 3.229816198348999,
"learning_rate": 0.00012444444444444444,
"loss": 1.63,
"mean_token_accuracy": 0.65444141253829,
"num_tokens": 1047670.0,
"step": 15
},
{
"epoch": 0.032355915065722954,
"grad_norm": 2.8272366523742676,
"learning_rate": 0.00013333333333333334,
"loss": 1.4858,
"mean_token_accuracy": 0.6778117530047894,
"num_tokens": 1113088.0,
"step": 16
},
{
"epoch": 0.034378159757330634,
"grad_norm": 2.599519968032837,
"learning_rate": 0.00014222222222222224,
"loss": 1.323,
"mean_token_accuracy": 0.688772302120924,
"num_tokens": 1178886.0,
"step": 17
},
{
"epoch": 0.03640040444893832,
"grad_norm": 2.801631212234497,
"learning_rate": 0.0001511111111111111,
"loss": 1.2173,
"mean_token_accuracy": 0.7124413475394249,
"num_tokens": 1248356.0,
"step": 18
},
{
"epoch": 0.03842264914054601,
"grad_norm": 3.745363473892212,
"learning_rate": 0.00016,
"loss": 1.0959,
"mean_token_accuracy": 0.7285233177244663,
"num_tokens": 1324299.0,
"step": 19
},
{
"epoch": 0.04044489383215369,
"grad_norm": 4.511194229125977,
"learning_rate": 0.00016888888888888889,
"loss": 1.1729,
"mean_token_accuracy": 0.7189365439116955,
"num_tokens": 1392035.0,
"step": 20
},
{
"epoch": 0.042467138523761376,
"grad_norm": 4.869667053222656,
"learning_rate": 0.00017777777777777779,
"loss": 0.965,
"mean_token_accuracy": 0.7327957898378372,
"num_tokens": 1474776.0,
"step": 21
},
{
"epoch": 0.044489383215369056,
"grad_norm": 3.513063430786133,
"learning_rate": 0.0001866666666666667,
"loss": 0.958,
"mean_token_accuracy": 0.7463030181825161,
"num_tokens": 1546445.0,
"step": 22
},
{
"epoch": 0.046511627906976744,
"grad_norm": 2.169617176055908,
"learning_rate": 0.00019555555555555556,
"loss": 0.9572,
"mean_token_accuracy": 0.748451080173254,
"num_tokens": 1614331.0,
"step": 23
},
{
"epoch": 0.04853387259858443,
"grad_norm": 1.2484831809997559,
"learning_rate": 0.00020444444444444443,
"loss": 0.8834,
"mean_token_accuracy": 0.7673822268843651,
"num_tokens": 1679566.0,
"step": 24
},
{
"epoch": 0.05055611729019211,
"grad_norm": 1.0600098371505737,
"learning_rate": 0.00021333333333333333,
"loss": 0.8514,
"mean_token_accuracy": 0.7709708698093891,
"num_tokens": 1741770.0,
"step": 25
},
{
"epoch": 0.0525783619817998,
"grad_norm": 1.095992922782898,
"learning_rate": 0.00022222222222222223,
"loss": 0.8617,
"mean_token_accuracy": 0.7583519890904427,
"num_tokens": 1806990.0,
"step": 26
},
{
"epoch": 0.054600606673407485,
"grad_norm": 1.0006545782089233,
"learning_rate": 0.0002311111111111111,
"loss": 0.7725,
"mean_token_accuracy": 0.7819164581596851,
"num_tokens": 1872686.0,
"step": 27
},
{
"epoch": 0.056622851365015166,
"grad_norm": 0.6671711802482605,
"learning_rate": 0.00024,
"loss": 0.6548,
"mean_token_accuracy": 0.8015744872391224,
"num_tokens": 1943614.0,
"step": 28
},
{
"epoch": 0.05864509605662285,
"grad_norm": 0.47610151767730713,
"learning_rate": 0.0002488888888888889,
"loss": 0.6524,
"mean_token_accuracy": 0.8063510619103909,
"num_tokens": 2021034.0,
"step": 29
},
{
"epoch": 0.06066734074823053,
"grad_norm": 0.5676872730255127,
"learning_rate": 0.00025777777777777783,
"loss": 0.7402,
"mean_token_accuracy": 0.784897617995739,
"num_tokens": 2087348.0,
"step": 30
},
{
"epoch": 0.06268958543983821,
"grad_norm": 0.6818390488624573,
"learning_rate": 0.0002666666666666667,
"loss": 0.6894,
"mean_token_accuracy": 0.8017890304327011,
"num_tokens": 2154170.0,
"step": 31
},
{
"epoch": 0.06471183013144591,
"grad_norm": 0.5972866415977478,
"learning_rate": 0.0002755555555555556,
"loss": 0.612,
"mean_token_accuracy": 0.8184943534433842,
"num_tokens": 2229392.0,
"step": 32
},
{
"epoch": 0.06673407482305359,
"grad_norm": 0.4275088608264923,
"learning_rate": 0.0002844444444444445,
"loss": 0.5885,
"mean_token_accuracy": 0.8229578360915184,
"num_tokens": 2290048.0,
"step": 33
},
{
"epoch": 0.06875631951466127,
"grad_norm": 0.3523823618888855,
"learning_rate": 0.0002933333333333333,
"loss": 0.5766,
"mean_token_accuracy": 0.82804736495018,
"num_tokens": 2360740.0,
"step": 34
},
{
"epoch": 0.07077856420626896,
"grad_norm": 0.45881009101867676,
"learning_rate": 0.0003022222222222222,
"loss": 0.6217,
"mean_token_accuracy": 0.8134612888097763,
"num_tokens": 2419828.0,
"step": 35
},
{
"epoch": 0.07280080889787664,
"grad_norm": 0.46817246079444885,
"learning_rate": 0.0003111111111111111,
"loss": 0.5311,
"mean_token_accuracy": 0.8388609476387501,
"num_tokens": 2483206.0,
"step": 36
},
{
"epoch": 0.07482305358948432,
"grad_norm": 0.36155763268470764,
"learning_rate": 0.00032,
"loss": 0.5268,
"mean_token_accuracy": 0.8369965106248856,
"num_tokens": 2556908.0,
"step": 37
},
{
"epoch": 0.07684529828109202,
"grad_norm": 0.36704790592193604,
"learning_rate": 0.00032888888888888887,
"loss": 0.5548,
"mean_token_accuracy": 0.8294766061007977,
"num_tokens": 2626172.0,
"step": 38
},
{
"epoch": 0.0788675429726997,
"grad_norm": 0.3038175106048584,
"learning_rate": 0.00033777777777777777,
"loss": 0.5232,
"mean_token_accuracy": 0.8495447933673859,
"num_tokens": 2693541.0,
"step": 39
},
{
"epoch": 0.08088978766430738,
"grad_norm": 0.30305811762809753,
"learning_rate": 0.00034666666666666667,
"loss": 0.4882,
"mean_token_accuracy": 0.8428361192345619,
"num_tokens": 2758471.0,
"step": 40
},
{
"epoch": 0.08291203235591507,
"grad_norm": 0.33424293994903564,
"learning_rate": 0.00035555555555555557,
"loss": 0.508,
"mean_token_accuracy": 0.8437883704900742,
"num_tokens": 2826093.0,
"step": 41
},
{
"epoch": 0.08493427704752275,
"grad_norm": 0.3217228651046753,
"learning_rate": 0.00036444444444444447,
"loss": 0.5045,
"mean_token_accuracy": 0.8461326025426388,
"num_tokens": 2893222.0,
"step": 42
},
{
"epoch": 0.08695652173913043,
"grad_norm": 0.26564908027648926,
"learning_rate": 0.0003733333333333334,
"loss": 0.5068,
"mean_token_accuracy": 0.8433473333716393,
"num_tokens": 2956663.0,
"step": 43
},
{
"epoch": 0.08897876643073811,
"grad_norm": 0.25354474782943726,
"learning_rate": 0.0003822222222222223,
"loss": 0.4609,
"mean_token_accuracy": 0.8593583293259144,
"num_tokens": 3020245.0,
"step": 44
},
{
"epoch": 0.0910010111223458,
"grad_norm": 0.31298667192459106,
"learning_rate": 0.0003911111111111111,
"loss": 0.4884,
"mean_token_accuracy": 0.8503717556595802,
"num_tokens": 3091022.0,
"step": 45
},
{
"epoch": 0.09302325581395349,
"grad_norm": 0.23926222324371338,
"learning_rate": 0.0004,
"loss": 0.4635,
"mean_token_accuracy": 0.8578044883906841,
"num_tokens": 3167731.0,
"step": 46
},
{
"epoch": 0.09504550050556117,
"grad_norm": 0.23057548701763153,
"learning_rate": 0.00039999957163192333,
"loss": 0.4464,
"mean_token_accuracy": 0.8583495616912842,
"num_tokens": 3230183.0,
"step": 47
},
{
"epoch": 0.09706774519716886,
"grad_norm": 0.22786663472652435,
"learning_rate": 0.0003999982865297322,
"loss": 0.4165,
"mean_token_accuracy": 0.8637920096516609,
"num_tokens": 3300798.0,
"step": 48
},
{
"epoch": 0.09908998988877654,
"grad_norm": 0.27733081579208374,
"learning_rate": 0.0003999961446995433,
"loss": 0.4348,
"mean_token_accuracy": 0.8584615886211395,
"num_tokens": 3368808.0,
"step": 49
},
{
"epoch": 0.10111223458038422,
"grad_norm": 0.2632873058319092,
"learning_rate": 0.00039999314615155084,
"loss": 0.4545,
"mean_token_accuracy": 0.8571835160255432,
"num_tokens": 3436471.0,
"step": 50
},
{
"epoch": 0.10313447927199192,
"grad_norm": 0.20401886105537415,
"learning_rate": 0.000399989290900027,
"loss": 0.426,
"mean_token_accuracy": 0.8630774058401585,
"num_tokens": 3504251.0,
"step": 51
},
{
"epoch": 0.1051567239635996,
"grad_norm": 0.2126135528087616,
"learning_rate": 0.0003999845789633213,
"loss": 0.4209,
"mean_token_accuracy": 0.8644996210932732,
"num_tokens": 3569455.0,
"step": 52
},
{
"epoch": 0.10717896865520728,
"grad_norm": 0.20767471194267273,
"learning_rate": 0.00039997901036386093,
"loss": 0.4312,
"mean_token_accuracy": 0.8648513294756413,
"num_tokens": 3633701.0,
"step": 53
},
{
"epoch": 0.10920121334681497,
"grad_norm": 0.19368676841259003,
"learning_rate": 0.0003999725851281504,
"loss": 0.4219,
"mean_token_accuracy": 0.8675987049937248,
"num_tokens": 3700579.0,
"step": 54
},
{
"epoch": 0.11122345803842265,
"grad_norm": 0.19997400045394897,
"learning_rate": 0.0003999653032867717,
"loss": 0.4305,
"mean_token_accuracy": 0.8599656298756599,
"num_tokens": 3766515.0,
"step": 55
},
{
"epoch": 0.11324570273003033,
"grad_norm": 0.19456814229488373,
"learning_rate": 0.00039995716487438367,
"loss": 0.4084,
"mean_token_accuracy": 0.8680460080504417,
"num_tokens": 3832179.0,
"step": 56
},
{
"epoch": 0.11526794742163801,
"grad_norm": 0.19756172597408295,
"learning_rate": 0.00039994816992972227,
"loss": 0.4199,
"mean_token_accuracy": 0.8612547963857651,
"num_tokens": 3898904.0,
"step": 57
},
{
"epoch": 0.1172901921132457,
"grad_norm": 0.1712576448917389,
"learning_rate": 0.0003999383184956003,
"loss": 0.36,
"mean_token_accuracy": 0.879060622304678,
"num_tokens": 3976416.0,
"step": 58
},
{
"epoch": 0.11931243680485339,
"grad_norm": 0.20002008974552155,
"learning_rate": 0.00039992761061890717,
"loss": 0.4269,
"mean_token_accuracy": 0.8589905127882957,
"num_tokens": 4036526.0,
"step": 59
},
{
"epoch": 0.12133468149646107,
"grad_norm": 0.1924401819705963,
"learning_rate": 0.00039991604635060835,
"loss": 0.4268,
"mean_token_accuracy": 0.8678371347486973,
"num_tokens": 4100376.0,
"step": 60
},
{
"epoch": 0.12335692618806876,
"grad_norm": 0.17639940977096558,
"learning_rate": 0.00039990362574574586,
"loss": 0.3919,
"mean_token_accuracy": 0.8658471070230007,
"num_tokens": 4165704.0,
"step": 61
},
{
"epoch": 0.12537917087967643,
"grad_norm": 0.1817377358675003,
"learning_rate": 0.00039989034886343724,
"loss": 0.3735,
"mean_token_accuracy": 0.8759783655405045,
"num_tokens": 4234412.0,
"step": 62
},
{
"epoch": 0.12740141557128412,
"grad_norm": 0.18214447796344757,
"learning_rate": 0.00039987621576687585,
"loss": 0.3454,
"mean_token_accuracy": 0.8825861141085625,
"num_tokens": 4307593.0,
"step": 63
},
{
"epoch": 0.12942366026289182,
"grad_norm": 0.18159601092338562,
"learning_rate": 0.0003998612265233302,
"loss": 0.3672,
"mean_token_accuracy": 0.8755885139107704,
"num_tokens": 4376630.0,
"step": 64
},
{
"epoch": 0.13144590495449948,
"grad_norm": 0.17050184309482574,
"learning_rate": 0.00039984538120414363,
"loss": 0.3333,
"mean_token_accuracy": 0.8833661302924156,
"num_tokens": 4449580.0,
"step": 65
},
{
"epoch": 0.13346814964610718,
"grad_norm": 0.20457544922828674,
"learning_rate": 0.0003998286798847344,
"loss": 0.4182,
"mean_token_accuracy": 0.8619738966226578,
"num_tokens": 4518076.0,
"step": 66
},
{
"epoch": 0.13549039433771487,
"grad_norm": 0.196366086602211,
"learning_rate": 0.00039981112264459486,
"loss": 0.3386,
"mean_token_accuracy": 0.8908565118908882,
"num_tokens": 4581622.0,
"step": 67
},
{
"epoch": 0.13751263902932254,
"grad_norm": 0.18182213604450226,
"learning_rate": 0.00039979270956729115,
"loss": 0.3999,
"mean_token_accuracy": 0.8703116998076439,
"num_tokens": 4646580.0,
"step": 68
},
{
"epoch": 0.13953488372093023,
"grad_norm": 0.18271780014038086,
"learning_rate": 0.0003997734407404631,
"loss": 0.3504,
"mean_token_accuracy": 0.8762697987258434,
"num_tokens": 4716771.0,
"step": 69
},
{
"epoch": 0.14155712841253792,
"grad_norm": 0.19590984284877777,
"learning_rate": 0.0003997533162558233,
"loss": 0.3753,
"mean_token_accuracy": 0.8757792375981808,
"num_tokens": 4789100.0,
"step": 70
},
{
"epoch": 0.1435793731041456,
"grad_norm": 0.23697857558727264,
"learning_rate": 0.00039973233620915733,
"loss": 0.4225,
"mean_token_accuracy": 0.8598962388932705,
"num_tokens": 4851640.0,
"step": 71
},
{
"epoch": 0.14560161779575329,
"grad_norm": 0.19626037776470184,
"learning_rate": 0.0003997105007003228,
"loss": 0.3572,
"mean_token_accuracy": 0.8849809169769287,
"num_tokens": 4916098.0,
"step": 72
},
{
"epoch": 0.14762386248736098,
"grad_norm": 0.20964385569095612,
"learning_rate": 0.00039968780983324893,
"loss": 0.3507,
"mean_token_accuracy": 0.8796872869133949,
"num_tokens": 4979744.0,
"step": 73
},
{
"epoch": 0.14964610717896865,
"grad_norm": 0.18054573237895966,
"learning_rate": 0.00039966426371593607,
"loss": 0.3683,
"mean_token_accuracy": 0.8814638741314411,
"num_tokens": 5050714.0,
"step": 74
},
{
"epoch": 0.15166835187057634,
"grad_norm": 0.16331350803375244,
"learning_rate": 0.0003996398624604556,
"loss": 0.3406,
"mean_token_accuracy": 0.8873084634542465,
"num_tokens": 5130559.0,
"step": 75
},
{
"epoch": 0.15369059656218403,
"grad_norm": 0.20746077597141266,
"learning_rate": 0.0003996146061829487,
"loss": 0.3762,
"mean_token_accuracy": 0.8765941001474857,
"num_tokens": 5199691.0,
"step": 76
},
{
"epoch": 0.1557128412537917,
"grad_norm": 0.16679136455059052,
"learning_rate": 0.0003995884950036263,
"loss": 0.3691,
"mean_token_accuracy": 0.8776806406676769,
"num_tokens": 5277116.0,
"step": 77
},
{
"epoch": 0.1577350859453994,
"grad_norm": 0.2159774750471115,
"learning_rate": 0.00039956152904676835,
"loss": 0.4017,
"mean_token_accuracy": 0.8709179721772671,
"num_tokens": 5343258.0,
"step": 78
},
{
"epoch": 0.1597573306370071,
"grad_norm": 0.16525208950042725,
"learning_rate": 0.00039953370844072333,
"loss": 0.372,
"mean_token_accuracy": 0.8759802021086216,
"num_tokens": 5418084.0,
"step": 79
},
{
"epoch": 0.16177957532861476,
"grad_norm": 0.18534427881240845,
"learning_rate": 0.00039950503331790744,
"loss": 0.4236,
"mean_token_accuracy": 0.8610594123601913,
"num_tokens": 5483557.0,
"step": 80
},
{
"epoch": 0.16380182002022245,
"grad_norm": 0.17822565138339996,
"learning_rate": 0.000399475503814804,
"loss": 0.347,
"mean_token_accuracy": 0.883899986743927,
"num_tokens": 5559324.0,
"step": 81
},
{
"epoch": 0.16582406471183014,
"grad_norm": 0.16568556427955627,
"learning_rate": 0.00039944512007196307,
"loss": 0.3046,
"mean_token_accuracy": 0.8883480541408062,
"num_tokens": 5646732.0,
"step": 82
},
{
"epoch": 0.1678463094034378,
"grad_norm": 0.20850011706352234,
"learning_rate": 0.0003994138822340004,
"loss": 0.3727,
"mean_token_accuracy": 0.8808489926159382,
"num_tokens": 5709555.0,
"step": 83
},
{
"epoch": 0.1698685540950455,
"grad_norm": 0.19419965147972107,
"learning_rate": 0.00039938179044959714,
"loss": 0.3667,
"mean_token_accuracy": 0.8805488795042038,
"num_tokens": 5779149.0,
"step": 84
},
{
"epoch": 0.1718907987866532,
"grad_norm": 0.21039818227291107,
"learning_rate": 0.0003993488448714986,
"loss": 0.3912,
"mean_token_accuracy": 0.8791179358959198,
"num_tokens": 5850163.0,
"step": 85
},
{
"epoch": 0.17391304347826086,
"grad_norm": 0.2167867124080658,
"learning_rate": 0.00039931504565651424,
"loss": 0.3571,
"mean_token_accuracy": 0.8792387843132019,
"num_tokens": 5916129.0,
"step": 86
},
{
"epoch": 0.17593528816986856,
"grad_norm": 0.2154702991247177,
"learning_rate": 0.0003992803929655162,
"loss": 0.3868,
"mean_token_accuracy": 0.8748185895383358,
"num_tokens": 5979082.0,
"step": 87
},
{
"epoch": 0.17795753286147623,
"grad_norm": 0.1713341772556305,
"learning_rate": 0.00039924488696343915,
"loss": 0.338,
"mean_token_accuracy": 0.8834210820496082,
"num_tokens": 6048831.0,
"step": 88
},
{
"epoch": 0.17997977755308392,
"grad_norm": 0.20742323994636536,
"learning_rate": 0.00039920852781927886,
"loss": 0.3911,
"mean_token_accuracy": 0.868148323148489,
"num_tokens": 6114503.0,
"step": 89
},
{
"epoch": 0.1820020222446916,
"grad_norm": 0.18235628306865692,
"learning_rate": 0.0003991713157060922,
"loss": 0.3169,
"mean_token_accuracy": 0.8923499137163162,
"num_tokens": 6184293.0,
"step": 90
},
{
"epoch": 0.18402426693629928,
"grad_norm": 0.18693064153194427,
"learning_rate": 0.00039913325080099545,
"loss": 0.3678,
"mean_token_accuracy": 0.8744825124740601,
"num_tokens": 6252712.0,
"step": 91
},
{
"epoch": 0.18604651162790697,
"grad_norm": 0.19899111986160278,
"learning_rate": 0.0003990943332851641,
"loss": 0.3497,
"mean_token_accuracy": 0.8849819526076317,
"num_tokens": 6313767.0,
"step": 92
},
{
"epoch": 0.18806875631951467,
"grad_norm": 0.19068098068237305,
"learning_rate": 0.0003990545633438318,
"loss": 0.3492,
"mean_token_accuracy": 0.8846092559397221,
"num_tokens": 6382110.0,
"step": 93
},
{
"epoch": 0.19009100101112233,
"grad_norm": 0.19140516221523285,
"learning_rate": 0.0003990139411662892,
"loss": 0.3434,
"mean_token_accuracy": 0.8847804144024849,
"num_tokens": 6445880.0,
"step": 94
},
{
"epoch": 0.19211324570273003,
"grad_norm": 0.22566284239292145,
"learning_rate": 0.00039897246694588364,
"loss": 0.3726,
"mean_token_accuracy": 0.8737127743661404,
"num_tokens": 6512190.0,
"step": 95
},
{
"epoch": 0.19413549039433772,
"grad_norm": 0.193269744515419,
"learning_rate": 0.00039893014088001754,
"loss": 0.3689,
"mean_token_accuracy": 0.8768584616482258,
"num_tokens": 6581328.0,
"step": 96
},
{
"epoch": 0.1961577350859454,
"grad_norm": 0.19110015034675598,
"learning_rate": 0.00039888696317014807,
"loss": 0.3307,
"mean_token_accuracy": 0.8812081180512905,
"num_tokens": 6653124.0,
"step": 97
},
{
"epoch": 0.19817997977755308,
"grad_norm": 0.18114197254180908,
"learning_rate": 0.00039884293402178575,
"loss": 0.3451,
"mean_token_accuracy": 0.8798027820885181,
"num_tokens": 6723465.0,
"step": 98
},
{
"epoch": 0.20020222446916078,
"grad_norm": 0.19303397834300995,
"learning_rate": 0.0003987980536444938,
"loss": 0.334,
"mean_token_accuracy": 0.8881032280623913,
"num_tokens": 6801637.0,
"step": 99
},
{
"epoch": 0.20222446916076844,
"grad_norm": 0.1839206963777542,
"learning_rate": 0.0003987523222518868,
"loss": 0.3344,
"mean_token_accuracy": 0.8791452720761299,
"num_tokens": 6879826.0,
"step": 100
},
{
"epoch": 0.20424671385237614,
"grad_norm": 0.1716805100440979,
"learning_rate": 0.0003987057400616299,
"loss": 0.3494,
"mean_token_accuracy": 0.8803286664187908,
"num_tokens": 6958940.0,
"step": 101
},
{
"epoch": 0.20626895854398383,
"grad_norm": 0.218710795044899,
"learning_rate": 0.000398658307295438,
"loss": 0.3696,
"mean_token_accuracy": 0.8783976249396801,
"num_tokens": 7019640.0,
"step": 102
},
{
"epoch": 0.2082912032355915,
"grad_norm": 0.2176671177148819,
"learning_rate": 0.0003986100241790741,
"loss": 0.3778,
"mean_token_accuracy": 0.8742088116705418,
"num_tokens": 7083893.0,
"step": 103
},
{
"epoch": 0.2103134479271992,
"grad_norm": 0.20480629801750183,
"learning_rate": 0.0003985608909423487,
"loss": 0.3644,
"mean_token_accuracy": 0.8779697194695473,
"num_tokens": 7146243.0,
"step": 104
},
{
"epoch": 0.2123356926188069,
"grad_norm": 0.21523724496364594,
"learning_rate": 0.0003985109078191187,
"loss": 0.3384,
"mean_token_accuracy": 0.8801298663020134,
"num_tokens": 7211820.0,
"step": 105
},
{
"epoch": 0.21435793731041455,
"grad_norm": 0.2035398781299591,
"learning_rate": 0.00039846007504728593,
"loss": 0.3553,
"mean_token_accuracy": 0.8752279430627823,
"num_tokens": 7280455.0,
"step": 106
},
{
"epoch": 0.21638018200202225,
"grad_norm": 0.1565598100423813,
"learning_rate": 0.00039840839286879636,
"loss": 0.3034,
"mean_token_accuracy": 0.8931353390216827,
"num_tokens": 7357510.0,
"step": 107
},
{
"epoch": 0.21840242669362994,
"grad_norm": 0.17082397639751434,
"learning_rate": 0.00039835586152963884,
"loss": 0.3135,
"mean_token_accuracy": 0.883228026330471,
"num_tokens": 7428821.0,
"step": 108
},
{
"epoch": 0.2204246713852376,
"grad_norm": 0.18526601791381836,
"learning_rate": 0.0003983024812798439,
"loss": 0.3156,
"mean_token_accuracy": 0.88564358279109,
"num_tokens": 7494951.0,
"step": 109
},
{
"epoch": 0.2224469160768453,
"grad_norm": 0.20190876722335815,
"learning_rate": 0.0003982482523734827,
"loss": 0.3393,
"mean_token_accuracy": 0.8834404349327087,
"num_tokens": 7558067.0,
"step": 110
},
{
"epoch": 0.224469160768453,
"grad_norm": 0.1943565011024475,
"learning_rate": 0.00039819317506866543,
"loss": 0.3582,
"mean_token_accuracy": 0.8790641874074936,
"num_tokens": 7630543.0,
"step": 111
},
{
"epoch": 0.22649140546006066,
"grad_norm": 0.2155260592699051,
"learning_rate": 0.00039813724962754066,
"loss": 0.3514,
"mean_token_accuracy": 0.8799824342131615,
"num_tokens": 7693798.0,
"step": 112
},
{
"epoch": 0.22851365015166836,
"grad_norm": 0.17986060678958893,
"learning_rate": 0.00039808047631629363,
"loss": 0.3361,
"mean_token_accuracy": 0.8870190940797329,
"num_tokens": 7763267.0,
"step": 113
},
{
"epoch": 0.23053589484327602,
"grad_norm": 0.18999366462230682,
"learning_rate": 0.00039802285540514504,
"loss": 0.325,
"mean_token_accuracy": 0.8898543640971184,
"num_tokens": 7834437.0,
"step": 114
},
{
"epoch": 0.23255813953488372,
"grad_norm": 0.20567375421524048,
"learning_rate": 0.0003979643871683501,
"loss": 0.3734,
"mean_token_accuracy": 0.8742238134145737,
"num_tokens": 7896274.0,
"step": 115
},
{
"epoch": 0.2345803842264914,
"grad_norm": 0.18579523265361786,
"learning_rate": 0.000397905071884197,
"loss": 0.3543,
"mean_token_accuracy": 0.8827438056468964,
"num_tokens": 7962304.0,
"step": 116
},
{
"epoch": 0.23660262891809908,
"grad_norm": 0.1895459145307541,
"learning_rate": 0.00039784490983500514,
"loss": 0.2899,
"mean_token_accuracy": 0.888210829347372,
"num_tokens": 8037020.0,
"step": 117
},
{
"epoch": 0.23862487360970677,
"grad_norm": 0.1934623420238495,
"learning_rate": 0.0003977839013071248,
"loss": 0.3172,
"mean_token_accuracy": 0.8874295391142368,
"num_tokens": 8106669.0,
"step": 118
},
{
"epoch": 0.24064711830131447,
"grad_norm": 0.18337437510490417,
"learning_rate": 0.0003977220465909348,
"loss": 0.328,
"mean_token_accuracy": 0.884034089744091,
"num_tokens": 8174813.0,
"step": 119
},
{
"epoch": 0.24266936299292213,
"grad_norm": 0.18985910713672638,
"learning_rate": 0.00039765934598084176,
"loss": 0.3396,
"mean_token_accuracy": 0.8789964653551579,
"num_tokens": 8247396.0,
"step": 120
},
{
"epoch": 0.24469160768452983,
"grad_norm": 0.20584100484848022,
"learning_rate": 0.0003975957997752783,
"loss": 0.3537,
"mean_token_accuracy": 0.8752495422959328,
"num_tokens": 8310521.0,
"step": 121
},
{
"epoch": 0.24671385237613752,
"grad_norm": 0.20211565494537354,
"learning_rate": 0.00039753140827670163,
"loss": 0.3607,
"mean_token_accuracy": 0.877599012106657,
"num_tokens": 8374419.0,
"step": 122
},
{
"epoch": 0.2487360970677452,
"grad_norm": 0.21102474629878998,
"learning_rate": 0.00039746617179159274,
"loss": 0.3411,
"mean_token_accuracy": 0.8837038949131966,
"num_tokens": 8436270.0,
"step": 123
},
{
"epoch": 0.25075834175935285,
"grad_norm": 0.22104637324810028,
"learning_rate": 0.000397400090630454,
"loss": 0.3467,
"mean_token_accuracy": 0.8838667124509811,
"num_tokens": 8496108.0,
"step": 124
},
{
"epoch": 0.2527805864509606,
"grad_norm": 0.23561948537826538,
"learning_rate": 0.0003973331651078084,
"loss": 0.3933,
"mean_token_accuracy": 0.864571388810873,
"num_tokens": 8558449.0,
"step": 125
},
{
"epoch": 0.25480283114256824,
"grad_norm": 0.17975358664989471,
"learning_rate": 0.00039726539554219746,
"loss": 0.3246,
"mean_token_accuracy": 0.8921530395746231,
"num_tokens": 8633747.0,
"step": 126
},
{
"epoch": 0.2568250758341759,
"grad_norm": 0.19475312530994415,
"learning_rate": 0.0003971967822561805,
"loss": 0.359,
"mean_token_accuracy": 0.878424908965826,
"num_tokens": 8700730.0,
"step": 127
},
{
"epoch": 0.25884732052578363,
"grad_norm": 0.23659245669841766,
"learning_rate": 0.0003971273255763324,
"loss": 0.3606,
"mean_token_accuracy": 0.8830053992569447,
"num_tokens": 8760014.0,
"step": 128
},
{
"epoch": 0.2608695652173913,
"grad_norm": 0.19543145596981049,
"learning_rate": 0.0003970570258332422,
"loss": 0.3309,
"mean_token_accuracy": 0.8853320479393005,
"num_tokens": 8824736.0,
"step": 129
},
{
"epoch": 0.26289180990899896,
"grad_norm": 0.2464882731437683,
"learning_rate": 0.0003969858833615119,
"loss": 0.3589,
"mean_token_accuracy": 0.8793282993137836,
"num_tokens": 8887323.0,
"step": 130
},
{
"epoch": 0.2649140546006067,
"grad_norm": 0.16774067282676697,
"learning_rate": 0.0003969138984997542,
"loss": 0.3198,
"mean_token_accuracy": 0.8886825554072857,
"num_tokens": 8965857.0,
"step": 131
},
{
"epoch": 0.26693629929221435,
"grad_norm": 0.19399577379226685,
"learning_rate": 0.00039684107159059174,
"loss": 0.3468,
"mean_token_accuracy": 0.8808378390967846,
"num_tokens": 9039028.0,
"step": 132
},
{
"epoch": 0.268958543983822,
"grad_norm": 0.1961926966905594,
"learning_rate": 0.00039676740298065467,
"loss": 0.3501,
"mean_token_accuracy": 0.8791337199509144,
"num_tokens": 9108645.0,
"step": 133
},
{
"epoch": 0.27098078867542974,
"grad_norm": 0.16180327534675598,
"learning_rate": 0.00039669289302057955,
"loss": 0.3291,
"mean_token_accuracy": 0.889164712280035,
"num_tokens": 9182295.0,
"step": 134
},
{
"epoch": 0.2730030333670374,
"grad_norm": 0.18792307376861572,
"learning_rate": 0.00039661754206500723,
"loss": 0.305,
"mean_token_accuracy": 0.890954252332449,
"num_tokens": 9253798.0,
"step": 135
},
{
"epoch": 0.2750252780586451,
"grad_norm": 0.2211407721042633,
"learning_rate": 0.0003965413504725815,
"loss": 0.3516,
"mean_token_accuracy": 0.8829210363328457,
"num_tokens": 9319632.0,
"step": 136
},
{
"epoch": 0.2770475227502528,
"grad_norm": 0.15928597748279572,
"learning_rate": 0.0003964643186059474,
"loss": 0.3209,
"mean_token_accuracy": 0.8902908116579056,
"num_tokens": 9396460.0,
"step": 137
},
{
"epoch": 0.27906976744186046,
"grad_norm": 0.25479844212532043,
"learning_rate": 0.00039638644683174937,
"loss": 0.3247,
"mean_token_accuracy": 0.8880501836538315,
"num_tokens": 9460466.0,
"step": 138
},
{
"epoch": 0.2810920121334681,
"grad_norm": 0.17745117843151093,
"learning_rate": 0.00039630773552062925,
"loss": 0.3383,
"mean_token_accuracy": 0.8863355927169323,
"num_tokens": 9532155.0,
"step": 139
},
{
"epoch": 0.28311425682507585,
"grad_norm": 0.22157195210456848,
"learning_rate": 0.0003962281850472251,
"loss": 0.3499,
"mean_token_accuracy": 0.879049763083458,
"num_tokens": 9590255.0,
"step": 140
},
{
"epoch": 0.2851365015166835,
"grad_norm": 0.1807304471731186,
"learning_rate": 0.0003961477957901689,
"loss": 0.3065,
"mean_token_accuracy": 0.8949154578149319,
"num_tokens": 9667027.0,
"step": 141
},
{
"epoch": 0.2871587462082912,
"grad_norm": 0.23244738578796387,
"learning_rate": 0.00039606656813208504,
"loss": 0.3608,
"mean_token_accuracy": 0.8768214285373688,
"num_tokens": 9723117.0,
"step": 142
},
{
"epoch": 0.2891809908998989,
"grad_norm": 0.18404552340507507,
"learning_rate": 0.0003959845024595883,
"loss": 0.2972,
"mean_token_accuracy": 0.8935975506901741,
"num_tokens": 9792714.0,
"step": 143
},
{
"epoch": 0.29120323559150657,
"grad_norm": 0.21092693507671356,
"learning_rate": 0.00039590159916328224,
"loss": 0.3552,
"mean_token_accuracy": 0.8813748992979527,
"num_tokens": 9846790.0,
"step": 144
},
{
"epoch": 0.29322548028311424,
"grad_norm": 0.18293221294879913,
"learning_rate": 0.00039581785863775705,
"loss": 0.3497,
"mean_token_accuracy": 0.8868285343050957,
"num_tokens": 9920682.0,
"step": 145
},
{
"epoch": 0.29524772497472196,
"grad_norm": 0.23161938786506653,
"learning_rate": 0.00039573328128158803,
"loss": 0.3671,
"mean_token_accuracy": 0.8772343806922436,
"num_tokens": 9989629.0,
"step": 146
},
{
"epoch": 0.2972699696663296,
"grad_norm": 0.19797147810459137,
"learning_rate": 0.0003956478674973333,
"loss": 0.356,
"mean_token_accuracy": 0.8782718777656555,
"num_tokens": 10048794.0,
"step": 147
},
{
"epoch": 0.2992922143579373,
"grad_norm": 0.18177340924739838,
"learning_rate": 0.00039556161769153226,
"loss": 0.3122,
"mean_token_accuracy": 0.8886930793523788,
"num_tokens": 10116701.0,
"step": 148
},
{
"epoch": 0.301314459049545,
"grad_norm": 0.24357731640338898,
"learning_rate": 0.0003954745322747034,
"loss": 0.344,
"mean_token_accuracy": 0.8848157115280628,
"num_tokens": 10176439.0,
"step": 149
},
{
"epoch": 0.3033367037411527,
"grad_norm": 0.18051762878894806,
"learning_rate": 0.00039538661166134236,
"loss": 0.3134,
"mean_token_accuracy": 0.8913725949823856,
"num_tokens": 10248461.0,
"step": 150
},
{
"epoch": 0.30535894843276035,
"grad_norm": 0.20022518932819366,
"learning_rate": 0.00039529785626992006,
"loss": 0.3436,
"mean_token_accuracy": 0.8848014548420906,
"num_tokens": 10310254.0,
"step": 151
},
{
"epoch": 0.30738119312436807,
"grad_norm": 0.23199647665023804,
"learning_rate": 0.0003952082665228808,
"loss": 0.3233,
"mean_token_accuracy": 0.8871180489659309,
"num_tokens": 10375248.0,
"step": 152
},
{
"epoch": 0.30940343781597573,
"grad_norm": 0.18778662383556366,
"learning_rate": 0.00039511784284663976,
"loss": 0.3044,
"mean_token_accuracy": 0.8951373845338821,
"num_tokens": 10442606.0,
"step": 153
},
{
"epoch": 0.3114256825075834,
"grad_norm": 0.17572450637817383,
"learning_rate": 0.0003950265856715818,
"loss": 0.3331,
"mean_token_accuracy": 0.8889199234545231,
"num_tokens": 10509923.0,
"step": 154
},
{
"epoch": 0.3134479271991911,
"grad_norm": 0.16929855942726135,
"learning_rate": 0.0003949344954320586,
"loss": 0.348,
"mean_token_accuracy": 0.8804797492921352,
"num_tokens": 10579730.0,
"step": 155
},
{
"epoch": 0.3154701718907988,
"grad_norm": 0.17655323445796967,
"learning_rate": 0.0003948415725663871,
"loss": 0.3293,
"mean_token_accuracy": 0.883028332144022,
"num_tokens": 10648731.0,
"step": 156
},
{
"epoch": 0.31749241658240646,
"grad_norm": 0.1909574717283249,
"learning_rate": 0.00039474781751684706,
"loss": 0.3183,
"mean_token_accuracy": 0.8886212892830372,
"num_tokens": 10713689.0,
"step": 157
},
{
"epoch": 0.3195146612740142,
"grad_norm": 0.17727530002593994,
"learning_rate": 0.00039465323072967936,
"loss": 0.3237,
"mean_token_accuracy": 0.8898195438086987,
"num_tokens": 10785736.0,
"step": 158
},
{
"epoch": 0.32153690596562184,
"grad_norm": 0.18001440167427063,
"learning_rate": 0.00039455781265508355,
"loss": 0.332,
"mean_token_accuracy": 0.8871553801000118,
"num_tokens": 10856647.0,
"step": 159
},
{
"epoch": 0.3235591506572295,
"grad_norm": 0.19728383421897888,
"learning_rate": 0.0003944615637472158,
"loss": 0.3621,
"mean_token_accuracy": 0.8775678239762783,
"num_tokens": 10918872.0,
"step": 160
},
{
"epoch": 0.32558139534883723,
"grad_norm": 0.20538869500160217,
"learning_rate": 0.00039436448446418683,
"loss": 0.3633,
"mean_token_accuracy": 0.8745956718921661,
"num_tokens": 10981209.0,
"step": 161
},
{
"epoch": 0.3276036400404449,
"grad_norm": 0.19733993709087372,
"learning_rate": 0.00039426657526805937,
"loss": 0.3201,
"mean_token_accuracy": 0.8928566165268421,
"num_tokens": 11047089.0,
"step": 162
},
{
"epoch": 0.32962588473205257,
"grad_norm": 0.15281331539154053,
"learning_rate": 0.0003941678366248468,
"loss": 0.3003,
"mean_token_accuracy": 0.8931796550750732,
"num_tokens": 11122846.0,
"step": 163
},
{
"epoch": 0.3316481294236603,
"grad_norm": 0.17711788415908813,
"learning_rate": 0.00039406826900450977,
"loss": 0.3127,
"mean_token_accuracy": 0.892613273113966,
"num_tokens": 11197993.0,
"step": 164
},
{
"epoch": 0.33367037411526795,
"grad_norm": 0.2002251148223877,
"learning_rate": 0.00039396787288095497,
"loss": 0.3328,
"mean_token_accuracy": 0.8890563920140266,
"num_tokens": 11267855.0,
"step": 165
},
{
"epoch": 0.3356926188068756,
"grad_norm": 0.16182006895542145,
"learning_rate": 0.00039386664873203227,
"loss": 0.3251,
"mean_token_accuracy": 0.8839607983827591,
"num_tokens": 11344330.0,
"step": 166
},
{
"epoch": 0.33771486349848334,
"grad_norm": 0.16747458279132843,
"learning_rate": 0.00039376459703953284,
"loss": 0.3249,
"mean_token_accuracy": 0.8876189365983009,
"num_tokens": 11418350.0,
"step": 167
},
{
"epoch": 0.339737108190091,
"grad_norm": 0.1826547235250473,
"learning_rate": 0.0003936617182891864,
"loss": 0.3291,
"mean_token_accuracy": 0.8888828568160534,
"num_tokens": 11485723.0,
"step": 168
},
{
"epoch": 0.3417593528816987,
"grad_norm": 0.18488235771656036,
"learning_rate": 0.0003935580129706593,
"loss": 0.3097,
"mean_token_accuracy": 0.8907660692930222,
"num_tokens": 11551678.0,
"step": 169
},
{
"epoch": 0.3437815975733064,
"grad_norm": 0.23008394241333008,
"learning_rate": 0.00039345348157755213,
"loss": 0.3533,
"mean_token_accuracy": 0.8763989768922329,
"num_tokens": 11609063.0,
"step": 170
},
{
"epoch": 0.34580384226491406,
"grad_norm": 0.2060030996799469,
"learning_rate": 0.0003933481246073973,
"loss": 0.3399,
"mean_token_accuracy": 0.8879686929285526,
"num_tokens": 11673330.0,
"step": 171
},
{
"epoch": 0.34782608695652173,
"grad_norm": 0.17570629715919495,
"learning_rate": 0.0003932419425616565,
"loss": 0.3454,
"mean_token_accuracy": 0.8838200494647026,
"num_tokens": 11740475.0,
"step": 172
},
{
"epoch": 0.34984833164812945,
"grad_norm": 0.16710588335990906,
"learning_rate": 0.0003931349359457187,
"loss": 0.2969,
"mean_token_accuracy": 0.899805661290884,
"num_tokens": 11806954.0,
"step": 173
},
{
"epoch": 0.3518705763397371,
"grad_norm": 0.20197796821594238,
"learning_rate": 0.0003930271052688974,
"loss": 0.3525,
"mean_token_accuracy": 0.8779477626085281,
"num_tokens": 11870286.0,
"step": 174
},
{
"epoch": 0.3538928210313448,
"grad_norm": 0.17107857763767242,
"learning_rate": 0.0003929184510444284,
"loss": 0.3266,
"mean_token_accuracy": 0.8888569958508015,
"num_tokens": 11947117.0,
"step": 175
},
{
"epoch": 0.35591506572295245,
"grad_norm": 0.17827239632606506,
"learning_rate": 0.0003928089737894672,
"loss": 0.3252,
"mean_token_accuracy": 0.8897545039653778,
"num_tokens": 12009582.0,
"step": 176
},
{
"epoch": 0.3579373104145602,
"grad_norm": 0.22990773618221283,
"learning_rate": 0.00039269867402508675,
"loss": 0.3549,
"mean_token_accuracy": 0.8815719597041607,
"num_tokens": 12072827.0,
"step": 177
},
{
"epoch": 0.35995955510616784,
"grad_norm": 0.19108358025550842,
"learning_rate": 0.00039258755227627475,
"loss": 0.3549,
"mean_token_accuracy": 0.8812212906777859,
"num_tokens": 12141736.0,
"step": 178
},
{
"epoch": 0.3619817997977755,
"grad_norm": 0.19387130439281464,
"learning_rate": 0.0003924756090719314,
"loss": 0.3057,
"mean_token_accuracy": 0.8937871865928173,
"num_tokens": 12212850.0,
"step": 179
},
{
"epoch": 0.3640040444893832,
"grad_norm": 0.19616757333278656,
"learning_rate": 0.0003923628449448666,
"loss": 0.3337,
"mean_token_accuracy": 0.8879410326480865,
"num_tokens": 12278676.0,
"step": 180
},
{
"epoch": 0.3660262891809909,
"grad_norm": 0.19950613379478455,
"learning_rate": 0.0003922492604317976,
"loss": 0.333,
"mean_token_accuracy": 0.8837904818356037,
"num_tokens": 12344019.0,
"step": 181
},
{
"epoch": 0.36804853387259856,
"grad_norm": 0.18320327997207642,
"learning_rate": 0.0003921348560733464,
"loss": 0.3379,
"mean_token_accuracy": 0.8864001519978046,
"num_tokens": 12414279.0,
"step": 182
},
{
"epoch": 0.3700707785642063,
"grad_norm": 0.19148240983486176,
"learning_rate": 0.0003920196324140371,
"loss": 0.3438,
"mean_token_accuracy": 0.8869296424090862,
"num_tokens": 12481557.0,
"step": 183
},
{
"epoch": 0.37209302325581395,
"grad_norm": 0.16867059469223022,
"learning_rate": 0.00039190359000229364,
"loss": 0.3347,
"mean_token_accuracy": 0.8817239366471767,
"num_tokens": 12552783.0,
"step": 184
},
{
"epoch": 0.3741152679474216,
"grad_norm": 0.20269234478473663,
"learning_rate": 0.0003917867293904365,
"loss": 0.3599,
"mean_token_accuracy": 0.8779093511402607,
"num_tokens": 12611751.0,
"step": 185
},
{
"epoch": 0.37613751263902934,
"grad_norm": 0.1963576078414917,
"learning_rate": 0.0003916690511346809,
"loss": 0.3219,
"mean_token_accuracy": 0.8882619775831699,
"num_tokens": 12674136.0,
"step": 186
},
{
"epoch": 0.378159757330637,
"grad_norm": 0.1874200403690338,
"learning_rate": 0.0003915505557951335,
"loss": 0.2945,
"mean_token_accuracy": 0.8926714062690735,
"num_tokens": 12754627.0,
"step": 187
},
{
"epoch": 0.38018200202224467,
"grad_norm": 0.21084272861480713,
"learning_rate": 0.0003914312439357901,
"loss": 0.3492,
"mean_token_accuracy": 0.8815909698605537,
"num_tokens": 12812878.0,
"step": 188
},
{
"epoch": 0.3822042467138524,
"grad_norm": 0.21426641941070557,
"learning_rate": 0.00039131111612453293,
"loss": 0.3226,
"mean_token_accuracy": 0.8860650397837162,
"num_tokens": 12876950.0,
"step": 189
},
{
"epoch": 0.38422649140546006,
"grad_norm": 0.1843956857919693,
"learning_rate": 0.0003911901729331277,
"loss": 0.3012,
"mean_token_accuracy": 0.8955246210098267,
"num_tokens": 12940008.0,
"step": 190
},
{
"epoch": 0.3862487360970677,
"grad_norm": 0.16776444017887115,
"learning_rate": 0.00039106841493722103,
"loss": 0.2915,
"mean_token_accuracy": 0.8939312994480133,
"num_tokens": 13011277.0,
"step": 191
},
{
"epoch": 0.38827098078867545,
"grad_norm": 0.21435709297657013,
"learning_rate": 0.0003909458427163379,
"loss": 0.3297,
"mean_token_accuracy": 0.8883927799761295,
"num_tokens": 13076795.0,
"step": 192
},
{
"epoch": 0.3902932254802831,
"grad_norm": 0.18475346267223358,
"learning_rate": 0.00039082245685387855,
"loss": 0.3322,
"mean_token_accuracy": 0.8888528421521187,
"num_tokens": 13142952.0,
"step": 193
},
{
"epoch": 0.3923154701718908,
"grad_norm": 0.19243639707565308,
"learning_rate": 0.00039069825793711587,
"loss": 0.3213,
"mean_token_accuracy": 0.8921789862215519,
"num_tokens": 13211022.0,
"step": 194
},
{
"epoch": 0.3943377148634985,
"grad_norm": 0.1858910322189331,
"learning_rate": 0.0003905732465571928,
"loss": 0.3179,
"mean_token_accuracy": 0.8920286670327187,
"num_tokens": 13276701.0,
"step": 195
},
{
"epoch": 0.39635995955510617,
"grad_norm": 0.20470379292964935,
"learning_rate": 0.0003904474233091191,
"loss": 0.3189,
"mean_token_accuracy": 0.8954358175396919,
"num_tokens": 13344684.0,
"step": 196
},
{
"epoch": 0.39838220424671383,
"grad_norm": 0.18819299340248108,
"learning_rate": 0.00039032078879176865,
"loss": 0.3447,
"mean_token_accuracy": 0.8849571086466312,
"num_tokens": 13409885.0,
"step": 197
},
{
"epoch": 0.40040444893832156,
"grad_norm": 0.17828333377838135,
"learning_rate": 0.00039019334360787706,
"loss": 0.324,
"mean_token_accuracy": 0.8868827521800995,
"num_tokens": 13473352.0,
"step": 198
},
{
"epoch": 0.4024266936299292,
"grad_norm": 0.18609419465065002,
"learning_rate": 0.0003900650883640381,
"loss": 0.285,
"mean_token_accuracy": 0.8940243273973465,
"num_tokens": 13540264.0,
"step": 199
},
{
"epoch": 0.4044489383215369,
"grad_norm": 0.18147540092468262,
"learning_rate": 0.0003899360236707012,
"loss": 0.3077,
"mean_token_accuracy": 0.8837394788861275,
"num_tokens": 13610806.0,
"step": 200
},
{
"epoch": 0.4064711830131446,
"grad_norm": 0.19080513715744019,
"learning_rate": 0.00038980615014216853,
"loss": 0.3241,
"mean_token_accuracy": 0.8904240913689137,
"num_tokens": 13669371.0,
"step": 201
},
{
"epoch": 0.4084934277047523,
"grad_norm": 0.16377419233322144,
"learning_rate": 0.00038967546839659215,
"loss": 0.3149,
"mean_token_accuracy": 0.8902618512511253,
"num_tokens": 13745941.0,
"step": 202
},
{
"epoch": 0.41051567239635994,
"grad_norm": 0.19631735980510712,
"learning_rate": 0.00038954397905597063,
"loss": 0.3459,
"mean_token_accuracy": 0.8863471113145351,
"num_tokens": 13818760.0,
"step": 203
},
{
"epoch": 0.41253791708796766,
"grad_norm": 0.18512631952762604,
"learning_rate": 0.00038941168274614677,
"loss": 0.3168,
"mean_token_accuracy": 0.8905623555183411,
"num_tokens": 13889651.0,
"step": 204
},
{
"epoch": 0.41456016177957533,
"grad_norm": 0.21926718950271606,
"learning_rate": 0.00038927858009680394,
"loss": 0.3006,
"mean_token_accuracy": 0.8961901552975178,
"num_tokens": 13949554.0,
"step": 205
},
{
"epoch": 0.416582406471183,
"grad_norm": 0.17943674325942993,
"learning_rate": 0.0003891446717414635,
"loss": 0.3066,
"mean_token_accuracy": 0.8922952748835087,
"num_tokens": 14021083.0,
"step": 206
},
{
"epoch": 0.4186046511627907,
"grad_norm": 0.1913203001022339,
"learning_rate": 0.0003890099583174819,
"loss": 0.3209,
"mean_token_accuracy": 0.8909911513328552,
"num_tokens": 14093782.0,
"step": 207
},
{
"epoch": 0.4206268958543984,
"grad_norm": 0.19808340072631836,
"learning_rate": 0.0003888744404660472,
"loss": 0.3338,
"mean_token_accuracy": 0.884627778083086,
"num_tokens": 14159998.0,
"step": 208
},
{
"epoch": 0.42264914054600605,
"grad_norm": 0.17383399605751038,
"learning_rate": 0.0003887381188321762,
"loss": 0.3153,
"mean_token_accuracy": 0.8930625729262829,
"num_tokens": 14232551.0,
"step": 209
},
{
"epoch": 0.4246713852376138,
"grad_norm": 0.17710869014263153,
"learning_rate": 0.0003886009940647116,
"loss": 0.3296,
"mean_token_accuracy": 0.8819810189306736,
"num_tokens": 14298797.0,
"step": 210
},
{
"epoch": 0.42669362992922144,
"grad_norm": 0.1701733022928238,
"learning_rate": 0.0003884630668163186,
"loss": 0.3043,
"mean_token_accuracy": 0.8916125111281872,
"num_tokens": 14367264.0,
"step": 211
},
{
"epoch": 0.4287158746208291,
"grad_norm": 0.16983942687511444,
"learning_rate": 0.0003883243377434821,
"loss": 0.336,
"mean_token_accuracy": 0.8859187439084053,
"num_tokens": 14434405.0,
"step": 212
},
{
"epoch": 0.43073811931243683,
"grad_norm": 0.22955253720283508,
"learning_rate": 0.0003881848075065032,
"loss": 0.331,
"mean_token_accuracy": 0.89054074883461,
"num_tokens": 14504097.0,
"step": 213
},
{
"epoch": 0.4327603640040445,
"grad_norm": 0.1674816757440567,
"learning_rate": 0.0003880444767694963,
"loss": 0.3292,
"mean_token_accuracy": 0.8852434195578098,
"num_tokens": 14576012.0,
"step": 214
},
{
"epoch": 0.43478260869565216,
"grad_norm": 0.23137012124061584,
"learning_rate": 0.00038790334620038606,
"loss": 0.3293,
"mean_token_accuracy": 0.8874834440648556,
"num_tokens": 14641442.0,
"step": 215
},
{
"epoch": 0.4368048533872599,
"grad_norm": 0.1810149997472763,
"learning_rate": 0.00038776141647090375,
"loss": 0.3359,
"mean_token_accuracy": 0.8845292665064335,
"num_tokens": 14701016.0,
"step": 216
},
{
"epoch": 0.43882709807886755,
"grad_norm": 0.19873689115047455,
"learning_rate": 0.00038761868825658465,
"loss": 0.3275,
"mean_token_accuracy": 0.8850444070994854,
"num_tokens": 14762543.0,
"step": 217
},
{
"epoch": 0.4408493427704752,
"grad_norm": 0.16571380198001862,
"learning_rate": 0.00038747516223676447,
"loss": 0.3097,
"mean_token_accuracy": 0.8963964283466339,
"num_tokens": 14837183.0,
"step": 218
},
{
"epoch": 0.44287158746208294,
"grad_norm": 0.18150104582309723,
"learning_rate": 0.00038733083909457607,
"loss": 0.3066,
"mean_token_accuracy": 0.891868706792593,
"num_tokens": 14909675.0,
"step": 219
},
{
"epoch": 0.4448938321536906,
"grad_norm": 0.2008552849292755,
"learning_rate": 0.00038718571951694636,
"loss": 0.3397,
"mean_token_accuracy": 0.881518941372633,
"num_tokens": 14974075.0,
"step": 220
},
{
"epoch": 0.44691607684529827,
"grad_norm": 0.20857571065425873,
"learning_rate": 0.00038703980419459323,
"loss": 0.3251,
"mean_token_accuracy": 0.8877891451120377,
"num_tokens": 15044109.0,
"step": 221
},
{
"epoch": 0.448938321536906,
"grad_norm": 0.1777462363243103,
"learning_rate": 0.00038689309382202174,
"loss": 0.3017,
"mean_token_accuracy": 0.8944090716540813,
"num_tokens": 15114045.0,
"step": 222
},
{
"epoch": 0.45096056622851366,
"grad_norm": 0.16797004640102386,
"learning_rate": 0.0003867455890975213,
"loss": 0.2901,
"mean_token_accuracy": 0.8903030268847942,
"num_tokens": 15184412.0,
"step": 223
},
{
"epoch": 0.4529828109201213,
"grad_norm": 0.26226508617401123,
"learning_rate": 0.00038659729072316193,
"loss": 0.356,
"mean_token_accuracy": 0.8832045011222363,
"num_tokens": 15245581.0,
"step": 224
},
{
"epoch": 0.455005055611729,
"grad_norm": 0.16607579588890076,
"learning_rate": 0.00038644819940479146,
"loss": 0.3148,
"mean_token_accuracy": 0.8910624943673611,
"num_tokens": 15315013.0,
"step": 225
},
{
"epoch": 0.4570273003033367,
"grad_norm": 0.15852072834968567,
"learning_rate": 0.00038629831585203163,
"loss": 0.2908,
"mean_token_accuracy": 0.8945996090769768,
"num_tokens": 15398701.0,
"step": 226
},
{
"epoch": 0.4590495449949444,
"grad_norm": 0.17531050741672516,
"learning_rate": 0.000386147640778275,
"loss": 0.2748,
"mean_token_accuracy": 0.9027018919587135,
"num_tokens": 15490499.0,
"step": 227
},
{
"epoch": 0.46107178968655205,
"grad_norm": 0.16767503321170807,
"learning_rate": 0.00038599617490068134,
"loss": 0.3044,
"mean_token_accuracy": 0.8939338177442551,
"num_tokens": 15556168.0,
"step": 228
},
{
"epoch": 0.46309403437815977,
"grad_norm": 0.211036816239357,
"learning_rate": 0.0003858439189401747,
"loss": 0.3207,
"mean_token_accuracy": 0.8899048455059528,
"num_tokens": 15622005.0,
"step": 229
},
{
"epoch": 0.46511627906976744,
"grad_norm": 0.16442608833312988,
"learning_rate": 0.0003856908736214393,
"loss": 0.3191,
"mean_token_accuracy": 0.8901388570666313,
"num_tokens": 15693753.0,
"step": 230
},
{
"epoch": 0.4671385237613751,
"grad_norm": 0.15724638104438782,
"learning_rate": 0.0003855370396729166,
"loss": 0.2836,
"mean_token_accuracy": 0.9016358070075512,
"num_tokens": 15763494.0,
"step": 231
},
{
"epoch": 0.4691607684529828,
"grad_norm": 0.18976381421089172,
"learning_rate": 0.0003853824178268017,
"loss": 0.3205,
"mean_token_accuracy": 0.8904677703976631,
"num_tokens": 15833863.0,
"step": 232
},
{
"epoch": 0.4711830131445905,
"grad_norm": 0.14951825141906738,
"learning_rate": 0.00038522700881903966,
"loss": 0.2525,
"mean_token_accuracy": 0.8994054794311523,
"num_tokens": 15911573.0,
"step": 233
},
{
"epoch": 0.47320525783619816,
"grad_norm": 0.19189335405826569,
"learning_rate": 0.0003850708133893223,
"loss": 0.3223,
"mean_token_accuracy": 0.8889148533344269,
"num_tokens": 15973006.0,
"step": 234
},
{
"epoch": 0.4752275025278059,
"grad_norm": 0.15495674312114716,
"learning_rate": 0.0003849138322810845,
"loss": 0.3025,
"mean_token_accuracy": 0.8922797180712223,
"num_tokens": 16044921.0,
"step": 235
},
{
"epoch": 0.47724974721941354,
"grad_norm": 0.1728491634130478,
"learning_rate": 0.00038475606624150055,
"loss": 0.3094,
"mean_token_accuracy": 0.8931614607572556,
"num_tokens": 16116096.0,
"step": 236
},
{
"epoch": 0.4792719919110212,
"grad_norm": 0.1603267937898636,
"learning_rate": 0.0003845975160214808,
"loss": 0.3235,
"mean_token_accuracy": 0.8852398991584778,
"num_tokens": 16184529.0,
"step": 237
},
{
"epoch": 0.48129423660262893,
"grad_norm": 0.16991828382015228,
"learning_rate": 0.00038443818237566814,
"loss": 0.2902,
"mean_token_accuracy": 0.892944622784853,
"num_tokens": 16253758.0,
"step": 238
},
{
"epoch": 0.4833164812942366,
"grad_norm": 0.17524850368499756,
"learning_rate": 0.0003842780660624343,
"loss": 0.3227,
"mean_token_accuracy": 0.8884528502821922,
"num_tokens": 16320041.0,
"step": 239
},
{
"epoch": 0.48533872598584427,
"grad_norm": 0.18329283595085144,
"learning_rate": 0.00038411716784387596,
"loss": 0.313,
"mean_token_accuracy": 0.8975342884659767,
"num_tokens": 16392051.0,
"step": 240
},
{
"epoch": 0.487360970677452,
"grad_norm": 0.1628371924161911,
"learning_rate": 0.00038395548848581165,
"loss": 0.2817,
"mean_token_accuracy": 0.9011796675622463,
"num_tokens": 16462030.0,
"step": 241
},
{
"epoch": 0.48938321536905965,
"grad_norm": 0.18072479963302612,
"learning_rate": 0.0003837930287577778,
"loss": 0.3041,
"mean_token_accuracy": 0.8932337760925293,
"num_tokens": 16532493.0,
"step": 242
},
{
"epoch": 0.4914054600606673,
"grad_norm": 0.2059275507926941,
"learning_rate": 0.000383629789433025,
"loss": 0.3281,
"mean_token_accuracy": 0.8870198056101799,
"num_tokens": 16590133.0,
"step": 243
},
{
"epoch": 0.49342770475227504,
"grad_norm": 0.19713951647281647,
"learning_rate": 0.00038346577128851465,
"loss": 0.3235,
"mean_token_accuracy": 0.8893256969749928,
"num_tokens": 16655566.0,
"step": 244
},
{
"epoch": 0.4954499494438827,
"grad_norm": 0.16556710004806519,
"learning_rate": 0.00038330097510491483,
"loss": 0.3148,
"mean_token_accuracy": 0.8895911388099194,
"num_tokens": 16718728.0,
"step": 245
},
{
"epoch": 0.4974721941354904,
"grad_norm": 0.1870684027671814,
"learning_rate": 0.000383135401666597,
"loss": 0.329,
"mean_token_accuracy": 0.8862268440425396,
"num_tokens": 16776165.0,
"step": 246
},
{
"epoch": 0.4994944388270981,
"grad_norm": 0.18431027233600616,
"learning_rate": 0.00038296905176163197,
"loss": 0.3143,
"mean_token_accuracy": 0.8902600333094597,
"num_tokens": 16835743.0,
"step": 247
},
{
"epoch": 0.5015166835187057,
"grad_norm": 0.18272148072719574,
"learning_rate": 0.0003828019261817863,
"loss": 0.3243,
"mean_token_accuracy": 0.8864033743739128,
"num_tokens": 16899775.0,
"step": 248
},
{
"epoch": 0.5035389282103134,
"grad_norm": 0.1712082177400589,
"learning_rate": 0.0003826340257225184,
"loss": 0.324,
"mean_token_accuracy": 0.8914847373962402,
"num_tokens": 16972506.0,
"step": 249
},
{
"epoch": 0.5055611729019212,
"grad_norm": 0.18500936031341553,
"learning_rate": 0.00038246535118297497,
"loss": 0.3006,
"mean_token_accuracy": 0.8903259225189686,
"num_tokens": 17036215.0,
"step": 250
},
{
"epoch": 0.5075834175935288,
"grad_norm": 0.19614791870117188,
"learning_rate": 0.00038229590336598694,
"loss": 0.3176,
"mean_token_accuracy": 0.8885915465652943,
"num_tokens": 17099060.0,
"step": 251
},
{
"epoch": 0.5096056622851365,
"grad_norm": 0.20587585866451263,
"learning_rate": 0.0003821256830780658,
"loss": 0.3252,
"mean_token_accuracy": 0.8900357261300087,
"num_tokens": 17160737.0,
"step": 252
},
{
"epoch": 0.5116279069767442,
"grad_norm": 0.16274958848953247,
"learning_rate": 0.0003819546911293999,
"loss": 0.3065,
"mean_token_accuracy": 0.8940119668841362,
"num_tokens": 17228903.0,
"step": 253
},
{
"epoch": 0.5136501516683518,
"grad_norm": 0.16572465002536774,
"learning_rate": 0.0003817829283338501,
"loss": 0.3011,
"mean_token_accuracy": 0.8989259153604507,
"num_tokens": 17309457.0,
"step": 254
},
{
"epoch": 0.5156723963599595,
"grad_norm": 0.21092504262924194,
"learning_rate": 0.0003816103955089464,
"loss": 0.3645,
"mean_token_accuracy": 0.8738524205982685,
"num_tokens": 17371710.0,
"step": 255
},
{
"epoch": 0.5176946410515673,
"grad_norm": 0.1776529848575592,
"learning_rate": 0.0003814370934758839,
"loss": 0.3413,
"mean_token_accuracy": 0.8868374638259411,
"num_tokens": 17445132.0,
"step": 256
},
{
"epoch": 0.5197168857431749,
"grad_norm": 0.1718549132347107,
"learning_rate": 0.0003812630230595188,
"loss": 0.3472,
"mean_token_accuracy": 0.8835309036076069,
"num_tokens": 17511865.0,
"step": 257
},
{
"epoch": 0.5217391304347826,
"grad_norm": 0.17998023331165314,
"learning_rate": 0.0003810881850883645,
"loss": 0.35,
"mean_token_accuracy": 0.8810900300741196,
"num_tokens": 17579299.0,
"step": 258
},
{
"epoch": 0.5237613751263903,
"grad_norm": 0.15693029761314392,
"learning_rate": 0.0003809125803945878,
"loss": 0.2853,
"mean_token_accuracy": 0.8982386291027069,
"num_tokens": 17651748.0,
"step": 259
},
{
"epoch": 0.5257836198179979,
"grad_norm": 0.18085962533950806,
"learning_rate": 0.00038073620981400467,
"loss": 0.2962,
"mean_token_accuracy": 0.898784764111042,
"num_tokens": 17716859.0,
"step": 260
},
{
"epoch": 0.5278058645096056,
"grad_norm": 0.2137887328863144,
"learning_rate": 0.00038055907418607654,
"loss": 0.3485,
"mean_token_accuracy": 0.8793986700475216,
"num_tokens": 17776486.0,
"step": 261
},
{
"epoch": 0.5298281092012134,
"grad_norm": 0.169187992811203,
"learning_rate": 0.0003803811743539062,
"loss": 0.3093,
"mean_token_accuracy": 0.8887566514313221,
"num_tokens": 17844621.0,
"step": 262
},
{
"epoch": 0.531850353892821,
"grad_norm": 0.1435088813304901,
"learning_rate": 0.0003802025111642338,
"loss": 0.2623,
"mean_token_accuracy": 0.9051036462187767,
"num_tokens": 17930557.0,
"step": 263
},
{
"epoch": 0.5338725985844287,
"grad_norm": 0.1761457622051239,
"learning_rate": 0.00038002308546743256,
"loss": 0.3008,
"mean_token_accuracy": 0.8946518003940582,
"num_tokens": 17999603.0,
"step": 264
},
{
"epoch": 0.5358948432760364,
"grad_norm": 0.17682158946990967,
"learning_rate": 0.0003798428981175053,
"loss": 0.3043,
"mean_token_accuracy": 0.8925192318856716,
"num_tokens": 18071957.0,
"step": 265
},
{
"epoch": 0.537917087967644,
"grad_norm": 0.18640998005867004,
"learning_rate": 0.0003796619499720799,
"loss": 0.3145,
"mean_token_accuracy": 0.8919526562094688,
"num_tokens": 18141501.0,
"step": 266
},
{
"epoch": 0.5399393326592518,
"grad_norm": 0.1694413125514984,
"learning_rate": 0.0003794802418924054,
"loss": 0.299,
"mean_token_accuracy": 0.8966234587132931,
"num_tokens": 18215962.0,
"step": 267
},
{
"epoch": 0.5419615773508595,
"grad_norm": 0.1720503866672516,
"learning_rate": 0.00037929777474334756,
"loss": 0.3269,
"mean_token_accuracy": 0.8884270638227463,
"num_tokens": 18282357.0,
"step": 268
},
{
"epoch": 0.5439838220424671,
"grad_norm": 0.19224666059017181,
"learning_rate": 0.0003791145493933855,
"loss": 0.3477,
"mean_token_accuracy": 0.8821601495146751,
"num_tokens": 18347587.0,
"step": 269
},
{
"epoch": 0.5460060667340748,
"grad_norm": 0.1664774864912033,
"learning_rate": 0.0003789305667146069,
"loss": 0.3009,
"mean_token_accuracy": 0.8948215469717979,
"num_tokens": 18415368.0,
"step": 270
},
{
"epoch": 0.5480283114256825,
"grad_norm": 0.18322114646434784,
"learning_rate": 0.0003787458275827039,
"loss": 0.3195,
"mean_token_accuracy": 0.8908861018717289,
"num_tokens": 18482285.0,
"step": 271
},
{
"epoch": 0.5500505561172901,
"grad_norm": 0.20983459055423737,
"learning_rate": 0.00037856033287696943,
"loss": 0.2945,
"mean_token_accuracy": 0.8964951671659946,
"num_tokens": 18540330.0,
"step": 272
},
{
"epoch": 0.5520728008088979,
"grad_norm": 0.1815643161535263,
"learning_rate": 0.00037837408348029235,
"loss": 0.3159,
"mean_token_accuracy": 0.8929238878190517,
"num_tokens": 18604976.0,
"step": 273
},
{
"epoch": 0.5540950455005056,
"grad_norm": 0.2073771208524704,
"learning_rate": 0.00037818708027915376,
"loss": 0.3244,
"mean_token_accuracy": 0.8876978568732738,
"num_tokens": 18672322.0,
"step": 274
},
{
"epoch": 0.5561172901921132,
"grad_norm": 0.19434937834739685,
"learning_rate": 0.00037799932416362266,
"loss": 0.3111,
"mean_token_accuracy": 0.8910202607512474,
"num_tokens": 18735221.0,
"step": 275
},
{
"epoch": 0.5581395348837209,
"grad_norm": 0.15825523436069489,
"learning_rate": 0.00037781081602735145,
"loss": 0.2758,
"mean_token_accuracy": 0.8941913619637489,
"num_tokens": 18815168.0,
"step": 276
},
{
"epoch": 0.5601617795753286,
"grad_norm": 0.16185039281845093,
"learning_rate": 0.00037762155676757196,
"loss": 0.2978,
"mean_token_accuracy": 0.89651133492589,
"num_tokens": 18884062.0,
"step": 277
},
{
"epoch": 0.5621840242669363,
"grad_norm": 0.18850262463092804,
"learning_rate": 0.00037743154728509123,
"loss": 0.3109,
"mean_token_accuracy": 0.8866820931434631,
"num_tokens": 18948236.0,
"step": 278
},
{
"epoch": 0.564206268958544,
"grad_norm": 0.1736079454421997,
"learning_rate": 0.00037724078848428707,
"loss": 0.28,
"mean_token_accuracy": 0.9002561867237091,
"num_tokens": 19017663.0,
"step": 279
},
{
"epoch": 0.5662285136501517,
"grad_norm": 0.15573325753211975,
"learning_rate": 0.0003770492812731035,
"loss": 0.3072,
"mean_token_accuracy": 0.891198180615902,
"num_tokens": 19089120.0,
"step": 280
},
{
"epoch": 0.5682507583417593,
"grad_norm": 0.18526090681552887,
"learning_rate": 0.0003768570265630471,
"loss": 0.3305,
"mean_token_accuracy": 0.8860407620668411,
"num_tokens": 19154650.0,
"step": 281
},
{
"epoch": 0.570273003033367,
"grad_norm": 0.1691296249628067,
"learning_rate": 0.00037666402526918195,
"loss": 0.3188,
"mean_token_accuracy": 0.8919213153421879,
"num_tokens": 19224445.0,
"step": 282
},
{
"epoch": 0.5722952477249748,
"grad_norm": 0.17496982216835022,
"learning_rate": 0.0003764702783101259,
"loss": 0.3032,
"mean_token_accuracy": 0.8902747184038162,
"num_tokens": 19298006.0,
"step": 283
},
{
"epoch": 0.5743174924165824,
"grad_norm": 0.14839443564414978,
"learning_rate": 0.00037627578660804565,
"loss": 0.2734,
"mean_token_accuracy": 0.8967320993542671,
"num_tokens": 19374661.0,
"step": 284
},
{
"epoch": 0.5763397371081901,
"grad_norm": 0.1658451408147812,
"learning_rate": 0.0003760805510886527,
"loss": 0.2999,
"mean_token_accuracy": 0.8904417157173157,
"num_tokens": 19450524.0,
"step": 285
},
{
"epoch": 0.5783619817997978,
"grad_norm": 0.19472143054008484,
"learning_rate": 0.000375884572681199,
"loss": 0.3083,
"mean_token_accuracy": 0.8959350101649761,
"num_tokens": 19516098.0,
"step": 286
},
{
"epoch": 0.5803842264914054,
"grad_norm": 0.17645469307899475,
"learning_rate": 0.0003756878523184721,
"loss": 0.3232,
"mean_token_accuracy": 0.8930424600839615,
"num_tokens": 19586030.0,
"step": 287
},
{
"epoch": 0.5824064711830131,
"grad_norm": 0.16119012236595154,
"learning_rate": 0.0003754903909367912,
"loss": 0.2305,
"mean_token_accuracy": 0.9079996608197689,
"num_tokens": 19652723.0,
"step": 288
},
{
"epoch": 0.5844287158746209,
"grad_norm": 0.1650022268295288,
"learning_rate": 0.00037529218947600254,
"loss": 0.2913,
"mean_token_accuracy": 0.8961706385016441,
"num_tokens": 19721013.0,
"step": 289
},
{
"epoch": 0.5864509605662285,
"grad_norm": 0.1751680225133896,
"learning_rate": 0.00037509324887947465,
"loss": 0.2996,
"mean_token_accuracy": 0.8925143517553806,
"num_tokens": 19785808.0,
"step": 290
},
{
"epoch": 0.5884732052578362,
"grad_norm": 0.14274518191814423,
"learning_rate": 0.0003748935700940942,
"loss": 0.3009,
"mean_token_accuracy": 0.8944595381617546,
"num_tokens": 19864767.0,
"step": 291
},
{
"epoch": 0.5904954499494439,
"grad_norm": 0.19173842668533325,
"learning_rate": 0.00037469315407026154,
"loss": 0.3189,
"mean_token_accuracy": 0.8890005201101303,
"num_tokens": 19934782.0,
"step": 292
},
{
"epoch": 0.5925176946410515,
"grad_norm": 0.16339226067066193,
"learning_rate": 0.0003744920017618856,
"loss": 0.2984,
"mean_token_accuracy": 0.891924075782299,
"num_tokens": 20004350.0,
"step": 293
},
{
"epoch": 0.5945399393326593,
"grad_norm": 0.1845332533121109,
"learning_rate": 0.0003742901141263802,
"loss": 0.3233,
"mean_token_accuracy": 0.8917621746659279,
"num_tokens": 20073462.0,
"step": 294
},
{
"epoch": 0.596562184024267,
"grad_norm": 0.18664658069610596,
"learning_rate": 0.00037408749212465895,
"loss": 0.3168,
"mean_token_accuracy": 0.8909800015389919,
"num_tokens": 20135987.0,
"step": 295
},
{
"epoch": 0.5985844287158746,
"grad_norm": 0.17890143394470215,
"learning_rate": 0.0003738841367211304,
"loss": 0.2679,
"mean_token_accuracy": 0.9026052355766296,
"num_tokens": 20206699.0,
"step": 296
},
{
"epoch": 0.6006066734074823,
"grad_norm": 0.18279992043972015,
"learning_rate": 0.0003736800488836944,
"loss": 0.2929,
"mean_token_accuracy": 0.8961853981018066,
"num_tokens": 20272267.0,
"step": 297
},
{
"epoch": 0.60262891809909,
"grad_norm": 0.2653316557407379,
"learning_rate": 0.00037347522958373664,
"loss": 0.286,
"mean_token_accuracy": 0.8971174284815788,
"num_tokens": 20343534.0,
"step": 298
},
{
"epoch": 0.6046511627906976,
"grad_norm": 0.15699949860572815,
"learning_rate": 0.00037326967979612425,
"loss": 0.2861,
"mean_token_accuracy": 0.9003230258822441,
"num_tokens": 20420939.0,
"step": 299
},
{
"epoch": 0.6066734074823054,
"grad_norm": 0.17641445994377136,
"learning_rate": 0.0003730634004992013,
"loss": 0.3051,
"mean_token_accuracy": 0.8907876797020435,
"num_tokens": 20488068.0,
"step": 300
},
{
"epoch": 0.6086956521739131,
"grad_norm": 0.1636650264263153,
"learning_rate": 0.0003728563926747842,
"loss": 0.2928,
"mean_token_accuracy": 0.8949981555342674,
"num_tokens": 20560510.0,
"step": 301
},
{
"epoch": 0.6107178968655207,
"grad_norm": 0.18622446060180664,
"learning_rate": 0.0003726486573081567,
"loss": 0.3156,
"mean_token_accuracy": 0.8932462483644485,
"num_tokens": 20627926.0,
"step": 302
},
{
"epoch": 0.6127401415571284,
"grad_norm": 0.18102477490901947,
"learning_rate": 0.00037244019538806546,
"loss": 0.2859,
"mean_token_accuracy": 0.897308062762022,
"num_tokens": 20695635.0,
"step": 303
},
{
"epoch": 0.6147623862487361,
"grad_norm": 0.19487911462783813,
"learning_rate": 0.00037223100790671526,
"loss": 0.3232,
"mean_token_accuracy": 0.8873684406280518,
"num_tokens": 20764073.0,
"step": 304
},
{
"epoch": 0.6167846309403437,
"grad_norm": 0.16768330335617065,
"learning_rate": 0.0003720210958597642,
"loss": 0.2856,
"mean_token_accuracy": 0.8974824510514736,
"num_tokens": 20834156.0,
"step": 305
},
{
"epoch": 0.6188068756319515,
"grad_norm": 0.17184442281723022,
"learning_rate": 0.00037181046024631944,
"loss": 0.3167,
"mean_token_accuracy": 0.8905413933098316,
"num_tokens": 20906046.0,
"step": 306
},
{
"epoch": 0.6208291203235592,
"grad_norm": 0.17979033291339874,
"learning_rate": 0.0003715991020689316,
"loss": 0.3166,
"mean_token_accuracy": 0.8910835459828377,
"num_tokens": 20969038.0,
"step": 307
},
{
"epoch": 0.6228513650151668,
"grad_norm": 0.16872760653495789,
"learning_rate": 0.0003713870223335907,
"loss": 0.3023,
"mean_token_accuracy": 0.8999812118709087,
"num_tokens": 21054878.0,
"step": 308
},
{
"epoch": 0.6248736097067745,
"grad_norm": 0.17098355293273926,
"learning_rate": 0.00037117422204972094,
"loss": 0.2918,
"mean_token_accuracy": 0.9006133303046227,
"num_tokens": 21120211.0,
"step": 309
},
{
"epoch": 0.6268958543983822,
"grad_norm": 0.19943217933177948,
"learning_rate": 0.00037096070223017634,
"loss": 0.2992,
"mean_token_accuracy": 0.8970108516514301,
"num_tokens": 21193385.0,
"step": 310
},
{
"epoch": 0.6289180990899899,
"grad_norm": 0.19835074245929718,
"learning_rate": 0.0003707464638912354,
"loss": 0.2987,
"mean_token_accuracy": 0.8971699252724648,
"num_tokens": 21258335.0,
"step": 311
},
{
"epoch": 0.6309403437815976,
"grad_norm": 0.1647316962480545,
"learning_rate": 0.0003705315080525967,
"loss": 0.2877,
"mean_token_accuracy": 0.8915503136813641,
"num_tokens": 21328815.0,
"step": 312
},
{
"epoch": 0.6329625884732053,
"grad_norm": 0.18789348006248474,
"learning_rate": 0.00037031583573737375,
"loss": 0.2973,
"mean_token_accuracy": 0.8956909030675888,
"num_tokens": 21408498.0,
"step": 313
},
{
"epoch": 0.6349848331648129,
"grad_norm": 0.23517835140228271,
"learning_rate": 0.0003700994479720903,
"loss": 0.3022,
"mean_token_accuracy": 0.8944514766335487,
"num_tokens": 21477506.0,
"step": 314
},
{
"epoch": 0.6370070778564206,
"grad_norm": 0.1805562973022461,
"learning_rate": 0.00036988234578667526,
"loss": 0.313,
"mean_token_accuracy": 0.892850112169981,
"num_tokens": 21543808.0,
"step": 315
},
{
"epoch": 0.6390293225480284,
"grad_norm": 0.2823885679244995,
"learning_rate": 0.0003696645302144582,
"loss": 0.3397,
"mean_token_accuracy": 0.8829572051763535,
"num_tokens": 21607431.0,
"step": 316
},
{
"epoch": 0.641051567239636,
"grad_norm": 0.19618524610996246,
"learning_rate": 0.00036944600229216375,
"loss": 0.3164,
"mean_token_accuracy": 0.8882573507726192,
"num_tokens": 21675489.0,
"step": 317
},
{
"epoch": 0.6430738119312437,
"grad_norm": 0.19782759249210358,
"learning_rate": 0.00036922676305990753,
"loss": 0.3211,
"mean_token_accuracy": 0.8908263929188251,
"num_tokens": 21739400.0,
"step": 318
},
{
"epoch": 0.6450960566228514,
"grad_norm": 0.20694133639335632,
"learning_rate": 0.00036900681356119043,
"loss": 0.2927,
"mean_token_accuracy": 0.8931123651564121,
"num_tokens": 21807454.0,
"step": 319
},
{
"epoch": 0.647118301314459,
"grad_norm": 0.16246715188026428,
"learning_rate": 0.00036878615484289395,
"loss": 0.3095,
"mean_token_accuracy": 0.8925521671772003,
"num_tokens": 21883534.0,
"step": 320
},
{
"epoch": 0.6491405460060667,
"grad_norm": 0.1689622849225998,
"learning_rate": 0.0003685647879552755,
"loss": 0.3198,
"mean_token_accuracy": 0.8910107761621475,
"num_tokens": 21954057.0,
"step": 321
},
{
"epoch": 0.6511627906976745,
"grad_norm": 0.21298348903656006,
"learning_rate": 0.0003683427139519628,
"loss": 0.3098,
"mean_token_accuracy": 0.8946363367140293,
"num_tokens": 22024559.0,
"step": 322
},
{
"epoch": 0.6531850353892821,
"grad_norm": 0.20307037234306335,
"learning_rate": 0.00036811993388994945,
"loss": 0.3042,
"mean_token_accuracy": 0.8996872641146183,
"num_tokens": 22083005.0,
"step": 323
},
{
"epoch": 0.6552072800808898,
"grad_norm": 0.19622348248958588,
"learning_rate": 0.00036789644882958953,
"loss": 0.3106,
"mean_token_accuracy": 0.8917652256786823,
"num_tokens": 22153882.0,
"step": 324
},
{
"epoch": 0.6572295247724975,
"grad_norm": 0.2048502266407013,
"learning_rate": 0.00036767225983459247,
"loss": 0.3072,
"mean_token_accuracy": 0.892122782766819,
"num_tokens": 22223638.0,
"step": 325
},
{
"epoch": 0.6592517694641051,
"grad_norm": 0.17371125519275665,
"learning_rate": 0.00036744736797201855,
"loss": 0.2818,
"mean_token_accuracy": 0.9024628438055515,
"num_tokens": 22287424.0,
"step": 326
},
{
"epoch": 0.6612740141557129,
"grad_norm": 0.1815844625234604,
"learning_rate": 0.0003672217743122732,
"loss": 0.3064,
"mean_token_accuracy": 0.8945932053029537,
"num_tokens": 22349842.0,
"step": 327
},
{
"epoch": 0.6632962588473206,
"grad_norm": 0.16366587579250336,
"learning_rate": 0.00036699547992910227,
"loss": 0.2836,
"mean_token_accuracy": 0.8982814475893974,
"num_tokens": 22436553.0,
"step": 328
},
{
"epoch": 0.6653185035389282,
"grad_norm": 0.1992887556552887,
"learning_rate": 0.00036676848589958663,
"loss": 0.325,
"mean_token_accuracy": 0.8879561647772789,
"num_tokens": 22493823.0,
"step": 329
},
{
"epoch": 0.6673407482305359,
"grad_norm": 0.17708779871463776,
"learning_rate": 0.0003665407933041375,
"loss": 0.3325,
"mean_token_accuracy": 0.8939349353313446,
"num_tokens": 22563840.0,
"step": 330
},
{
"epoch": 0.6693629929221436,
"grad_norm": 0.2144147753715515,
"learning_rate": 0.00036631240322649076,
"loss": 0.3335,
"mean_token_accuracy": 0.8810755871236324,
"num_tokens": 22624256.0,
"step": 331
},
{
"epoch": 0.6713852376137512,
"grad_norm": 0.16541875898838043,
"learning_rate": 0.0003660833167537022,
"loss": 0.3275,
"mean_token_accuracy": 0.8926926329731941,
"num_tokens": 22694170.0,
"step": 332
},
{
"epoch": 0.673407482305359,
"grad_norm": 0.1698412150144577,
"learning_rate": 0.00036585353497614224,
"loss": 0.3066,
"mean_token_accuracy": 0.8967249467968941,
"num_tokens": 22768633.0,
"step": 333
},
{
"epoch": 0.6754297269969667,
"grad_norm": 0.1821826696395874,
"learning_rate": 0.00036562305898749054,
"loss": 0.3208,
"mean_token_accuracy": 0.886600024998188,
"num_tokens": 22837600.0,
"step": 334
},
{
"epoch": 0.6774519716885743,
"grad_norm": 0.1860353797674179,
"learning_rate": 0.00036539188988473113,
"loss": 0.3098,
"mean_token_accuracy": 0.8903123624622822,
"num_tokens": 22896567.0,
"step": 335
},
{
"epoch": 0.679474216380182,
"grad_norm": 0.14535972476005554,
"learning_rate": 0.0003651600287681469,
"loss": 0.2686,
"mean_token_accuracy": 0.9052710346877575,
"num_tokens": 22973060.0,
"step": 336
},
{
"epoch": 0.6814964610717897,
"grad_norm": 0.19355034828186035,
"learning_rate": 0.0003649274767413145,
"loss": 0.2877,
"mean_token_accuracy": 0.8924892544746399,
"num_tokens": 23043913.0,
"step": 337
},
{
"epoch": 0.6835187057633973,
"grad_norm": 0.185837984085083,
"learning_rate": 0.00036469423491109913,
"loss": 0.2982,
"mean_token_accuracy": 0.8957228772342205,
"num_tokens": 23114457.0,
"step": 338
},
{
"epoch": 0.6855409504550051,
"grad_norm": 0.18406859040260315,
"learning_rate": 0.0003644603043876492,
"loss": 0.3066,
"mean_token_accuracy": 0.8950929716229439,
"num_tokens": 23181146.0,
"step": 339
},
{
"epoch": 0.6875631951466128,
"grad_norm": 0.19715051352977753,
"learning_rate": 0.00036422568628439095,
"loss": 0.307,
"mean_token_accuracy": 0.8925964459776878,
"num_tokens": 23245269.0,
"step": 340
},
{
"epoch": 0.6895854398382204,
"grad_norm": 0.18601888418197632,
"learning_rate": 0.0003639903817180233,
"loss": 0.2756,
"mean_token_accuracy": 0.8994149342179298,
"num_tokens": 23321463.0,
"step": 341
},
{
"epoch": 0.6916076845298281,
"grad_norm": 0.18005859851837158,
"learning_rate": 0.0003637543918085127,
"loss": 0.2958,
"mean_token_accuracy": 0.8977576531469822,
"num_tokens": 23385849.0,
"step": 342
},
{
"epoch": 0.6936299292214358,
"grad_norm": 0.19051752984523773,
"learning_rate": 0.00036351771767908727,
"loss": 0.3074,
"mean_token_accuracy": 0.8928764685988426,
"num_tokens": 23456847.0,
"step": 343
},
{
"epoch": 0.6956521739130435,
"grad_norm": 0.20482131838798523,
"learning_rate": 0.0003632803604562319,
"loss": 0.3029,
"mean_token_accuracy": 0.8909181989729404,
"num_tokens": 23526730.0,
"step": 344
},
{
"epoch": 0.6976744186046512,
"grad_norm": 0.17358487844467163,
"learning_rate": 0.00036304232126968295,
"loss": 0.2844,
"mean_token_accuracy": 0.898325003683567,
"num_tokens": 23588681.0,
"step": 345
},
{
"epoch": 0.6996966632962589,
"grad_norm": 0.1700018048286438,
"learning_rate": 0.00036280360125242234,
"loss": 0.2733,
"mean_token_accuracy": 0.9010062254965305,
"num_tokens": 23664445.0,
"step": 346
},
{
"epoch": 0.7017189079878665,
"grad_norm": 0.193056121468544,
"learning_rate": 0.0003625642015406727,
"loss": 0.3102,
"mean_token_accuracy": 0.8916714228689671,
"num_tokens": 23738941.0,
"step": 347
},
{
"epoch": 0.7037411526794742,
"grad_norm": 0.19169779121875763,
"learning_rate": 0.0003623241232738919,
"loss": 0.2957,
"mean_token_accuracy": 0.8949874453246593,
"num_tokens": 23801979.0,
"step": 348
},
{
"epoch": 0.7057633973710818,
"grad_norm": 0.16655734181404114,
"learning_rate": 0.00036208336759476704,
"loss": 0.2937,
"mean_token_accuracy": 0.896770391613245,
"num_tokens": 23868193.0,
"step": 349
},
{
"epoch": 0.7077856420626896,
"grad_norm": 0.15496356785297394,
"learning_rate": 0.0003618419356492099,
"loss": 0.2871,
"mean_token_accuracy": 0.9015951566398144,
"num_tokens": 23947204.0,
"step": 350
},
{
"epoch": 0.7098078867542973,
"grad_norm": 0.160264790058136,
"learning_rate": 0.00036159982858635105,
"loss": 0.2825,
"mean_token_accuracy": 0.9006201699376106,
"num_tokens": 24021149.0,
"step": 351
},
{
"epoch": 0.7118301314459049,
"grad_norm": 0.16146975755691528,
"learning_rate": 0.00036135704755853407,
"loss": 0.2757,
"mean_token_accuracy": 0.9038827978074551,
"num_tokens": 24092549.0,
"step": 352
},
{
"epoch": 0.7138523761375126,
"grad_norm": 0.20805270969867706,
"learning_rate": 0.0003611135937213106,
"loss": 0.3267,
"mean_token_accuracy": 0.8861317448318005,
"num_tokens": 24157474.0,
"step": 353
},
{
"epoch": 0.7158746208291203,
"grad_norm": 0.16421623528003693,
"learning_rate": 0.0003608694682334345,
"loss": 0.2935,
"mean_token_accuracy": 0.8962382674217224,
"num_tokens": 24230461.0,
"step": 354
},
{
"epoch": 0.717896865520728,
"grad_norm": 0.1796526312828064,
"learning_rate": 0.0003606246722568566,
"loss": 0.2841,
"mean_token_accuracy": 0.8999650441110134,
"num_tokens": 24296781.0,
"step": 355
},
{
"epoch": 0.7199191102123357,
"grad_norm": 0.18790611624717712,
"learning_rate": 0.0003603792069567187,
"loss": 0.3496,
"mean_token_accuracy": 0.8827480934560299,
"num_tokens": 24361770.0,
"step": 356
},
{
"epoch": 0.7219413549039434,
"grad_norm": 0.16473916172981262,
"learning_rate": 0.00036013307350134884,
"loss": 0.314,
"mean_token_accuracy": 0.8960560448467731,
"num_tokens": 24432956.0,
"step": 357
},
{
"epoch": 0.723963599595551,
"grad_norm": 0.17466352880001068,
"learning_rate": 0.0003598862730622548,
"loss": 0.3113,
"mean_token_accuracy": 0.8914259672164917,
"num_tokens": 24499417.0,
"step": 358
},
{
"epoch": 0.7259858442871587,
"grad_norm": 0.17617358267307281,
"learning_rate": 0.0003596388068141191,
"loss": 0.2961,
"mean_token_accuracy": 0.893797617405653,
"num_tokens": 24567238.0,
"step": 359
},
{
"epoch": 0.7280080889787665,
"grad_norm": 0.18195107579231262,
"learning_rate": 0.0003593906759347934,
"loss": 0.318,
"mean_token_accuracy": 0.8848773874342442,
"num_tokens": 24634769.0,
"step": 360
},
{
"epoch": 0.7300303336703741,
"grad_norm": 0.16119951009750366,
"learning_rate": 0.00035914188160529267,
"loss": 0.2863,
"mean_token_accuracy": 0.89824278652668,
"num_tokens": 24701688.0,
"step": 361
},
{
"epoch": 0.7320525783619818,
"grad_norm": 0.1530333161354065,
"learning_rate": 0.00035889242500978966,
"loss": 0.2737,
"mean_token_accuracy": 0.901515819132328,
"num_tokens": 24778487.0,
"step": 362
},
{
"epoch": 0.7340748230535895,
"grad_norm": 0.172471821308136,
"learning_rate": 0.0003586423073356092,
"loss": 0.3,
"mean_token_accuracy": 0.8986642919480801,
"num_tokens": 24844779.0,
"step": 363
},
{
"epoch": 0.7360970677451971,
"grad_norm": 0.1733032464981079,
"learning_rate": 0.00035839152977322275,
"loss": 0.2899,
"mean_token_accuracy": 0.8977354988455772,
"num_tokens": 24909088.0,
"step": 364
},
{
"epoch": 0.7381193124368048,
"grad_norm": 0.16756588220596313,
"learning_rate": 0.00035814009351624256,
"loss": 0.2923,
"mean_token_accuracy": 0.897175993770361,
"num_tokens": 24978294.0,
"step": 365
},
{
"epoch": 0.7401415571284126,
"grad_norm": 0.1823996752500534,
"learning_rate": 0.00035788799976141605,
"loss": 0.3227,
"mean_token_accuracy": 0.8896390423178673,
"num_tokens": 25043171.0,
"step": 366
},
{
"epoch": 0.7421638018200202,
"grad_norm": 0.18004441261291504,
"learning_rate": 0.0003576352497086201,
"loss": 0.2954,
"mean_token_accuracy": 0.8963689431548119,
"num_tokens": 25113306.0,
"step": 367
},
{
"epoch": 0.7441860465116279,
"grad_norm": 0.19010895490646362,
"learning_rate": 0.0003573818445608552,
"loss": 0.3013,
"mean_token_accuracy": 0.8936556875705719,
"num_tokens": 25178407.0,
"step": 368
},
{
"epoch": 0.7462082912032356,
"grad_norm": 0.2009873390197754,
"learning_rate": 0.0003571277855242401,
"loss": 0.3204,
"mean_token_accuracy": 0.8890100382268429,
"num_tokens": 25236571.0,
"step": 369
},
{
"epoch": 0.7482305358948432,
"grad_norm": 0.17589393258094788,
"learning_rate": 0.00035687307380800556,
"loss": 0.3046,
"mean_token_accuracy": 0.8946997821331024,
"num_tokens": 25298545.0,
"step": 370
},
{
"epoch": 0.750252780586451,
"grad_norm": 0.1642550528049469,
"learning_rate": 0.00035661771062448915,
"loss": 0.2808,
"mean_token_accuracy": 0.8977020867168903,
"num_tokens": 25371496.0,
"step": 371
},
{
"epoch": 0.7522750252780587,
"grad_norm": 0.178288072347641,
"learning_rate": 0.00035636169718912894,
"loss": 0.3122,
"mean_token_accuracy": 0.8912137039005756,
"num_tokens": 25434070.0,
"step": 372
},
{
"epoch": 0.7542972699696663,
"grad_norm": 0.1830630898475647,
"learning_rate": 0.0003561050347204581,
"loss": 0.3156,
"mean_token_accuracy": 0.8928086012601852,
"num_tokens": 25499661.0,
"step": 373
},
{
"epoch": 0.756319514661274,
"grad_norm": 0.15954959392547607,
"learning_rate": 0.000355847724440099,
"loss": 0.281,
"mean_token_accuracy": 0.896581944078207,
"num_tokens": 25577844.0,
"step": 374
},
{
"epoch": 0.7583417593528817,
"grad_norm": 0.200165256857872,
"learning_rate": 0.00035558976757275716,
"loss": 0.3191,
"mean_token_accuracy": 0.8899872414767742,
"num_tokens": 25638524.0,
"step": 375
},
{
"epoch": 0.7603640040444893,
"grad_norm": 0.1939467191696167,
"learning_rate": 0.00035533116534621596,
"loss": 0.3107,
"mean_token_accuracy": 0.8947361186146736,
"num_tokens": 25704939.0,
"step": 376
},
{
"epoch": 0.7623862487360971,
"grad_norm": 0.16760645806789398,
"learning_rate": 0.0003550719189913302,
"loss": 0.2895,
"mean_token_accuracy": 0.9010074771940708,
"num_tokens": 25773040.0,
"step": 377
},
{
"epoch": 0.7644084934277048,
"grad_norm": 0.17111922800540924,
"learning_rate": 0.0003548120297420204,
"loss": 0.2941,
"mean_token_accuracy": 0.8943174667656422,
"num_tokens": 25841353.0,
"step": 378
},
{
"epoch": 0.7664307381193124,
"grad_norm": 0.19698713719844818,
"learning_rate": 0.00035455149883526746,
"loss": 0.3089,
"mean_token_accuracy": 0.8988425992429256,
"num_tokens": 25908894.0,
"step": 379
},
{
"epoch": 0.7684529828109201,
"grad_norm": 0.19156275689601898,
"learning_rate": 0.00035429032751110596,
"loss": 0.2904,
"mean_token_accuracy": 0.8982725702226162,
"num_tokens": 25976883.0,
"step": 380
},
{
"epoch": 0.7704752275025278,
"grad_norm": 0.17211389541625977,
"learning_rate": 0.00035402851701261874,
"loss": 0.2999,
"mean_token_accuracy": 0.8920269943773746,
"num_tokens": 26045757.0,
"step": 381
},
{
"epoch": 0.7724974721941354,
"grad_norm": 0.17306530475616455,
"learning_rate": 0.000353766068585931,
"loss": 0.301,
"mean_token_accuracy": 0.8918648697435856,
"num_tokens": 26118719.0,
"step": 382
},
{
"epoch": 0.7745197168857432,
"grad_norm": 0.17627696692943573,
"learning_rate": 0.00035350298348020407,
"loss": 0.2979,
"mean_token_accuracy": 0.8935811407864094,
"num_tokens": 26183890.0,
"step": 383
},
{
"epoch": 0.7765419615773509,
"grad_norm": 0.16283521056175232,
"learning_rate": 0.0003532392629476298,
"loss": 0.2819,
"mean_token_accuracy": 0.895574290305376,
"num_tokens": 26254712.0,
"step": 384
},
{
"epoch": 0.7785642062689585,
"grad_norm": 0.18045000731945038,
"learning_rate": 0.00035297490824342436,
"loss": 0.307,
"mean_token_accuracy": 0.8899386301636696,
"num_tokens": 26317196.0,
"step": 385
},
{
"epoch": 0.7805864509605662,
"grad_norm": 0.15806086361408234,
"learning_rate": 0.0003527099206258223,
"loss": 0.289,
"mean_token_accuracy": 0.8989690914750099,
"num_tokens": 26385704.0,
"step": 386
},
{
"epoch": 0.782608695652174,
"grad_norm": 0.17871202528476715,
"learning_rate": 0.0003524443013560709,
"loss": 0.2968,
"mean_token_accuracy": 0.8961369805037975,
"num_tokens": 26453865.0,
"step": 387
},
{
"epoch": 0.7846309403437816,
"grad_norm": 0.17596516013145447,
"learning_rate": 0.0003521780516984234,
"loss": 0.2849,
"mean_token_accuracy": 0.8956369571387768,
"num_tokens": 26519337.0,
"step": 388
},
{
"epoch": 0.7866531850353893,
"grad_norm": 0.1952444314956665,
"learning_rate": 0.00035191117292013394,
"loss": 0.3073,
"mean_token_accuracy": 0.8928476311266422,
"num_tokens": 26590979.0,
"step": 389
},
{
"epoch": 0.788675429726997,
"grad_norm": 0.16196580231189728,
"learning_rate": 0.00035164366629145073,
"loss": 0.2858,
"mean_token_accuracy": 0.8969371728599072,
"num_tokens": 26662280.0,
"step": 390
},
{
"epoch": 0.7906976744186046,
"grad_norm": 0.18022611737251282,
"learning_rate": 0.0003513755330856104,
"loss": 0.2996,
"mean_token_accuracy": 0.8949360400438309,
"num_tokens": 26735704.0,
"step": 391
},
{
"epoch": 0.7927199191102123,
"grad_norm": 0.1670723408460617,
"learning_rate": 0.000351106774578832,
"loss": 0.3023,
"mean_token_accuracy": 0.8980297967791557,
"num_tokens": 26806733.0,
"step": 392
},
{
"epoch": 0.7947421638018201,
"grad_norm": 0.16242116689682007,
"learning_rate": 0.0003508373920503108,
"loss": 0.2683,
"mean_token_accuracy": 0.8985998295247555,
"num_tokens": 26873233.0,
"step": 393
},
{
"epoch": 0.7967644084934277,
"grad_norm": 0.15898491442203522,
"learning_rate": 0.00035056738678221176,
"loss": 0.2938,
"mean_token_accuracy": 0.8989557921886444,
"num_tokens": 26949546.0,
"step": 394
},
{
"epoch": 0.7987866531850354,
"grad_norm": 0.1636972278356552,
"learning_rate": 0.00035029676005966445,
"loss": 0.2884,
"mean_token_accuracy": 0.8981003984808922,
"num_tokens": 27014513.0,
"step": 395
},
{
"epoch": 0.8008088978766431,
"grad_norm": 0.1949148327112198,
"learning_rate": 0.000350025513170756,
"loss": 0.3172,
"mean_token_accuracy": 0.8922760672867298,
"num_tokens": 27076549.0,
"step": 396
},
{
"epoch": 0.8028311425682507,
"grad_norm": 0.18752135336399078,
"learning_rate": 0.0003497536474065254,
"loss": 0.3197,
"mean_token_accuracy": 0.8879435993731022,
"num_tokens": 27143261.0,
"step": 397
},
{
"epoch": 0.8048533872598584,
"grad_norm": 0.18382735550403595,
"learning_rate": 0.0003494811640609572,
"loss": 0.3165,
"mean_token_accuracy": 0.8949453271925449,
"num_tokens": 27208188.0,
"step": 398
},
{
"epoch": 0.8068756319514662,
"grad_norm": 0.1782997101545334,
"learning_rate": 0.0003492080644309756,
"loss": 0.3018,
"mean_token_accuracy": 0.8956249915063381,
"num_tokens": 27279349.0,
"step": 399
},
{
"epoch": 0.8088978766430738,
"grad_norm": 0.16625821590423584,
"learning_rate": 0.0003489343498164378,
"loss": 0.2909,
"mean_token_accuracy": 0.8978218026459217,
"num_tokens": 27349491.0,
"step": 400
},
{
"epoch": 0.8109201213346815,
"grad_norm": 0.2034144103527069,
"learning_rate": 0.0003486600215201284,
"loss": 0.3205,
"mean_token_accuracy": 0.8883098587393761,
"num_tokens": 27425145.0,
"step": 401
},
{
"epoch": 0.8129423660262892,
"grad_norm": 0.18235254287719727,
"learning_rate": 0.0003483850808477527,
"loss": 0.3142,
"mean_token_accuracy": 0.8946905098855495,
"num_tokens": 27493953.0,
"step": 402
},
{
"epoch": 0.8149646107178968,
"grad_norm": 0.16972221434116364,
"learning_rate": 0.00034810952910793085,
"loss": 0.3183,
"mean_token_accuracy": 0.886278223246336,
"num_tokens": 27559794.0,
"step": 403
},
{
"epoch": 0.8169868554095046,
"grad_norm": 0.17891989648342133,
"learning_rate": 0.00034783336761219137,
"loss": 0.2848,
"mean_token_accuracy": 0.8995977118611336,
"num_tokens": 27629989.0,
"step": 404
},
{
"epoch": 0.8190091001011123,
"grad_norm": 0.1790463924407959,
"learning_rate": 0.0003475565976749651,
"loss": 0.3109,
"mean_token_accuracy": 0.8868453428149223,
"num_tokens": 27688846.0,
"step": 405
},
{
"epoch": 0.8210313447927199,
"grad_norm": 0.1789504438638687,
"learning_rate": 0.00034727922061357855,
"loss": 0.3284,
"mean_token_accuracy": 0.8879125751554966,
"num_tokens": 27755235.0,
"step": 406
},
{
"epoch": 0.8230535894843276,
"grad_norm": 0.19450780749320984,
"learning_rate": 0.0003470012377482484,
"loss": 0.3079,
"mean_token_accuracy": 0.8906297236680984,
"num_tokens": 27819736.0,
"step": 407
},
{
"epoch": 0.8250758341759353,
"grad_norm": 0.21135565638542175,
"learning_rate": 0.0003467226504020743,
"loss": 0.3314,
"mean_token_accuracy": 0.8855904154479504,
"num_tokens": 27878648.0,
"step": 408
},
{
"epoch": 0.8270980788675429,
"grad_norm": 0.1756933629512787,
"learning_rate": 0.0003464434599010333,
"loss": 0.3045,
"mean_token_accuracy": 0.8893042095005512,
"num_tokens": 27937967.0,
"step": 409
},
{
"epoch": 0.8291203235591507,
"grad_norm": 0.1893833428621292,
"learning_rate": 0.0003461636675739732,
"loss": 0.3089,
"mean_token_accuracy": 0.8921520821750164,
"num_tokens": 28003500.0,
"step": 410
},
{
"epoch": 0.8311425682507584,
"grad_norm": 0.19579611718654633,
"learning_rate": 0.0003458832747526061,
"loss": 0.2954,
"mean_token_accuracy": 0.8962360806763172,
"num_tokens": 28060691.0,
"step": 411
},
{
"epoch": 0.833164812942366,
"grad_norm": 0.19954101741313934,
"learning_rate": 0.0003456022827715025,
"loss": 0.3057,
"mean_token_accuracy": 0.8955631367862225,
"num_tokens": 28119842.0,
"step": 412
},
{
"epoch": 0.8351870576339737,
"grad_norm": 0.17535583674907684,
"learning_rate": 0.0003453206929680844,
"loss": 0.3181,
"mean_token_accuracy": 0.8896914720535278,
"num_tokens": 28189519.0,
"step": 413
},
{
"epoch": 0.8372093023255814,
"grad_norm": 0.2034400850534439,
"learning_rate": 0.0003450385066826195,
"loss": 0.3132,
"mean_token_accuracy": 0.8903135284781456,
"num_tokens": 28256532.0,
"step": 414
},
{
"epoch": 0.839231547017189,
"grad_norm": 0.18071752786636353,
"learning_rate": 0.0003447557252582145,
"loss": 0.3229,
"mean_token_accuracy": 0.891409307718277,
"num_tokens": 28320211.0,
"step": 415
},
{
"epoch": 0.8412537917087968,
"grad_norm": 0.17119021713733673,
"learning_rate": 0.00034447235004080853,
"loss": 0.3096,
"mean_token_accuracy": 0.8913502097129822,
"num_tokens": 28384204.0,
"step": 416
},
{
"epoch": 0.8432760364004045,
"grad_norm": 0.17320208251476288,
"learning_rate": 0.0003441883823791671,
"loss": 0.2935,
"mean_token_accuracy": 0.8983162231743336,
"num_tokens": 28454515.0,
"step": 417
},
{
"epoch": 0.8452982810920121,
"grad_norm": 0.17323511838912964,
"learning_rate": 0.0003439038236248757,
"loss": 0.3053,
"mean_token_accuracy": 0.8946337774395943,
"num_tokens": 28524571.0,
"step": 418
},
{
"epoch": 0.8473205257836198,
"grad_norm": 0.19488638639450073,
"learning_rate": 0.00034361867513233303,
"loss": 0.3131,
"mean_token_accuracy": 0.8917714729905128,
"num_tokens": 28583638.0,
"step": 419
},
{
"epoch": 0.8493427704752275,
"grad_norm": 0.14881743490695953,
"learning_rate": 0.00034333293825874464,
"loss": 0.2561,
"mean_token_accuracy": 0.9055963829159737,
"num_tokens": 28668101.0,
"step": 420
},
{
"epoch": 0.8513650151668352,
"grad_norm": 0.17198774218559265,
"learning_rate": 0.0003430466143641168,
"loss": 0.3071,
"mean_token_accuracy": 0.8936148509383202,
"num_tokens": 28739207.0,
"step": 421
},
{
"epoch": 0.8533872598584429,
"grad_norm": 0.18449024856090546,
"learning_rate": 0.00034275970481124977,
"loss": 0.276,
"mean_token_accuracy": 0.9006006754934788,
"num_tokens": 28803993.0,
"step": 422
},
{
"epoch": 0.8554095045500506,
"grad_norm": 0.1962573230266571,
"learning_rate": 0.0003424722109657311,
"loss": 0.3059,
"mean_token_accuracy": 0.8941029235720634,
"num_tokens": 28870609.0,
"step": 423
},
{
"epoch": 0.8574317492416582,
"grad_norm": 0.1821158230304718,
"learning_rate": 0.00034218413419592953,
"loss": 0.2905,
"mean_token_accuracy": 0.890890721231699,
"num_tokens": 28943383.0,
"step": 424
},
{
"epoch": 0.8594539939332659,
"grad_norm": 0.18370237946510315,
"learning_rate": 0.00034189547587298836,
"loss": 0.3166,
"mean_token_accuracy": 0.8925870470702648,
"num_tokens": 29009040.0,
"step": 425
},
{
"epoch": 0.8614762386248737,
"grad_norm": 0.17396995425224304,
"learning_rate": 0.00034160623737081885,
"loss": 0.3011,
"mean_token_accuracy": 0.8914640247821808,
"num_tokens": 29074553.0,
"step": 426
},
{
"epoch": 0.8634984833164813,
"grad_norm": 0.19026698172092438,
"learning_rate": 0.00034131642006609365,
"loss": 0.3249,
"mean_token_accuracy": 0.8910115286707878,
"num_tokens": 29147204.0,
"step": 427
},
{
"epoch": 0.865520728008089,
"grad_norm": 0.1589595079421997,
"learning_rate": 0.00034102602533824027,
"loss": 0.2785,
"mean_token_accuracy": 0.903257142752409,
"num_tokens": 29218571.0,
"step": 428
},
{
"epoch": 0.8675429726996967,
"grad_norm": 0.1674802154302597,
"learning_rate": 0.00034073505456943463,
"loss": 0.2977,
"mean_token_accuracy": 0.8929527476429939,
"num_tokens": 29287185.0,
"step": 429
},
{
"epoch": 0.8695652173913043,
"grad_norm": 0.17129530012607574,
"learning_rate": 0.0003404435091445945,
"loss": 0.2769,
"mean_token_accuracy": 0.8992316760122776,
"num_tokens": 29355908.0,
"step": 430
},
{
"epoch": 0.871587462082912,
"grad_norm": 0.1718977391719818,
"learning_rate": 0.00034015139045137253,
"loss": 0.3137,
"mean_token_accuracy": 0.8935650922358036,
"num_tokens": 29421396.0,
"step": 431
},
{
"epoch": 0.8736097067745198,
"grad_norm": 0.17011679708957672,
"learning_rate": 0.00033985869988015016,
"loss": 0.2855,
"mean_token_accuracy": 0.8953105248510838,
"num_tokens": 29493294.0,
"step": 432
},
{
"epoch": 0.8756319514661274,
"grad_norm": 0.1868988573551178,
"learning_rate": 0.0003395654388240307,
"loss": 0.3196,
"mean_token_accuracy": 0.8894085213541985,
"num_tokens": 29555484.0,
"step": 433
},
{
"epoch": 0.8776541961577351,
"grad_norm": 0.15462960302829742,
"learning_rate": 0.0003392716086788328,
"loss": 0.2957,
"mean_token_accuracy": 0.8983679711818695,
"num_tokens": 29623656.0,
"step": 434
},
{
"epoch": 0.8796764408493428,
"grad_norm": 0.16427457332611084,
"learning_rate": 0.0003389772108430835,
"loss": 0.2979,
"mean_token_accuracy": 0.8941413648426533,
"num_tokens": 29690023.0,
"step": 435
},
{
"epoch": 0.8816986855409504,
"grad_norm": 0.1687782257795334,
"learning_rate": 0.00033868224671801243,
"loss": 0.2573,
"mean_token_accuracy": 0.9024456590414047,
"num_tokens": 29756579.0,
"step": 436
},
{
"epoch": 0.8837209302325582,
"grad_norm": 0.1602339744567871,
"learning_rate": 0.00033838671770754393,
"loss": 0.2829,
"mean_token_accuracy": 0.9009444527328014,
"num_tokens": 29823974.0,
"step": 437
},
{
"epoch": 0.8857431749241659,
"grad_norm": 0.17867590487003326,
"learning_rate": 0.00033809062521829135,
"loss": 0.3058,
"mean_token_accuracy": 0.8952712267637253,
"num_tokens": 29896076.0,
"step": 438
},
{
"epoch": 0.8877654196157735,
"grad_norm": 0.20030587911605835,
"learning_rate": 0.0003377939706595499,
"loss": 0.3275,
"mean_token_accuracy": 0.8882710337638855,
"num_tokens": 29959878.0,
"step": 439
},
{
"epoch": 0.8897876643073812,
"grad_norm": 0.18861141800880432,
"learning_rate": 0.00033749675544329007,
"loss": 0.2941,
"mean_token_accuracy": 0.894235398620367,
"num_tokens": 30029419.0,
"step": 440
},
{
"epoch": 0.8918099089989889,
"grad_norm": 0.17503049969673157,
"learning_rate": 0.0003371989809841508,
"loss": 0.2796,
"mean_token_accuracy": 0.8981444463133812,
"num_tokens": 30099980.0,
"step": 441
},
{
"epoch": 0.8938321536905965,
"grad_norm": 0.17344842851161957,
"learning_rate": 0.00033690064869943304,
"loss": 0.2806,
"mean_token_accuracy": 0.9028143547475338,
"num_tokens": 30160123.0,
"step": 442
},
{
"epoch": 0.8958543983822043,
"grad_norm": 0.21486879885196686,
"learning_rate": 0.00033660176000909256,
"loss": 0.3017,
"mean_token_accuracy": 0.8955220691859722,
"num_tokens": 30221040.0,
"step": 443
},
{
"epoch": 0.897876643073812,
"grad_norm": 0.16732099652290344,
"learning_rate": 0.0003363023163357335,
"loss": 0.3038,
"mean_token_accuracy": 0.8961573019623756,
"num_tokens": 30288318.0,
"step": 444
},
{
"epoch": 0.8998988877654196,
"grad_norm": 0.17499873042106628,
"learning_rate": 0.00033600231910460153,
"loss": 0.2942,
"mean_token_accuracy": 0.8975008726119995,
"num_tokens": 30351020.0,
"step": 445
},
{
"epoch": 0.9019211324570273,
"grad_norm": 0.18658067286014557,
"learning_rate": 0.0003357017697435771,
"loss": 0.2997,
"mean_token_accuracy": 0.8956367336213589,
"num_tokens": 30425559.0,
"step": 446
},
{
"epoch": 0.9039433771486349,
"grad_norm": 0.19921845197677612,
"learning_rate": 0.0003354006696831685,
"loss": 0.321,
"mean_token_accuracy": 0.8870183601975441,
"num_tokens": 30487225.0,
"step": 447
},
{
"epoch": 0.9059656218402427,
"grad_norm": 0.15201924741268158,
"learning_rate": 0.00033509902035650527,
"loss": 0.2805,
"mean_token_accuracy": 0.8986309170722961,
"num_tokens": 30566969.0,
"step": 448
},
{
"epoch": 0.9079878665318504,
"grad_norm": 0.14417074620723724,
"learning_rate": 0.00033479682319933124,
"loss": 0.2746,
"mean_token_accuracy": 0.9016837328672409,
"num_tokens": 30657549.0,
"step": 449
},
{
"epoch": 0.910010111223458,
"grad_norm": 0.20164437592029572,
"learning_rate": 0.00033449407964999755,
"loss": 0.307,
"mean_token_accuracy": 0.8908158242702484,
"num_tokens": 30719396.0,
"step": 450
},
{
"epoch": 0.9120323559150657,
"grad_norm": 0.15949569642543793,
"learning_rate": 0.0003341907911494562,
"loss": 0.2813,
"mean_token_accuracy": 0.8971740826964378,
"num_tokens": 30796942.0,
"step": 451
},
{
"epoch": 0.9140546006066734,
"grad_norm": 0.18862098455429077,
"learning_rate": 0.0003338869591412529,
"loss": 0.3339,
"mean_token_accuracy": 0.8874437399208546,
"num_tokens": 30858913.0,
"step": 452
},
{
"epoch": 0.916076845298281,
"grad_norm": 0.19091889262199402,
"learning_rate": 0.0003335825850715203,
"loss": 0.3099,
"mean_token_accuracy": 0.8915912732481956,
"num_tokens": 30923946.0,
"step": 453
},
{
"epoch": 0.9180990899898888,
"grad_norm": 0.17616930603981018,
"learning_rate": 0.0003332776703889708,
"loss": 0.302,
"mean_token_accuracy": 0.8977428935468197,
"num_tokens": 30991635.0,
"step": 454
},
{
"epoch": 0.9201213346814965,
"grad_norm": 0.16347502171993256,
"learning_rate": 0.00033297221654489026,
"loss": 0.2968,
"mean_token_accuracy": 0.8974283151328564,
"num_tokens": 31065527.0,
"step": 455
},
{
"epoch": 0.9221435793731041,
"grad_norm": 0.15494075417518616,
"learning_rate": 0.0003326662249931307,
"loss": 0.2745,
"mean_token_accuracy": 0.9003672078251839,
"num_tokens": 31139389.0,
"step": 456
},
{
"epoch": 0.9241658240647118,
"grad_norm": 0.14488424360752106,
"learning_rate": 0.0003323596971901032,
"loss": 0.2315,
"mean_token_accuracy": 0.9032083451747894,
"num_tokens": 31211644.0,
"step": 457
},
{
"epoch": 0.9261880687563195,
"grad_norm": 0.17343585193157196,
"learning_rate": 0.0003320526345947716,
"loss": 0.2834,
"mean_token_accuracy": 0.8971737772226334,
"num_tokens": 31281551.0,
"step": 458
},
{
"epoch": 0.9282103134479271,
"grad_norm": 0.20809240639209747,
"learning_rate": 0.0003317450386686447,
"loss": 0.3392,
"mean_token_accuracy": 0.8834185339510441,
"num_tokens": 31339866.0,
"step": 459
},
{
"epoch": 0.9302325581395349,
"grad_norm": 0.1745264083147049,
"learning_rate": 0.00033143691087577016,
"loss": 0.3135,
"mean_token_accuracy": 0.8907811567187309,
"num_tokens": 31397435.0,
"step": 460
},
{
"epoch": 0.9322548028311426,
"grad_norm": 0.19855932891368866,
"learning_rate": 0.00033112825268272693,
"loss": 0.2874,
"mean_token_accuracy": 0.9011034667491913,
"num_tokens": 31477769.0,
"step": 461
},
{
"epoch": 0.9342770475227502,
"grad_norm": 0.18550598621368408,
"learning_rate": 0.0003308190655586185,
"loss": 0.3026,
"mean_token_accuracy": 0.8910555392503738,
"num_tokens": 31543808.0,
"step": 462
},
{
"epoch": 0.9362992922143579,
"grad_norm": 0.17249254882335663,
"learning_rate": 0.000330509350975066,
"loss": 0.2988,
"mean_token_accuracy": 0.8944742307066917,
"num_tokens": 31608876.0,
"step": 463
},
{
"epoch": 0.9383215369059656,
"grad_norm": 0.15075324475765228,
"learning_rate": 0.0003301991104062009,
"loss": 0.272,
"mean_token_accuracy": 0.90623002871871,
"num_tokens": 31680601.0,
"step": 464
},
{
"epoch": 0.9403437815975733,
"grad_norm": 0.18637825548648834,
"learning_rate": 0.00032988834532865827,
"loss": 0.3234,
"mean_token_accuracy": 0.8885620683431625,
"num_tokens": 31747402.0,
"step": 465
},
{
"epoch": 0.942366026289181,
"grad_norm": 0.1554325670003891,
"learning_rate": 0.0003295770572215697,
"loss": 0.2836,
"mean_token_accuracy": 0.9002716057002544,
"num_tokens": 31818720.0,
"step": 466
},
{
"epoch": 0.9443882709807887,
"grad_norm": 0.17428986728191376,
"learning_rate": 0.00032926524756655615,
"loss": 0.2917,
"mean_token_accuracy": 0.8964979350566864,
"num_tokens": 31891824.0,
"step": 467
},
{
"epoch": 0.9464105156723963,
"grad_norm": 0.16667652130126953,
"learning_rate": 0.000328952917847721,
"loss": 0.2742,
"mean_token_accuracy": 0.901694979518652,
"num_tokens": 31967670.0,
"step": 468
},
{
"epoch": 0.948432760364004,
"grad_norm": 0.17575259506702423,
"learning_rate": 0.00032864006955164287,
"loss": 0.3164,
"mean_token_accuracy": 0.8907586932182312,
"num_tokens": 32033261.0,
"step": 469
},
{
"epoch": 0.9504550050556118,
"grad_norm": 0.17919106781482697,
"learning_rate": 0.0003283267041673687,
"loss": 0.303,
"mean_token_accuracy": 0.8939293213188648,
"num_tokens": 32096462.0,
"step": 470
},
{
"epoch": 0.9524772497472194,
"grad_norm": 0.18951061367988586,
"learning_rate": 0.0003280128231864066,
"loss": 0.3249,
"mean_token_accuracy": 0.8879147619009018,
"num_tokens": 32157870.0,
"step": 471
},
{
"epoch": 0.9544994944388271,
"grad_norm": 0.1526096761226654,
"learning_rate": 0.0003276984281027186,
"loss": 0.2505,
"mean_token_accuracy": 0.9095052257180214,
"num_tokens": 32236445.0,
"step": 472
},
{
"epoch": 0.9565217391304348,
"grad_norm": 0.16995003819465637,
"learning_rate": 0.00032738352041271395,
"loss": 0.3174,
"mean_token_accuracy": 0.8889270462095737,
"num_tokens": 32304171.0,
"step": 473
},
{
"epoch": 0.9585439838220424,
"grad_norm": 0.16517885029315948,
"learning_rate": 0.0003270681016152414,
"loss": 0.3144,
"mean_token_accuracy": 0.8923964686691761,
"num_tokens": 32372702.0,
"step": 474
},
{
"epoch": 0.9605662285136501,
"grad_norm": 0.18384018540382385,
"learning_rate": 0.00032675217321158264,
"loss": 0.2903,
"mean_token_accuracy": 0.8964046128094196,
"num_tokens": 32442132.0,
"step": 475
},
{
"epoch": 0.9625884732052579,
"grad_norm": 0.1601627767086029,
"learning_rate": 0.0003264357367054449,
"loss": 0.2766,
"mean_token_accuracy": 0.9007900506258011,
"num_tokens": 32514430.0,
"step": 476
},
{
"epoch": 0.9646107178968655,
"grad_norm": 0.18358251452445984,
"learning_rate": 0.00032611879360295345,
"loss": 0.2927,
"mean_token_accuracy": 0.8977400958538055,
"num_tokens": 32579788.0,
"step": 477
},
{
"epoch": 0.9666329625884732,
"grad_norm": 0.2047470211982727,
"learning_rate": 0.0003258013454126452,
"loss": 0.3131,
"mean_token_accuracy": 0.8929316326975822,
"num_tokens": 32642283.0,
"step": 478
},
{
"epoch": 0.9686552072800809,
"grad_norm": 0.1662026345729828,
"learning_rate": 0.0003254833936454609,
"loss": 0.2841,
"mean_token_accuracy": 0.8985595107078552,
"num_tokens": 32709386.0,
"step": 479
},
{
"epoch": 0.9706774519716885,
"grad_norm": 0.1934393048286438,
"learning_rate": 0.00032516493981473826,
"loss": 0.2869,
"mean_token_accuracy": 0.8976165167987347,
"num_tokens": 32778573.0,
"step": 480
},
{
"epoch": 0.9726996966632963,
"grad_norm": 0.1651667058467865,
"learning_rate": 0.0003248459854362044,
"loss": 0.2993,
"mean_token_accuracy": 0.893569964915514,
"num_tokens": 32853785.0,
"step": 481
},
{
"epoch": 0.974721941354904,
"grad_norm": 0.18779976665973663,
"learning_rate": 0.00032452653202796915,
"loss": 0.3223,
"mean_token_accuracy": 0.8855483829975128,
"num_tokens": 32917542.0,
"step": 482
},
{
"epoch": 0.9767441860465116,
"grad_norm": 0.14583131670951843,
"learning_rate": 0.00032420658111051746,
"loss": 0.2772,
"mean_token_accuracy": 0.8998262621462345,
"num_tokens": 32987391.0,
"step": 483
},
{
"epoch": 0.9787664307381193,
"grad_norm": 0.23910751938819885,
"learning_rate": 0.00032388613420670213,
"loss": 0.3257,
"mean_token_accuracy": 0.8845948688685894,
"num_tokens": 33053804.0,
"step": 484
},
{
"epoch": 0.980788675429727,
"grad_norm": 0.1679566651582718,
"learning_rate": 0.00032356519284173666,
"loss": 0.2988,
"mean_token_accuracy": 0.8954810760915279,
"num_tokens": 33123281.0,
"step": 485
},
{
"epoch": 0.9828109201213346,
"grad_norm": 0.17945775389671326,
"learning_rate": 0.0003232437585431883,
"loss": 0.3127,
"mean_token_accuracy": 0.8931021988391876,
"num_tokens": 33188358.0,
"step": 486
},
{
"epoch": 0.9848331648129424,
"grad_norm": 0.18727077543735504,
"learning_rate": 0.00032292183284097023,
"loss": 0.3259,
"mean_token_accuracy": 0.8901765421032906,
"num_tokens": 33251289.0,
"step": 487
},
{
"epoch": 0.9868554095045501,
"grad_norm": 0.1629391312599182,
"learning_rate": 0.0003225994172673346,
"loss": 0.3004,
"mean_token_accuracy": 0.8926238007843494,
"num_tokens": 33322968.0,
"step": 488
},
{
"epoch": 0.9888776541961577,
"grad_norm": 0.1630707085132599,
"learning_rate": 0.00032227651335686513,
"loss": 0.2809,
"mean_token_accuracy": 0.9002612978219986,
"num_tokens": 33393350.0,
"step": 489
},
{
"epoch": 0.9908998988877654,
"grad_norm": 0.17929117381572723,
"learning_rate": 0.0003219531226464699,
"loss": 0.3214,
"mean_token_accuracy": 0.8894147910177708,
"num_tokens": 33458431.0,
"step": 490
},
{
"epoch": 0.9929221435793731,
"grad_norm": 0.1639278680086136,
"learning_rate": 0.00032162924667537406,
"loss": 0.2891,
"mean_token_accuracy": 0.8945626839995384,
"num_tokens": 33526451.0,
"step": 491
},
{
"epoch": 0.9949443882709808,
"grad_norm": 0.1808111071586609,
"learning_rate": 0.0003213048869851124,
"loss": 0.2965,
"mean_token_accuracy": 0.8966854028403759,
"num_tokens": 33589564.0,
"step": 492
},
{
"epoch": 0.9969666329625885,
"grad_norm": 0.1905975043773651,
"learning_rate": 0.00032098004511952184,
"loss": 0.3017,
"mean_token_accuracy": 0.8935710862278938,
"num_tokens": 33649359.0,
"step": 493
},
{
"epoch": 0.9989888776541962,
"grad_norm": 0.17898094654083252,
"learning_rate": 0.00032065472262473443,
"loss": 0.3193,
"mean_token_accuracy": 0.8906168565154076,
"num_tokens": 33721593.0,
"step": 494
},
{
"epoch": 1.0,
"grad_norm": 0.22628654539585114,
"learning_rate": 0.00032032892104917,
"loss": 0.3083,
"mean_token_accuracy": 0.8914947211742401,
"num_tokens": 33755641.0,
"step": 495
},
{
"epoch": 1.0020222446916076,
"grad_norm": 0.13782188296318054,
"learning_rate": 0.00032000264194352845,
"loss": 0.2663,
"mean_token_accuracy": 0.8996973298490047,
"num_tokens": 33834819.0,
"step": 496
},
{
"epoch": 1.0040444893832154,
"grad_norm": 0.17569021880626678,
"learning_rate": 0.0003196758868607825,
"loss": 0.2952,
"mean_token_accuracy": 0.8985786736011505,
"num_tokens": 33902435.0,
"step": 497
},
{
"epoch": 1.006066734074823,
"grad_norm": 0.2067909836769104,
"learning_rate": 0.0003193486573561705,
"loss": 0.3225,
"mean_token_accuracy": 0.8876040019094944,
"num_tokens": 33965666.0,
"step": 498
},
{
"epoch": 1.0080889787664307,
"grad_norm": 0.16878552734851837,
"learning_rate": 0.0003190209549871888,
"loss": 0.2942,
"mean_token_accuracy": 0.8955768346786499,
"num_tokens": 34032445.0,
"step": 499
},
{
"epoch": 1.0101112234580385,
"grad_norm": 0.15274177491664886,
"learning_rate": 0.00031869278131358455,
"loss": 0.2427,
"mean_token_accuracy": 0.9117574766278267,
"num_tokens": 34114342.0,
"step": 500
},
{
"epoch": 1.012133468149646,
"grad_norm": 0.22229406237602234,
"learning_rate": 0.0003183641378973478,
"loss": 0.2961,
"mean_token_accuracy": 0.8931870721280575,
"num_tokens": 34170031.0,
"step": 501
},
{
"epoch": 1.0141557128412537,
"grad_norm": 0.17795279622077942,
"learning_rate": 0.0003180350263027049,
"loss": 0.2921,
"mean_token_accuracy": 0.8974833749234676,
"num_tokens": 34232994.0,
"step": 502
},
{
"epoch": 1.0161779575328616,
"grad_norm": 0.1530430167913437,
"learning_rate": 0.0003177054480961101,
"loss": 0.2587,
"mean_token_accuracy": 0.8979953937232494,
"num_tokens": 34306018.0,
"step": 503
},
{
"epoch": 1.0182002022244692,
"grad_norm": 0.17740803956985474,
"learning_rate": 0.00031737540484623895,
"loss": 0.3102,
"mean_token_accuracy": 0.8884628489613533,
"num_tokens": 34374661.0,
"step": 504
},
{
"epoch": 1.0202224469160768,
"grad_norm": 0.177719384431839,
"learning_rate": 0.00031704489812398013,
"loss": 0.2953,
"mean_token_accuracy": 0.8939866498112679,
"num_tokens": 34438514.0,
"step": 505
},
{
"epoch": 1.0222446916076846,
"grad_norm": 0.168897345662117,
"learning_rate": 0.00031671392950242836,
"loss": 0.269,
"mean_token_accuracy": 0.9047276936471462,
"num_tokens": 34505982.0,
"step": 506
},
{
"epoch": 1.0242669362992922,
"grad_norm": 0.15597204864025116,
"learning_rate": 0.0003163825005568769,
"loss": 0.2585,
"mean_token_accuracy": 0.9080711491405964,
"num_tokens": 34578668.0,
"step": 507
},
{
"epoch": 1.0262891809908998,
"grad_norm": 0.17869000136852264,
"learning_rate": 0.00031605061286481013,
"loss": 0.3069,
"mean_token_accuracy": 0.8951312974095345,
"num_tokens": 34649274.0,
"step": 508
},
{
"epoch": 1.0283114256825077,
"grad_norm": 0.15539689362049103,
"learning_rate": 0.0003157182680058955,
"loss": 0.2495,
"mean_token_accuracy": 0.9083127416670322,
"num_tokens": 34727319.0,
"step": 509
},
{
"epoch": 1.0303336703741153,
"grad_norm": 0.18144549429416656,
"learning_rate": 0.00031538546756197693,
"loss": 0.2856,
"mean_token_accuracy": 0.9019791670143604,
"num_tokens": 34797454.0,
"step": 510
},
{
"epoch": 1.0323559150657229,
"grad_norm": 0.18584753572940826,
"learning_rate": 0.0003150522131170663,
"loss": 0.2954,
"mean_token_accuracy": 0.8972033709287643,
"num_tokens": 34864905.0,
"step": 511
},
{
"epoch": 1.0343781597573307,
"grad_norm": 0.19840823113918304,
"learning_rate": 0.0003147185062573365,
"loss": 0.28,
"mean_token_accuracy": 0.901741374284029,
"num_tokens": 34928661.0,
"step": 512
},
{
"epoch": 1.0364004044489383,
"grad_norm": 0.14095668494701385,
"learning_rate": 0.00031438434857111405,
"loss": 0.2666,
"mean_token_accuracy": 0.9036082923412323,
"num_tokens": 35002573.0,
"step": 513
},
{
"epoch": 1.038422649140546,
"grad_norm": 0.13482429087162018,
"learning_rate": 0.0003140497416488708,
"loss": 0.2603,
"mean_token_accuracy": 0.9059791043400764,
"num_tokens": 35083602.0,
"step": 514
},
{
"epoch": 1.0404448938321538,
"grad_norm": 0.20816905796527863,
"learning_rate": 0.00031371468708321713,
"loss": 0.3049,
"mean_token_accuracy": 0.8949435539543629,
"num_tokens": 35150470.0,
"step": 515
},
{
"epoch": 1.0424671385237614,
"grad_norm": 0.17933416366577148,
"learning_rate": 0.0003133791864688939,
"loss": 0.2972,
"mean_token_accuracy": 0.8948968909680843,
"num_tokens": 35216813.0,
"step": 516
},
{
"epoch": 1.044489383215369,
"grad_norm": 0.17087870836257935,
"learning_rate": 0.00031304324140276496,
"loss": 0.2891,
"mean_token_accuracy": 0.8967925682663918,
"num_tokens": 35287089.0,
"step": 517
},
{
"epoch": 1.0465116279069768,
"grad_norm": 0.19874465465545654,
"learning_rate": 0.0003127068534838098,
"loss": 0.2864,
"mean_token_accuracy": 0.8976041786372662,
"num_tokens": 35348784.0,
"step": 518
},
{
"epoch": 1.0485338725985844,
"grad_norm": 0.17467646300792694,
"learning_rate": 0.0003123700243131155,
"loss": 0.2742,
"mean_token_accuracy": 0.9038321636617184,
"num_tokens": 35430257.0,
"step": 519
},
{
"epoch": 1.050556117290192,
"grad_norm": 0.20859748125076294,
"learning_rate": 0.00031203275549386935,
"loss": 0.29,
"mean_token_accuracy": 0.8973617292940617,
"num_tokens": 35492098.0,
"step": 520
},
{
"epoch": 1.0525783619817999,
"grad_norm": 0.1560591757297516,
"learning_rate": 0.00031169504863135157,
"loss": 0.2593,
"mean_token_accuracy": 0.9061496220529079,
"num_tokens": 35578894.0,
"step": 521
},
{
"epoch": 1.0546006066734075,
"grad_norm": 0.17322826385498047,
"learning_rate": 0.0003113569053329268,
"loss": 0.2656,
"mean_token_accuracy": 0.9077408090233803,
"num_tokens": 35658590.0,
"step": 522
},
{
"epoch": 1.056622851365015,
"grad_norm": 0.16736696660518646,
"learning_rate": 0.0003110183272080373,
"loss": 0.2647,
"mean_token_accuracy": 0.9043499119579792,
"num_tokens": 35722339.0,
"step": 523
},
{
"epoch": 1.058645096056623,
"grad_norm": 0.20183323323726654,
"learning_rate": 0.00031067931586819473,
"loss": 0.2937,
"mean_token_accuracy": 0.8954190462827682,
"num_tokens": 35782293.0,
"step": 524
},
{
"epoch": 1.0606673407482305,
"grad_norm": 0.16886426508426666,
"learning_rate": 0.000310339872926973,
"loss": 0.2841,
"mean_token_accuracy": 0.9006736651062965,
"num_tokens": 35849795.0,
"step": 525
},
{
"epoch": 1.0626895854398382,
"grad_norm": 0.16396957635879517,
"learning_rate": 0.00031,
"loss": 0.2747,
"mean_token_accuracy": 0.9040698818862438,
"num_tokens": 35926179.0,
"step": 526
},
{
"epoch": 1.064711830131446,
"grad_norm": 0.17668411135673523,
"learning_rate": 0.00030965969870495034,
"loss": 0.293,
"mean_token_accuracy": 0.8949432447552681,
"num_tokens": 35992037.0,
"step": 527
},
{
"epoch": 1.0667340748230536,
"grad_norm": 0.16346760094165802,
"learning_rate": 0.0003093189706615375,
"loss": 0.2524,
"mean_token_accuracy": 0.9064350612461567,
"num_tokens": 36060378.0,
"step": 528
},
{
"epoch": 1.0687563195146612,
"grad_norm": 0.17525459825992584,
"learning_rate": 0.000308977817491506,
"loss": 0.2943,
"mean_token_accuracy": 0.8935273364186287,
"num_tokens": 36126013.0,
"step": 529
},
{
"epoch": 1.070778564206269,
"grad_norm": 0.16501343250274658,
"learning_rate": 0.00030863624081862415,
"loss": 0.2789,
"mean_token_accuracy": 0.8968185931444168,
"num_tokens": 36196795.0,
"step": 530
},
{
"epoch": 1.0728008088978767,
"grad_norm": 0.16026921570301056,
"learning_rate": 0.0003082942422686754,
"loss": 0.2671,
"mean_token_accuracy": 0.9082406982779503,
"num_tokens": 36275178.0,
"step": 531
},
{
"epoch": 1.0748230535894843,
"grad_norm": 0.19023281335830688,
"learning_rate": 0.0003079518234694519,
"loss": 0.3116,
"mean_token_accuracy": 0.8914121352136135,
"num_tokens": 36338049.0,
"step": 532
},
{
"epoch": 1.076845298281092,
"grad_norm": 0.18959233164787292,
"learning_rate": 0.00030760898605074546,
"loss": 0.2626,
"mean_token_accuracy": 0.9018443673849106,
"num_tokens": 36420122.0,
"step": 533
},
{
"epoch": 1.0788675429726997,
"grad_norm": 0.18601641058921814,
"learning_rate": 0.00030726573164434074,
"loss": 0.2946,
"mean_token_accuracy": 0.8955305181443691,
"num_tokens": 36486673.0,
"step": 534
},
{
"epoch": 1.0808897876643073,
"grad_norm": 0.17861206829547882,
"learning_rate": 0.0003069220618840067,
"loss": 0.2638,
"mean_token_accuracy": 0.9000630341470242,
"num_tokens": 36548189.0,
"step": 535
},
{
"epoch": 1.0829120323559152,
"grad_norm": 0.16839022934436798,
"learning_rate": 0.0003065779784054898,
"loss": 0.2821,
"mean_token_accuracy": 0.901892576366663,
"num_tokens": 36619289.0,
"step": 536
},
{
"epoch": 1.0849342770475228,
"grad_norm": 0.16797274351119995,
"learning_rate": 0.0003062334828465052,
"loss": 0.2722,
"mean_token_accuracy": 0.901667632162571,
"num_tokens": 36690144.0,
"step": 537
},
{
"epoch": 1.0869565217391304,
"grad_norm": 0.1743130087852478,
"learning_rate": 0.00030588857684672955,
"loss": 0.2567,
"mean_token_accuracy": 0.9072123803198338,
"num_tokens": 36761617.0,
"step": 538
},
{
"epoch": 1.0889787664307382,
"grad_norm": 0.1802840232849121,
"learning_rate": 0.0003055432620477931,
"loss": 0.2822,
"mean_token_accuracy": 0.8998791016638279,
"num_tokens": 36828873.0,
"step": 539
},
{
"epoch": 1.0910010111223458,
"grad_norm": 0.19156496226787567,
"learning_rate": 0.00030519754009327186,
"loss": 0.3002,
"mean_token_accuracy": 0.8940830379724503,
"num_tokens": 36893847.0,
"step": 540
},
{
"epoch": 1.0930232558139534,
"grad_norm": 0.18583235144615173,
"learning_rate": 0.0003048514126286796,
"loss": 0.2692,
"mean_token_accuracy": 0.9024544768035412,
"num_tokens": 36963240.0,
"step": 541
},
{
"epoch": 1.0950455005055613,
"grad_norm": 0.17397500574588776,
"learning_rate": 0.00030450488130146034,
"loss": 0.2691,
"mean_token_accuracy": 0.9022202827036381,
"num_tokens": 37026381.0,
"step": 542
},
{
"epoch": 1.0970677451971689,
"grad_norm": 0.24742691218852997,
"learning_rate": 0.0003041579477609803,
"loss": 0.3287,
"mean_token_accuracy": 0.8853081800043583,
"num_tokens": 37085095.0,
"step": 543
},
{
"epoch": 1.0990899898887765,
"grad_norm": 0.16266337037086487,
"learning_rate": 0.00030381061365852006,
"loss": 0.2669,
"mean_token_accuracy": 0.908314511179924,
"num_tokens": 37156057.0,
"step": 544
},
{
"epoch": 1.1011122345803843,
"grad_norm": 0.1805969476699829,
"learning_rate": 0.00030346288064726676,
"loss": 0.2762,
"mean_token_accuracy": 0.9019368290901184,
"num_tokens": 37218048.0,
"step": 545
},
{
"epoch": 1.103134479271992,
"grad_norm": 0.2024918794631958,
"learning_rate": 0.00030311475038230615,
"loss": 0.2948,
"mean_token_accuracy": 0.8978271037340164,
"num_tokens": 37283475.0,
"step": 546
},
{
"epoch": 1.1051567239635995,
"grad_norm": 0.16442124545574188,
"learning_rate": 0.00030276622452061477,
"loss": 0.2746,
"mean_token_accuracy": 0.9010177366435528,
"num_tokens": 37358871.0,
"step": 547
},
{
"epoch": 1.1071789686552074,
"grad_norm": 0.17242524027824402,
"learning_rate": 0.0003024173047210522,
"loss": 0.2975,
"mean_token_accuracy": 0.8940832912921906,
"num_tokens": 37421863.0,
"step": 548
},
{
"epoch": 1.109201213346815,
"grad_norm": 0.2123114913702011,
"learning_rate": 0.00030206799264435294,
"loss": 0.3084,
"mean_token_accuracy": 0.8925547078251839,
"num_tokens": 37486615.0,
"step": 549
},
{
"epoch": 1.1112234580384226,
"grad_norm": 0.16941364109516144,
"learning_rate": 0.00030171828995311845,
"loss": 0.2997,
"mean_token_accuracy": 0.8960695490241051,
"num_tokens": 37556657.0,
"step": 550
},
{
"epoch": 1.1132457027300304,
"grad_norm": 0.18581314384937286,
"learning_rate": 0.0003013681983118096,
"loss": 0.3056,
"mean_token_accuracy": 0.8949491046369076,
"num_tokens": 37623124.0,
"step": 551
},
{
"epoch": 1.115267947421638,
"grad_norm": 0.17790380120277405,
"learning_rate": 0.0003010177193867383,
"loss": 0.2849,
"mean_token_accuracy": 0.8990210555493832,
"num_tokens": 37688876.0,
"step": 552
},
{
"epoch": 1.1172901921132457,
"grad_norm": 0.17190231382846832,
"learning_rate": 0.00030066685484606004,
"loss": 0.2805,
"mean_token_accuracy": 0.8991851061582565,
"num_tokens": 37757188.0,
"step": 553
},
{
"epoch": 1.1193124368048535,
"grad_norm": 0.17098551988601685,
"learning_rate": 0.00030031560635976557,
"loss": 0.2809,
"mean_token_accuracy": 0.8985818810760975,
"num_tokens": 37822088.0,
"step": 554
},
{
"epoch": 1.121334681496461,
"grad_norm": 0.16426457464694977,
"learning_rate": 0.0002999639755996731,
"loss": 0.271,
"mean_token_accuracy": 0.9015116766095161,
"num_tokens": 37885778.0,
"step": 555
},
{
"epoch": 1.1233569261880687,
"grad_norm": 0.16016022861003876,
"learning_rate": 0.00029961196423942027,
"loss": 0.2436,
"mean_token_accuracy": 0.9075723215937614,
"num_tokens": 37956105.0,
"step": 556
},
{
"epoch": 1.1253791708796763,
"grad_norm": 0.17624878883361816,
"learning_rate": 0.0002992595739544563,
"loss": 0.2851,
"mean_token_accuracy": 0.8980127796530724,
"num_tokens": 38022057.0,
"step": 557
},
{
"epoch": 1.1274014155712841,
"grad_norm": 0.2018936723470688,
"learning_rate": 0.00029890680642203395,
"loss": 0.2971,
"mean_token_accuracy": 0.8927877955138683,
"num_tokens": 38088320.0,
"step": 558
},
{
"epoch": 1.1294236602628918,
"grad_norm": 0.19130869209766388,
"learning_rate": 0.0002985536633212016,
"loss": 0.2797,
"mean_token_accuracy": 0.8997831009328365,
"num_tokens": 38149395.0,
"step": 559
},
{
"epoch": 1.1314459049544996,
"grad_norm": 0.19779284298419952,
"learning_rate": 0.0002982001463327951,
"loss": 0.3127,
"mean_token_accuracy": 0.8897297792136669,
"num_tokens": 38211779.0,
"step": 560
},
{
"epoch": 1.1334681496461072,
"grad_norm": 0.1628047674894333,
"learning_rate": 0.0002978462571394299,
"loss": 0.2637,
"mean_token_accuracy": 0.9051007218658924,
"num_tokens": 38279919.0,
"step": 561
},
{
"epoch": 1.1354903943377148,
"grad_norm": 0.1489226073026657,
"learning_rate": 0.00029749199742549315,
"loss": 0.2525,
"mean_token_accuracy": 0.9131556376814842,
"num_tokens": 38348885.0,
"step": 562
},
{
"epoch": 1.1375126390293224,
"grad_norm": 0.16562367975711823,
"learning_rate": 0.0002971373688771353,
"loss": 0.2804,
"mean_token_accuracy": 0.9060126468539238,
"num_tokens": 38414361.0,
"step": 563
},
{
"epoch": 1.1395348837209303,
"grad_norm": 0.18426918983459473,
"learning_rate": 0.00029678237318226254,
"loss": 0.3034,
"mean_token_accuracy": 0.8923818841576576,
"num_tokens": 38478031.0,
"step": 564
},
{
"epoch": 1.1415571284125379,
"grad_norm": 0.18996812403202057,
"learning_rate": 0.0002964270120305284,
"loss": 0.3118,
"mean_token_accuracy": 0.8920970819890499,
"num_tokens": 38537650.0,
"step": 565
},
{
"epoch": 1.1435793731041457,
"grad_norm": 0.1744386851787567,
"learning_rate": 0.0002960712871133259,
"loss": 0.3105,
"mean_token_accuracy": 0.8955930359661579,
"num_tokens": 38599799.0,
"step": 566
},
{
"epoch": 1.1456016177957533,
"grad_norm": 0.1756746470928192,
"learning_rate": 0.0002957152001237796,
"loss": 0.2879,
"mean_token_accuracy": 0.8998842090368271,
"num_tokens": 38665696.0,
"step": 567
},
{
"epoch": 1.147623862487361,
"grad_norm": 0.17731311917304993,
"learning_rate": 0.00029535875275673706,
"loss": 0.3028,
"mean_token_accuracy": 0.896138958632946,
"num_tokens": 38736012.0,
"step": 568
},
{
"epoch": 1.1496461071789685,
"grad_norm": 0.16211020946502686,
"learning_rate": 0.00029500194670876155,
"loss": 0.2661,
"mean_token_accuracy": 0.9007462747395039,
"num_tokens": 38813042.0,
"step": 569
},
{
"epoch": 1.1516683518705764,
"grad_norm": 0.16605907678604126,
"learning_rate": 0.00029464478367812304,
"loss": 0.2708,
"mean_token_accuracy": 0.9033683091402054,
"num_tokens": 38884323.0,
"step": 570
},
{
"epoch": 1.153690596562184,
"grad_norm": 0.16346529126167297,
"learning_rate": 0.0002942872653647911,
"loss": 0.2787,
"mean_token_accuracy": 0.8993464335799217,
"num_tokens": 38954581.0,
"step": 571
},
{
"epoch": 1.1557128412537918,
"grad_norm": 0.1715569943189621,
"learning_rate": 0.0002939293934704259,
"loss": 0.2876,
"mean_token_accuracy": 0.899021927267313,
"num_tokens": 39024859.0,
"step": 572
},
{
"epoch": 1.1577350859453994,
"grad_norm": 0.1708040088415146,
"learning_rate": 0.00029357116969837093,
"loss": 0.2716,
"mean_token_accuracy": 0.9040286540985107,
"num_tokens": 39084032.0,
"step": 573
},
{
"epoch": 1.159757330637007,
"grad_norm": 0.15547077357769012,
"learning_rate": 0.00029321259575364406,
"loss": 0.2876,
"mean_token_accuracy": 0.9014556109905243,
"num_tokens": 39158216.0,
"step": 574
},
{
"epoch": 1.1617795753286146,
"grad_norm": 0.1835734099149704,
"learning_rate": 0.0002928536733429302,
"loss": 0.2904,
"mean_token_accuracy": 0.8962517976760864,
"num_tokens": 39219228.0,
"step": 575
},
{
"epoch": 1.1638018200202225,
"grad_norm": 0.21164695918560028,
"learning_rate": 0.00029249440417457274,
"loss": 0.3095,
"mean_token_accuracy": 0.8903193324804306,
"num_tokens": 39279145.0,
"step": 576
},
{
"epoch": 1.16582406471183,
"grad_norm": 0.16395002603530884,
"learning_rate": 0.00029213478995856535,
"loss": 0.2658,
"mean_token_accuracy": 0.9063084498047829,
"num_tokens": 39346035.0,
"step": 577
},
{
"epoch": 1.167846309403438,
"grad_norm": 0.15447662770748138,
"learning_rate": 0.0002917748324065443,
"loss": 0.2609,
"mean_token_accuracy": 0.9043813906610012,
"num_tokens": 39419464.0,
"step": 578
},
{
"epoch": 1.1698685540950455,
"grad_norm": 0.18628905713558197,
"learning_rate": 0.0002914145332317798,
"loss": 0.3079,
"mean_token_accuracy": 0.892396155744791,
"num_tokens": 39476986.0,
"step": 579
},
{
"epoch": 1.1718907987866531,
"grad_norm": 0.15657448768615723,
"learning_rate": 0.0002910538941491681,
"loss": 0.2596,
"mean_token_accuracy": 0.9103246405720711,
"num_tokens": 39547007.0,
"step": 580
},
{
"epoch": 1.1739130434782608,
"grad_norm": 0.16723878681659698,
"learning_rate": 0.00029069291687522337,
"loss": 0.2578,
"mean_token_accuracy": 0.9113052189350128,
"num_tokens": 39615140.0,
"step": 581
},
{
"epoch": 1.1759352881698686,
"grad_norm": 0.21382521092891693,
"learning_rate": 0.00029033160312806925,
"loss": 0.2843,
"mean_token_accuracy": 0.9006746262311935,
"num_tokens": 39676629.0,
"step": 582
},
{
"epoch": 1.1779575328614762,
"grad_norm": 0.17140787839889526,
"learning_rate": 0.0002899699546274312,
"loss": 0.2973,
"mean_token_accuracy": 0.8942140191793442,
"num_tokens": 39744182.0,
"step": 583
},
{
"epoch": 1.179979777553084,
"grad_norm": 0.16415606439113617,
"learning_rate": 0.0002896079730946277,
"loss": 0.248,
"mean_token_accuracy": 0.9046668969094753,
"num_tokens": 39809087.0,
"step": 584
},
{
"epoch": 1.1820020222446916,
"grad_norm": 0.15275758504867554,
"learning_rate": 0.0002892456602525625,
"loss": 0.2528,
"mean_token_accuracy": 0.9055165685713291,
"num_tokens": 39883376.0,
"step": 585
},
{
"epoch": 1.1840242669362993,
"grad_norm": 0.1598130762577057,
"learning_rate": 0.00028888301782571614,
"loss": 0.2571,
"mean_token_accuracy": 0.9055753275752068,
"num_tokens": 39950688.0,
"step": 586
},
{
"epoch": 1.1860465116279069,
"grad_norm": 0.16630232334136963,
"learning_rate": 0.000288520047540138,
"loss": 0.2857,
"mean_token_accuracy": 0.9000633843243122,
"num_tokens": 40015260.0,
"step": 587
},
{
"epoch": 1.1880687563195147,
"grad_norm": 0.19941283762454987,
"learning_rate": 0.00028815675112343794,
"loss": 0.2954,
"mean_token_accuracy": 0.8945838250219822,
"num_tokens": 40079394.0,
"step": 588
},
{
"epoch": 1.1900910010111223,
"grad_norm": 0.19106529653072357,
"learning_rate": 0.00028779313030477793,
"loss": 0.3112,
"mean_token_accuracy": 0.8897448740899563,
"num_tokens": 40144909.0,
"step": 589
},
{
"epoch": 1.1921132457027301,
"grad_norm": 0.17041806876659393,
"learning_rate": 0.0002874291868148642,
"loss": 0.2819,
"mean_token_accuracy": 0.8990175537765026,
"num_tokens": 40217254.0,
"step": 590
},
{
"epoch": 1.1941354903943378,
"grad_norm": 0.16470171511173248,
"learning_rate": 0.0002870649223859386,
"loss": 0.2773,
"mean_token_accuracy": 0.9041831828653812,
"num_tokens": 40280417.0,
"step": 591
},
{
"epoch": 1.1961577350859454,
"grad_norm": 0.1665530502796173,
"learning_rate": 0.00028670033875177053,
"loss": 0.2663,
"mean_token_accuracy": 0.9013455249369144,
"num_tokens": 40350231.0,
"step": 592
},
{
"epoch": 1.198179979777553,
"grad_norm": 0.19251202046871185,
"learning_rate": 0.00028633543764764894,
"loss": 0.3157,
"mean_token_accuracy": 0.8875606693327427,
"num_tokens": 40413686.0,
"step": 593
},
{
"epoch": 1.2002022244691608,
"grad_norm": 0.17525707185268402,
"learning_rate": 0.00028597022081037354,
"loss": 0.2933,
"mean_token_accuracy": 0.8971122018992901,
"num_tokens": 40479649.0,
"step": 594
},
{
"epoch": 1.2022244691607684,
"grad_norm": 0.19120153784751892,
"learning_rate": 0.000285604689978247,
"loss": 0.275,
"mean_token_accuracy": 0.8998171053826809,
"num_tokens": 40548513.0,
"step": 595
},
{
"epoch": 1.2042467138523762,
"grad_norm": 0.15362586081027985,
"learning_rate": 0.0002852388468910663,
"loss": 0.2655,
"mean_token_accuracy": 0.9043829254806042,
"num_tokens": 40621501.0,
"step": 596
},
{
"epoch": 1.2062689585439839,
"grad_norm": 0.1648460179567337,
"learning_rate": 0.00028487269329011497,
"loss": 0.2765,
"mean_token_accuracy": 0.9020786061882973,
"num_tokens": 40696483.0,
"step": 597
},
{
"epoch": 1.2082912032355915,
"grad_norm": 0.1793263554573059,
"learning_rate": 0.000284506230918154,
"loss": 0.2914,
"mean_token_accuracy": 0.8994336612522602,
"num_tokens": 40765538.0,
"step": 598
},
{
"epoch": 1.210313447927199,
"grad_norm": 0.17354300618171692,
"learning_rate": 0.00028413946151941463,
"loss": 0.2929,
"mean_token_accuracy": 0.9005281217396259,
"num_tokens": 40833551.0,
"step": 599
},
{
"epoch": 1.212335692618807,
"grad_norm": 0.1781807243824005,
"learning_rate": 0.00028377238683958885,
"loss": 0.2849,
"mean_token_accuracy": 0.8987740390002728,
"num_tokens": 40895246.0,
"step": 600
},
{
"epoch": 1.2143579373104145,
"grad_norm": 0.16701123118400574,
"learning_rate": 0.0002834050086258221,
"loss": 0.2607,
"mean_token_accuracy": 0.9041876047849655,
"num_tokens": 40964580.0,
"step": 601
},
{
"epoch": 1.2163801820020224,
"grad_norm": 0.15654708445072174,
"learning_rate": 0.00028303732862670417,
"loss": 0.2702,
"mean_token_accuracy": 0.9014758616685867,
"num_tokens": 41039130.0,
"step": 602
},
{
"epoch": 1.21840242669363,
"grad_norm": 0.18177339434623718,
"learning_rate": 0.0002826693485922616,
"loss": 0.2701,
"mean_token_accuracy": 0.9032718986272812,
"num_tokens": 41095473.0,
"step": 603
},
{
"epoch": 1.2204246713852376,
"grad_norm": 0.16560594737529755,
"learning_rate": 0.00028230107027394876,
"loss": 0.2939,
"mean_token_accuracy": 0.8934713453054428,
"num_tokens": 41157491.0,
"step": 604
},
{
"epoch": 1.2224469160768452,
"grad_norm": 0.18375754356384277,
"learning_rate": 0.00028193249542463977,
"loss": 0.2909,
"mean_token_accuracy": 0.8953644298017025,
"num_tokens": 41225218.0,
"step": 605
},
{
"epoch": 1.224469160768453,
"grad_norm": 0.14936794340610504,
"learning_rate": 0.0002815636257986204,
"loss": 0.2539,
"mean_token_accuracy": 0.9058601558208466,
"num_tokens": 41307770.0,
"step": 606
},
{
"epoch": 1.2264914054600606,
"grad_norm": 0.16326607763767242,
"learning_rate": 0.00028119446315157896,
"loss": 0.2507,
"mean_token_accuracy": 0.9078186601400375,
"num_tokens": 41371178.0,
"step": 607
},
{
"epoch": 1.2285136501516685,
"grad_norm": 0.16785994172096252,
"learning_rate": 0.0002808250092405989,
"loss": 0.2589,
"mean_token_accuracy": 0.9010850116610527,
"num_tokens": 41444090.0,
"step": 608
},
{
"epoch": 1.230535894843276,
"grad_norm": 0.17225563526153564,
"learning_rate": 0.0002804552658241496,
"loss": 0.2667,
"mean_token_accuracy": 0.9027063623070717,
"num_tokens": 41512243.0,
"step": 609
},
{
"epoch": 1.2325581395348837,
"grad_norm": 0.16818945109844208,
"learning_rate": 0.0002800852346620788,
"loss": 0.2704,
"mean_token_accuracy": 0.9012492336332798,
"num_tokens": 41582048.0,
"step": 610
},
{
"epoch": 1.2345803842264913,
"grad_norm": 0.1885753571987152,
"learning_rate": 0.00027971491751560345,
"loss": 0.2859,
"mean_token_accuracy": 0.8967389948666096,
"num_tokens": 41646351.0,
"step": 611
},
{
"epoch": 1.2366026289180991,
"grad_norm": 0.15571804344654083,
"learning_rate": 0.0002793443161473017,
"loss": 0.2707,
"mean_token_accuracy": 0.9040926285088062,
"num_tokens": 41715042.0,
"step": 612
},
{
"epoch": 1.2386248736097067,
"grad_norm": 0.1665385216474533,
"learning_rate": 0.0002789734323211048,
"loss": 0.2633,
"mean_token_accuracy": 0.9024609327316284,
"num_tokens": 41787021.0,
"step": 613
},
{
"epoch": 1.2406471183013146,
"grad_norm": 0.17233288288116455,
"learning_rate": 0.0002786022678022882,
"loss": 0.3058,
"mean_token_accuracy": 0.8898206166923046,
"num_tokens": 41851767.0,
"step": 614
},
{
"epoch": 1.2426693629929222,
"grad_norm": 0.1737981140613556,
"learning_rate": 0.0002782308243574633,
"loss": 0.2933,
"mean_token_accuracy": 0.8971287794411182,
"num_tokens": 41914797.0,
"step": 615
},
{
"epoch": 1.2446916076845298,
"grad_norm": 0.16172519326210022,
"learning_rate": 0.0002778591037545691,
"loss": 0.2665,
"mean_token_accuracy": 0.9057141467928886,
"num_tokens": 41986868.0,
"step": 616
},
{
"epoch": 1.2467138523761374,
"grad_norm": 0.15280866622924805,
"learning_rate": 0.0002774871077628639,
"loss": 0.2688,
"mean_token_accuracy": 0.9038811773061752,
"num_tokens": 42062995.0,
"step": 617
},
{
"epoch": 1.2487360970677452,
"grad_norm": 0.17397160828113556,
"learning_rate": 0.0002771148381529166,
"loss": 0.2863,
"mean_token_accuracy": 0.8941488154232502,
"num_tokens": 42124939.0,
"step": 618
},
{
"epoch": 1.2507583417593529,
"grad_norm": 0.1617380529642105,
"learning_rate": 0.00027674229669659856,
"loss": 0.2536,
"mean_token_accuracy": 0.9045982100069523,
"num_tokens": 42194011.0,
"step": 619
},
{
"epoch": 1.2527805864509607,
"grad_norm": 0.15885986387729645,
"learning_rate": 0.0002763694851670749,
"loss": 0.2703,
"mean_token_accuracy": 0.9061401709914207,
"num_tokens": 42265919.0,
"step": 620
},
{
"epoch": 1.2548028311425683,
"grad_norm": 0.16419966518878937,
"learning_rate": 0.00027599640533879636,
"loss": 0.2769,
"mean_token_accuracy": 0.9034353755414486,
"num_tokens": 42334638.0,
"step": 621
},
{
"epoch": 1.256825075834176,
"grad_norm": 0.16629813611507416,
"learning_rate": 0.0002756230589874905,
"loss": 0.2687,
"mean_token_accuracy": 0.9030461423099041,
"num_tokens": 42404575.0,
"step": 622
},
{
"epoch": 1.2588473205257835,
"grad_norm": 0.17728988826274872,
"learning_rate": 0.00027524944789015366,
"loss": 0.2751,
"mean_token_accuracy": 0.9014569260179996,
"num_tokens": 42475814.0,
"step": 623
},
{
"epoch": 1.2608695652173914,
"grad_norm": 0.17427091300487518,
"learning_rate": 0.00027487557382504195,
"loss": 0.2657,
"mean_token_accuracy": 0.9044037610292435,
"num_tokens": 42543660.0,
"step": 624
},
{
"epoch": 1.262891809908999,
"grad_norm": 0.1894424855709076,
"learning_rate": 0.00027450143857166344,
"loss": 0.2969,
"mean_token_accuracy": 0.8965917490422726,
"num_tokens": 42607124.0,
"step": 625
},
{
"epoch": 1.2649140546006068,
"grad_norm": 0.15993963181972504,
"learning_rate": 0.00027412704391076916,
"loss": 0.2782,
"mean_token_accuracy": 0.9031428508460522,
"num_tokens": 42676066.0,
"step": 626
},
{
"epoch": 1.2669362992922144,
"grad_norm": 0.17840322852134705,
"learning_rate": 0.00027375239162434503,
"loss": 0.2688,
"mean_token_accuracy": 0.9015723317861557,
"num_tokens": 42746212.0,
"step": 627
},
{
"epoch": 1.268958543983822,
"grad_norm": 0.20184557139873505,
"learning_rate": 0.00027337748349560276,
"loss": 0.2963,
"mean_token_accuracy": 0.8969193771481514,
"num_tokens": 42803557.0,
"step": 628
},
{
"epoch": 1.2709807886754296,
"grad_norm": 0.16635443270206451,
"learning_rate": 0.0002730023213089724,
"loss": 0.2884,
"mean_token_accuracy": 0.8960177823901176,
"num_tokens": 42866158.0,
"step": 629
},
{
"epoch": 1.2730030333670375,
"grad_norm": 0.19960255920886993,
"learning_rate": 0.0002726269068500926,
"loss": 0.2841,
"mean_token_accuracy": 0.8968143723905087,
"num_tokens": 42927025.0,
"step": 630
},
{
"epoch": 1.275025278058645,
"grad_norm": 0.1719711273908615,
"learning_rate": 0.0002722512419058032,
"loss": 0.2728,
"mean_token_accuracy": 0.9018568396568298,
"num_tokens": 43007744.0,
"step": 631
},
{
"epoch": 1.277047522750253,
"grad_norm": 0.17668215930461884,
"learning_rate": 0.00027187532826413607,
"loss": 0.2683,
"mean_token_accuracy": 0.9023380614817142,
"num_tokens": 43071417.0,
"step": 632
},
{
"epoch": 1.2790697674418605,
"grad_norm": 0.17645464837551117,
"learning_rate": 0.00027149916771430677,
"loss": 0.2787,
"mean_token_accuracy": 0.9030827060341835,
"num_tokens": 43143504.0,
"step": 633
},
{
"epoch": 1.2810920121334681,
"grad_norm": 0.18298184871673584,
"learning_rate": 0.00027112276204670617,
"loss": 0.2886,
"mean_token_accuracy": 0.8980408012866974,
"num_tokens": 43219433.0,
"step": 634
},
{
"epoch": 1.2831142568250757,
"grad_norm": 0.15996871888637543,
"learning_rate": 0.00027074611305289147,
"loss": 0.2622,
"mean_token_accuracy": 0.902827687561512,
"num_tokens": 43286472.0,
"step": 635
},
{
"epoch": 1.2851365015166836,
"grad_norm": 0.1937190294265747,
"learning_rate": 0.00027036922252557865,
"loss": 0.2937,
"mean_token_accuracy": 0.897728331387043,
"num_tokens": 43346390.0,
"step": 636
},
{
"epoch": 1.2871587462082912,
"grad_norm": 0.17584164440631866,
"learning_rate": 0.00026999209225863263,
"loss": 0.2896,
"mean_token_accuracy": 0.897246178239584,
"num_tokens": 43413853.0,
"step": 637
},
{
"epoch": 1.289180990899899,
"grad_norm": 0.17733249068260193,
"learning_rate": 0.0002696147240470598,
"loss": 0.2882,
"mean_token_accuracy": 0.8957457803189754,
"num_tokens": 43478722.0,
"step": 638
},
{
"epoch": 1.2912032355915066,
"grad_norm": 0.17890246212482452,
"learning_rate": 0.0002692371196869992,
"loss": 0.288,
"mean_token_accuracy": 0.8960468098521233,
"num_tokens": 43540378.0,
"step": 639
},
{
"epoch": 1.2932254802831142,
"grad_norm": 0.15859632194042206,
"learning_rate": 0.0002688592809757134,
"loss": 0.2792,
"mean_token_accuracy": 0.9036918766796589,
"num_tokens": 43612284.0,
"step": 640
},
{
"epoch": 1.2952477249747218,
"grad_norm": 0.16566091775894165,
"learning_rate": 0.0002684812097115808,
"loss": 0.2785,
"mean_token_accuracy": 0.9012075029313564,
"num_tokens": 43677352.0,
"step": 641
},
{
"epoch": 1.2972699696663297,
"grad_norm": 0.17786841094493866,
"learning_rate": 0.0002681029076940862,
"loss": 0.2911,
"mean_token_accuracy": 0.9009424708783627,
"num_tokens": 43739163.0,
"step": 642
},
{
"epoch": 1.2992922143579373,
"grad_norm": 0.15567278861999512,
"learning_rate": 0.0002677243767238135,
"loss": 0.2591,
"mean_token_accuracy": 0.9091448336839676,
"num_tokens": 43819970.0,
"step": 643
},
{
"epoch": 1.3013144590495451,
"grad_norm": 0.20501317083835602,
"learning_rate": 0.00026734561860243544,
"loss": 0.3186,
"mean_token_accuracy": 0.8898426368832588,
"num_tokens": 43879943.0,
"step": 644
},
{
"epoch": 1.3033367037411527,
"grad_norm": 0.18259315192699432,
"learning_rate": 0.0002669666351327066,
"loss": 0.2772,
"mean_token_accuracy": 0.8982793055474758,
"num_tokens": 43941000.0,
"step": 645
},
{
"epoch": 1.3053589484327603,
"grad_norm": 0.18504492938518524,
"learning_rate": 0.00026658742811845376,
"loss": 0.2905,
"mean_token_accuracy": 0.896319292485714,
"num_tokens": 44000567.0,
"step": 646
},
{
"epoch": 1.307381193124368,
"grad_norm": 0.17783911526203156,
"learning_rate": 0.00026620799936456774,
"loss": 0.2813,
"mean_token_accuracy": 0.9009971134364605,
"num_tokens": 44071352.0,
"step": 647
},
{
"epoch": 1.3094034378159758,
"grad_norm": 0.21716438233852386,
"learning_rate": 0.00026582835067699495,
"loss": 0.2906,
"mean_token_accuracy": 0.8958504274487495,
"num_tokens": 44129790.0,
"step": 648
},
{
"epoch": 1.3114256825075834,
"grad_norm": 0.1822315752506256,
"learning_rate": 0.0002654484838627284,
"loss": 0.2867,
"mean_token_accuracy": 0.9037492237985134,
"num_tokens": 44195417.0,
"step": 649
},
{
"epoch": 1.3134479271991912,
"grad_norm": 0.15820986032485962,
"learning_rate": 0.00026506840072979947,
"loss": 0.2546,
"mean_token_accuracy": 0.9098224155604839,
"num_tokens": 44273153.0,
"step": 650
},
{
"epoch": 1.3154701718907988,
"grad_norm": 0.1899651139974594,
"learning_rate": 0.00026468810308726893,
"loss": 0.28,
"mean_token_accuracy": 0.8995106518268585,
"num_tokens": 44349738.0,
"step": 651
},
{
"epoch": 1.3174924165824065,
"grad_norm": 0.18798086047172546,
"learning_rate": 0.00026430759274521877,
"loss": 0.2964,
"mean_token_accuracy": 0.8899718299508095,
"num_tokens": 44415133.0,
"step": 652
},
{
"epoch": 1.319514661274014,
"grad_norm": 0.13753436505794525,
"learning_rate": 0.0002639268715147432,
"loss": 0.2307,
"mean_token_accuracy": 0.9101770743727684,
"num_tokens": 44484697.0,
"step": 653
},
{
"epoch": 1.321536905965622,
"grad_norm": 0.20119944214820862,
"learning_rate": 0.00026354594120794016,
"loss": 0.2926,
"mean_token_accuracy": 0.897066742181778,
"num_tokens": 44551987.0,
"step": 654
},
{
"epoch": 1.3235591506572295,
"grad_norm": 0.18725383281707764,
"learning_rate": 0.000263164803637903,
"loss": 0.2742,
"mean_token_accuracy": 0.9033515900373459,
"num_tokens": 44617511.0,
"step": 655
},
{
"epoch": 1.3255813953488373,
"grad_norm": 0.15222612023353577,
"learning_rate": 0.0002627834606187112,
"loss": 0.2518,
"mean_token_accuracy": 0.9108999036252499,
"num_tokens": 44698150.0,
"step": 656
},
{
"epoch": 1.327603640040445,
"grad_norm": 0.16968220472335815,
"learning_rate": 0.0002624019139654223,
"loss": 0.2834,
"mean_token_accuracy": 0.9003202244639397,
"num_tokens": 44769993.0,
"step": 657
},
{
"epoch": 1.3296258847320526,
"grad_norm": 0.1526424139738083,
"learning_rate": 0.000262020165494063,
"loss": 0.2493,
"mean_token_accuracy": 0.9069892205297947,
"num_tokens": 44848710.0,
"step": 658
},
{
"epoch": 1.3316481294236602,
"grad_norm": 0.16174714267253876,
"learning_rate": 0.00026163821702162074,
"loss": 0.2581,
"mean_token_accuracy": 0.9058538265526295,
"num_tokens": 44932916.0,
"step": 659
},
{
"epoch": 1.333670374115268,
"grad_norm": 0.18540237843990326,
"learning_rate": 0.0002612560703660346,
"loss": 0.2823,
"mean_token_accuracy": 0.9005630798637867,
"num_tokens": 44997865.0,
"step": 660
},
{
"epoch": 1.3356926188068756,
"grad_norm": 0.145268976688385,
"learning_rate": 0.0002608737273461872,
"loss": 0.2402,
"mean_token_accuracy": 0.9093809016048908,
"num_tokens": 45074165.0,
"step": 661
},
{
"epoch": 1.3377148634984835,
"grad_norm": 0.16983529925346375,
"learning_rate": 0.0002604911897818957,
"loss": 0.2763,
"mean_token_accuracy": 0.9002145752310753,
"num_tokens": 45140578.0,
"step": 662
},
{
"epoch": 1.339737108190091,
"grad_norm": 0.18206650018692017,
"learning_rate": 0.00026010845949390326,
"loss": 0.271,
"mean_token_accuracy": 0.9040128998458385,
"num_tokens": 45206573.0,
"step": 663
},
{
"epoch": 1.3417593528816987,
"grad_norm": 0.17423690855503082,
"learning_rate": 0.00025972553830387027,
"loss": 0.276,
"mean_token_accuracy": 0.9035660028457642,
"num_tokens": 45273772.0,
"step": 664
},
{
"epoch": 1.3437815975733063,
"grad_norm": 0.17948757112026215,
"learning_rate": 0.0002593424280343656,
"loss": 0.3073,
"mean_token_accuracy": 0.8898307755589485,
"num_tokens": 45333260.0,
"step": 665
},
{
"epoch": 1.3458038422649141,
"grad_norm": 0.1973046064376831,
"learning_rate": 0.0002589591305088585,
"loss": 0.298,
"mean_token_accuracy": 0.8946604765951633,
"num_tokens": 45397184.0,
"step": 666
},
{
"epoch": 1.3478260869565217,
"grad_norm": 0.16013695299625397,
"learning_rate": 0.0002585756475517092,
"loss": 0.2698,
"mean_token_accuracy": 0.905727930366993,
"num_tokens": 45478638.0,
"step": 667
},
{
"epoch": 1.3498483316481296,
"grad_norm": 0.1567625254392624,
"learning_rate": 0.00025819198098816034,
"loss": 0.2765,
"mean_token_accuracy": 0.9000396579504013,
"num_tokens": 45548715.0,
"step": 668
},
{
"epoch": 1.3518705763397372,
"grad_norm": 0.16354252398014069,
"learning_rate": 0.00025780813264432884,
"loss": 0.2659,
"mean_token_accuracy": 0.9028089232742786,
"num_tokens": 45624018.0,
"step": 669
},
{
"epoch": 1.3538928210313448,
"grad_norm": 0.19890683889389038,
"learning_rate": 0.0002574241043471967,
"loss": 0.3082,
"mean_token_accuracy": 0.89163389056921,
"num_tokens": 45692190.0,
"step": 670
},
{
"epoch": 1.3559150657229524,
"grad_norm": 0.1480788290500641,
"learning_rate": 0.0002570398979246023,
"loss": 0.2605,
"mean_token_accuracy": 0.905091181397438,
"num_tokens": 45771127.0,
"step": 671
},
{
"epoch": 1.3579373104145602,
"grad_norm": 0.17679338157176971,
"learning_rate": 0.00025665551520523194,
"loss": 0.2831,
"mean_token_accuracy": 0.8965117931365967,
"num_tokens": 45835910.0,
"step": 672
},
{
"epoch": 1.3599595551061678,
"grad_norm": 0.17713719606399536,
"learning_rate": 0.00025627095801861107,
"loss": 0.2905,
"mean_token_accuracy": 0.8971158005297184,
"num_tokens": 45901225.0,
"step": 673
},
{
"epoch": 1.3619817997977754,
"grad_norm": 0.17695656418800354,
"learning_rate": 0.0002558862281950955,
"loss": 0.3268,
"mean_token_accuracy": 0.8890945613384247,
"num_tokens": 45972893.0,
"step": 674
},
{
"epoch": 1.3640040444893833,
"grad_norm": 0.15022985637187958,
"learning_rate": 0.0002555013275658627,
"loss": 0.28,
"mean_token_accuracy": 0.9022598974406719,
"num_tokens": 46053862.0,
"step": 675
},
{
"epoch": 1.366026289180991,
"grad_norm": 0.16728746891021729,
"learning_rate": 0.0002551162579629031,
"loss": 0.2735,
"mean_token_accuracy": 0.9004092961549759,
"num_tokens": 46123535.0,
"step": 676
},
{
"epoch": 1.3680485338725985,
"grad_norm": 0.17287185788154602,
"learning_rate": 0.0002547310212190115,
"loss": 0.2803,
"mean_token_accuracy": 0.8980144336819649,
"num_tokens": 46193498.0,
"step": 677
},
{
"epoch": 1.3700707785642063,
"grad_norm": 0.184726744890213,
"learning_rate": 0.0002543456191677781,
"loss": 0.2927,
"mean_token_accuracy": 0.8962498530745506,
"num_tokens": 46261698.0,
"step": 678
},
{
"epoch": 1.372093023255814,
"grad_norm": 0.15757699310779572,
"learning_rate": 0.00025396005364357994,
"loss": 0.2809,
"mean_token_accuracy": 0.8978969343006611,
"num_tokens": 46329372.0,
"step": 679
},
{
"epoch": 1.3741152679474216,
"grad_norm": 0.18496832251548767,
"learning_rate": 0.0002535743264815723,
"loss": 0.2948,
"mean_token_accuracy": 0.8964893855154514,
"num_tokens": 46389989.0,
"step": 680
},
{
"epoch": 1.3761375126390294,
"grad_norm": 0.19771555066108704,
"learning_rate": 0.0002531884395176794,
"loss": 0.3045,
"mean_token_accuracy": 0.8947297558188438,
"num_tokens": 46451529.0,
"step": 681
},
{
"epoch": 1.378159757330637,
"grad_norm": 0.1643752008676529,
"learning_rate": 0.0002528023945885866,
"loss": 0.2691,
"mean_token_accuracy": 0.9002487845718861,
"num_tokens": 46518234.0,
"step": 682
},
{
"epoch": 1.3801820020222446,
"grad_norm": 0.15709805488586426,
"learning_rate": 0.00025241619353173056,
"loss": 0.2517,
"mean_token_accuracy": 0.9091945327818394,
"num_tokens": 46590312.0,
"step": 683
},
{
"epoch": 1.3822042467138524,
"grad_norm": 0.17834722995758057,
"learning_rate": 0.00025202983818529154,
"loss": 0.294,
"mean_token_accuracy": 0.8986290767788887,
"num_tokens": 46658404.0,
"step": 684
},
{
"epoch": 1.38422649140546,
"grad_norm": 0.15814678370952606,
"learning_rate": 0.00025164333038818384,
"loss": 0.2708,
"mean_token_accuracy": 0.9031675830483437,
"num_tokens": 46724887.0,
"step": 685
},
{
"epoch": 1.3862487360970677,
"grad_norm": 0.17998504638671875,
"learning_rate": 0.0002512566719800475,
"loss": 0.2856,
"mean_token_accuracy": 0.89876314625144,
"num_tokens": 46795038.0,
"step": 686
},
{
"epoch": 1.3882709807886755,
"grad_norm": 0.17202328145503998,
"learning_rate": 0.0002508698648012394,
"loss": 0.2965,
"mean_token_accuracy": 0.8947253711521626,
"num_tokens": 46856174.0,
"step": 687
},
{
"epoch": 1.3902932254802831,
"grad_norm": 0.16402584314346313,
"learning_rate": 0.00025048291069282443,
"loss": 0.2633,
"mean_token_accuracy": 0.9063729159533978,
"num_tokens": 46925752.0,
"step": 688
},
{
"epoch": 1.3923154701718907,
"grad_norm": 0.19435186684131622,
"learning_rate": 0.00025009581149656703,
"loss": 0.2756,
"mean_token_accuracy": 0.9030190780758858,
"num_tokens": 46993260.0,
"step": 689
},
{
"epoch": 1.3943377148634986,
"grad_norm": 0.18806155025959015,
"learning_rate": 0.000249708569054922,
"loss": 0.3033,
"mean_token_accuracy": 0.896921843290329,
"num_tokens": 47060294.0,
"step": 690
},
{
"epoch": 1.3963599595551062,
"grad_norm": 0.19206839799880981,
"learning_rate": 0.000249321185211026,
"loss": 0.282,
"mean_token_accuracy": 0.8990140780806541,
"num_tokens": 47123248.0,
"step": 691
},
{
"epoch": 1.3983822042467138,
"grad_norm": 0.16943977773189545,
"learning_rate": 0.00024893366180868875,
"loss": 0.2728,
"mean_token_accuracy": 0.9020564220845699,
"num_tokens": 47185179.0,
"step": 692
},
{
"epoch": 1.4004044489383216,
"grad_norm": 0.1619652956724167,
"learning_rate": 0.00024854600069238407,
"loss": 0.2728,
"mean_token_accuracy": 0.9024368785321712,
"num_tokens": 47259239.0,
"step": 693
},
{
"epoch": 1.4024266936299292,
"grad_norm": 0.17677046358585358,
"learning_rate": 0.00024815820370724156,
"loss": 0.2697,
"mean_token_accuracy": 0.90378213301301,
"num_tokens": 47322333.0,
"step": 694
},
{
"epoch": 1.4044489383215368,
"grad_norm": 0.15612858533859253,
"learning_rate": 0.0002477702726990372,
"loss": 0.2826,
"mean_token_accuracy": 0.9020431824028492,
"num_tokens": 47391001.0,
"step": 695
},
{
"epoch": 1.4064711830131447,
"grad_norm": 0.16640524566173553,
"learning_rate": 0.000247382209514185,
"loss": 0.2948,
"mean_token_accuracy": 0.8942111246287823,
"num_tokens": 47455737.0,
"step": 696
},
{
"epoch": 1.4084934277047523,
"grad_norm": 0.16898459196090698,
"learning_rate": 0.0002469940159997281,
"loss": 0.2687,
"mean_token_accuracy": 0.9056588634848595,
"num_tokens": 47525615.0,
"step": 697
},
{
"epoch": 1.4105156723963599,
"grad_norm": 0.18844769895076752,
"learning_rate": 0.00024660569400332996,
"loss": 0.2946,
"mean_token_accuracy": 0.895747821778059,
"num_tokens": 47592079.0,
"step": 698
},
{
"epoch": 1.4125379170879677,
"grad_norm": 0.16074754297733307,
"learning_rate": 0.00024621724537326545,
"loss": 0.2831,
"mean_token_accuracy": 0.9034741893410683,
"num_tokens": 47667233.0,
"step": 699
},
{
"epoch": 1.4145601617795753,
"grad_norm": 0.16710326075553894,
"learning_rate": 0.00024582867195841227,
"loss": 0.2863,
"mean_token_accuracy": 0.9007730670273304,
"num_tokens": 47743310.0,
"step": 700
},
{
"epoch": 1.416582406471183,
"grad_norm": 0.18456129729747772,
"learning_rate": 0.0002454399756082422,
"loss": 0.2765,
"mean_token_accuracy": 0.8989297412335873,
"num_tokens": 47804656.0,
"step": 701
},
{
"epoch": 1.4186046511627908,
"grad_norm": 0.14485791325569153,
"learning_rate": 0.0002450511581728118,
"loss": 0.2378,
"mean_token_accuracy": 0.9135924205183983,
"num_tokens": 47877505.0,
"step": 702
},
{
"epoch": 1.4206268958543984,
"grad_norm": 0.16109082102775574,
"learning_rate": 0.00024466222150275427,
"loss": 0.2701,
"mean_token_accuracy": 0.9057381004095078,
"num_tokens": 47947797.0,
"step": 703
},
{
"epoch": 1.422649140546006,
"grad_norm": 0.17397062480449677,
"learning_rate": 0.00024427316744927015,
"loss": 0.2748,
"mean_token_accuracy": 0.9010849967598915,
"num_tokens": 48013032.0,
"step": 704
},
{
"epoch": 1.4246713852376138,
"grad_norm": 0.17228464782238007,
"learning_rate": 0.0002438839978641188,
"loss": 0.2902,
"mean_token_accuracy": 0.8968134559690952,
"num_tokens": 48077137.0,
"step": 705
},
{
"epoch": 1.4266936299292214,
"grad_norm": 0.15708769857883453,
"learning_rate": 0.00024349471459960933,
"loss": 0.2639,
"mean_token_accuracy": 0.9076020307838917,
"num_tokens": 48148193.0,
"step": 706
},
{
"epoch": 1.428715874620829,
"grad_norm": 0.16323234140872955,
"learning_rate": 0.000243105319508592,
"loss": 0.2767,
"mean_token_accuracy": 0.9031167514622211,
"num_tokens": 48216944.0,
"step": 707
},
{
"epoch": 1.4307381193124369,
"grad_norm": 0.19718225300312042,
"learning_rate": 0.00024271581444444936,
"loss": 0.2857,
"mean_token_accuracy": 0.8991989493370056,
"num_tokens": 48289278.0,
"step": 708
},
{
"epoch": 1.4327603640040445,
"grad_norm": 0.18652518093585968,
"learning_rate": 0.0002423262012610874,
"loss": 0.2761,
"mean_token_accuracy": 0.8964316956698895,
"num_tokens": 48356711.0,
"step": 709
},
{
"epoch": 1.434782608695652,
"grad_norm": 0.15871575474739075,
"learning_rate": 0.00024193648181292657,
"loss": 0.2667,
"mean_token_accuracy": 0.9004132300615311,
"num_tokens": 48431698.0,
"step": 710
},
{
"epoch": 1.43680485338726,
"grad_norm": 0.1658415049314499,
"learning_rate": 0.00024154665795489324,
"loss": 0.2923,
"mean_token_accuracy": 0.8983742482960224,
"num_tokens": 48499782.0,
"step": 711
},
{
"epoch": 1.4388270980788676,
"grad_norm": 0.14790105819702148,
"learning_rate": 0.00024115673154241082,
"loss": 0.2752,
"mean_token_accuracy": 0.9012794457376003,
"num_tokens": 48575015.0,
"step": 712
},
{
"epoch": 1.4408493427704752,
"grad_norm": 0.1578913778066635,
"learning_rate": 0.00024076670443139056,
"loss": 0.2717,
"mean_token_accuracy": 0.9049608968198299,
"num_tokens": 48645644.0,
"step": 713
},
{
"epoch": 1.442871587462083,
"grad_norm": 0.14726778864860535,
"learning_rate": 0.00024037657847822327,
"loss": 0.2472,
"mean_token_accuracy": 0.9099989496171474,
"num_tokens": 48721939.0,
"step": 714
},
{
"epoch": 1.4448938321536906,
"grad_norm": 0.1682555377483368,
"learning_rate": 0.00023998635553977,
"loss": 0.255,
"mean_token_accuracy": 0.9088139645755291,
"num_tokens": 48781700.0,
"step": 715
},
{
"epoch": 1.4469160768452982,
"grad_norm": 0.1937257945537567,
"learning_rate": 0.00023959603747335364,
"loss": 0.2787,
"mean_token_accuracy": 0.9022819362580776,
"num_tokens": 48848209.0,
"step": 716
},
{
"epoch": 1.448938321536906,
"grad_norm": 0.18163816630840302,
"learning_rate": 0.0002392056261367497,
"loss": 0.2603,
"mean_token_accuracy": 0.9066541865468025,
"num_tokens": 48908683.0,
"step": 717
},
{
"epoch": 1.4509605662285137,
"grad_norm": 0.17626726627349854,
"learning_rate": 0.00023881512338817763,
"loss": 0.2719,
"mean_token_accuracy": 0.9030824415385723,
"num_tokens": 48971539.0,
"step": 718
},
{
"epoch": 1.4529828109201213,
"grad_norm": 0.19325651228427887,
"learning_rate": 0.00023842453108629207,
"loss": 0.2825,
"mean_token_accuracy": 0.9008334875106812,
"num_tokens": 49036641.0,
"step": 719
},
{
"epoch": 1.4550050556117289,
"grad_norm": 0.15112407505512238,
"learning_rate": 0.00023803385109017375,
"loss": 0.2491,
"mean_token_accuracy": 0.908204834908247,
"num_tokens": 49116609.0,
"step": 720
},
{
"epoch": 1.4570273003033367,
"grad_norm": 0.1619442254304886,
"learning_rate": 0.000237643085259321,
"loss": 0.2674,
"mean_token_accuracy": 0.9027148932218552,
"num_tokens": 49184904.0,
"step": 721
},
{
"epoch": 1.4590495449949443,
"grad_norm": 0.18082739412784576,
"learning_rate": 0.00023725223545364036,
"loss": 0.2897,
"mean_token_accuracy": 0.8995592929422855,
"num_tokens": 49242882.0,
"step": 722
},
{
"epoch": 1.4610717896865522,
"grad_norm": 0.16797882318496704,
"learning_rate": 0.00023686130353343842,
"loss": 0.2752,
"mean_token_accuracy": 0.9008001163601875,
"num_tokens": 49314113.0,
"step": 723
},
{
"epoch": 1.4630940343781598,
"grad_norm": 0.16804397106170654,
"learning_rate": 0.00023647029135941247,
"loss": 0.28,
"mean_token_accuracy": 0.9004204832017422,
"num_tokens": 49380492.0,
"step": 724
},
{
"epoch": 1.4651162790697674,
"grad_norm": 0.189345121383667,
"learning_rate": 0.00023607920079264164,
"loss": 0.3136,
"mean_token_accuracy": 0.8898900300264359,
"num_tokens": 49442489.0,
"step": 725
},
{
"epoch": 1.467138523761375,
"grad_norm": 0.1601288765668869,
"learning_rate": 0.0002356880336945785,
"loss": 0.2766,
"mean_token_accuracy": 0.8993977271020412,
"num_tokens": 49515310.0,
"step": 726
},
{
"epoch": 1.4691607684529828,
"grad_norm": 0.16616767644882202,
"learning_rate": 0.00023529679192703956,
"loss": 0.2233,
"mean_token_accuracy": 0.9060333073139191,
"num_tokens": 49579141.0,
"step": 727
},
{
"epoch": 1.4711830131445904,
"grad_norm": 0.17813973128795624,
"learning_rate": 0.00023490547735219682,
"loss": 0.2772,
"mean_token_accuracy": 0.902538850903511,
"num_tokens": 49651616.0,
"step": 728
},
{
"epoch": 1.4732052578361983,
"grad_norm": 0.16227717697620392,
"learning_rate": 0.0002345140918325689,
"loss": 0.2725,
"mean_token_accuracy": 0.9031726457178593,
"num_tokens": 49723462.0,
"step": 729
},
{
"epoch": 1.4752275025278059,
"grad_norm": 0.17003865540027618,
"learning_rate": 0.00023412263723101214,
"loss": 0.2961,
"mean_token_accuracy": 0.8977791368961334,
"num_tokens": 49787491.0,
"step": 730
},
{
"epoch": 1.4772497472194135,
"grad_norm": 0.16923342645168304,
"learning_rate": 0.0002337311154107115,
"loss": 0.2787,
"mean_token_accuracy": 0.9015961550176144,
"num_tokens": 49854833.0,
"step": 731
},
{
"epoch": 1.479271991911021,
"grad_norm": 0.1851927489042282,
"learning_rate": 0.00023333952823517194,
"loss": 0.2898,
"mean_token_accuracy": 0.8972079865634441,
"num_tokens": 49922341.0,
"step": 732
},
{
"epoch": 1.481294236602629,
"grad_norm": 0.1822906881570816,
"learning_rate": 0.0002329478775682095,
"loss": 0.2829,
"mean_token_accuracy": 0.900902509689331,
"num_tokens": 49979729.0,
"step": 733
},
{
"epoch": 1.4833164812942365,
"grad_norm": 0.1649109125137329,
"learning_rate": 0.00023255616527394256,
"loss": 0.2727,
"mean_token_accuracy": 0.9016978107392788,
"num_tokens": 50047775.0,
"step": 734
},
{
"epoch": 1.4853387259858444,
"grad_norm": 0.1738775372505188,
"learning_rate": 0.00023216439321678266,
"loss": 0.281,
"mean_token_accuracy": 0.9027018882334232,
"num_tokens": 50118326.0,
"step": 735
},
{
"epoch": 1.487360970677452,
"grad_norm": 0.1651855707168579,
"learning_rate": 0.00023177256326142577,
"loss": 0.2885,
"mean_token_accuracy": 0.9000568836927414,
"num_tokens": 50188336.0,
"step": 736
},
{
"epoch": 1.4893832153690596,
"grad_norm": 0.17814993858337402,
"learning_rate": 0.00023138067727284352,
"loss": 0.2649,
"mean_token_accuracy": 0.9053604751825333,
"num_tokens": 50253602.0,
"step": 737
},
{
"epoch": 1.4914054600606672,
"grad_norm": 0.18156695365905762,
"learning_rate": 0.00023098873711627427,
"loss": 0.2789,
"mean_token_accuracy": 0.9026945792138577,
"num_tokens": 50320254.0,
"step": 738
},
{
"epoch": 1.493427704752275,
"grad_norm": 0.1529979407787323,
"learning_rate": 0.00023059674465721402,
"loss": 0.2575,
"mean_token_accuracy": 0.9098235592246056,
"num_tokens": 50394210.0,
"step": 739
},
{
"epoch": 1.4954499494438827,
"grad_norm": 0.18546129763126373,
"learning_rate": 0.000230204701761408,
"loss": 0.2723,
"mean_token_accuracy": 0.9047368690371513,
"num_tokens": 50462482.0,
"step": 740
},
{
"epoch": 1.4974721941354905,
"grad_norm": 0.17348864674568176,
"learning_rate": 0.00022981261029484117,
"loss": 0.2877,
"mean_token_accuracy": 0.9010139890015125,
"num_tokens": 50533752.0,
"step": 741
},
{
"epoch": 1.499494438827098,
"grad_norm": 0.18445433676242828,
"learning_rate": 0.00022942047212372996,
"loss": 0.2889,
"mean_token_accuracy": 0.8973320014774799,
"num_tokens": 50595611.0,
"step": 742
},
{
"epoch": 1.5015166835187057,
"grad_norm": 0.1771615445613861,
"learning_rate": 0.00022902828911451284,
"loss": 0.2869,
"mean_token_accuracy": 0.9018849320709705,
"num_tokens": 50660163.0,
"step": 743
},
{
"epoch": 1.5035389282103133,
"grad_norm": 0.17673981189727783,
"learning_rate": 0.00022863606313384193,
"loss": 0.2745,
"mean_token_accuracy": 0.9061728455126286,
"num_tokens": 50735476.0,
"step": 744
},
{
"epoch": 1.5055611729019212,
"grad_norm": 0.16728192567825317,
"learning_rate": 0.00022824379604857376,
"loss": 0.27,
"mean_token_accuracy": 0.8988127410411835,
"num_tokens": 50802788.0,
"step": 745
},
{
"epoch": 1.5075834175935288,
"grad_norm": 0.15720367431640625,
"learning_rate": 0.0002278514897257605,
"loss": 0.2768,
"mean_token_accuracy": 0.903729647397995,
"num_tokens": 50871752.0,
"step": 746
},
{
"epoch": 1.5096056622851366,
"grad_norm": 0.1581096202135086,
"learning_rate": 0.00022745914603264114,
"loss": 0.2782,
"mean_token_accuracy": 0.9031247049570084,
"num_tokens": 50946163.0,
"step": 747
},
{
"epoch": 1.5116279069767442,
"grad_norm": 0.16542676091194153,
"learning_rate": 0.00022706676683663239,
"loss": 0.2615,
"mean_token_accuracy": 0.9070020318031311,
"num_tokens": 51020476.0,
"step": 748
},
{
"epoch": 1.5136501516683518,
"grad_norm": 0.15188099443912506,
"learning_rate": 0.00022667435400532013,
"loss": 0.2683,
"mean_token_accuracy": 0.9043072015047073,
"num_tokens": 51099534.0,
"step": 749
},
{
"epoch": 1.5156723963599594,
"grad_norm": 0.16521647572517395,
"learning_rate": 0.00022628190940645023,
"loss": 0.2762,
"mean_token_accuracy": 0.9001554064452648,
"num_tokens": 51160512.0,
"step": 750
},
{
"epoch": 1.5176946410515673,
"grad_norm": 0.14251260459423065,
"learning_rate": 0.00022588943490791974,
"loss": 0.2354,
"mean_token_accuracy": 0.9080785401165485,
"num_tokens": 51240154.0,
"step": 751
},
{
"epoch": 1.5197168857431749,
"grad_norm": 0.18312643468379974,
"learning_rate": 0.00022549693237776812,
"loss": 0.2882,
"mean_token_accuracy": 0.896622322499752,
"num_tokens": 51306825.0,
"step": 752
},
{
"epoch": 1.5217391304347827,
"grad_norm": 0.1863006204366684,
"learning_rate": 0.00022510440368416813,
"loss": 0.2827,
"mean_token_accuracy": 0.9015981592237949,
"num_tokens": 51374019.0,
"step": 753
},
{
"epoch": 1.5237613751263903,
"grad_norm": 0.2003999501466751,
"learning_rate": 0.0002247118506954172,
"loss": 0.2999,
"mean_token_accuracy": 0.8948666267096996,
"num_tokens": 51437280.0,
"step": 754
},
{
"epoch": 1.525783619817998,
"grad_norm": 0.15196073055267334,
"learning_rate": 0.00022431927527992822,
"loss": 0.2457,
"mean_token_accuracy": 0.9064719304442406,
"num_tokens": 51516774.0,
"step": 755
},
{
"epoch": 1.5278058645096055,
"grad_norm": 0.16066138446331024,
"learning_rate": 0.00022392667930622105,
"loss": 0.2567,
"mean_token_accuracy": 0.9101277217268944,
"num_tokens": 51587203.0,
"step": 756
},
{
"epoch": 1.5298281092012134,
"grad_norm": 0.2019067108631134,
"learning_rate": 0.0002235340646429131,
"loss": 0.288,
"mean_token_accuracy": 0.8997247666120529,
"num_tokens": 51647601.0,
"step": 757
},
{
"epoch": 1.531850353892821,
"grad_norm": 0.1630539447069168,
"learning_rate": 0.00022314143315871107,
"loss": 0.2839,
"mean_token_accuracy": 0.9003589190542698,
"num_tokens": 51709791.0,
"step": 758
},
{
"epoch": 1.5338725985844288,
"grad_norm": 0.17768684029579163,
"learning_rate": 0.0002227487867224014,
"loss": 0.2953,
"mean_token_accuracy": 0.8950943425297737,
"num_tokens": 51775485.0,
"step": 759
},
{
"epoch": 1.5358948432760364,
"grad_norm": 0.16720645129680634,
"learning_rate": 0.000222356127202842,
"loss": 0.268,
"mean_token_accuracy": 0.9044617936015129,
"num_tokens": 51840213.0,
"step": 760
},
{
"epoch": 1.537917087967644,
"grad_norm": 0.18721389770507812,
"learning_rate": 0.00022196345646895282,
"loss": 0.3132,
"mean_token_accuracy": 0.8925869949162006,
"num_tokens": 51902000.0,
"step": 761
},
{
"epoch": 1.5399393326592516,
"grad_norm": 0.16676832735538483,
"learning_rate": 0.00022157077638970733,
"loss": 0.2685,
"mean_token_accuracy": 0.9057548753917217,
"num_tokens": 51971547.0,
"step": 762
},
{
"epoch": 1.5419615773508595,
"grad_norm": 0.17367734014987946,
"learning_rate": 0.00022117808883412337,
"loss": 0.2919,
"mean_token_accuracy": 0.8966298326849937,
"num_tokens": 52041743.0,
"step": 763
},
{
"epoch": 1.543983822042467,
"grad_norm": 0.15831947326660156,
"learning_rate": 0.0002207853956712544,
"loss": 0.2713,
"mean_token_accuracy": 0.9037296660244465,
"num_tokens": 52114445.0,
"step": 764
},
{
"epoch": 1.546006066734075,
"grad_norm": 0.1643955409526825,
"learning_rate": 0.00022039269877018066,
"loss": 0.2555,
"mean_token_accuracy": 0.9053449369966984,
"num_tokens": 52184749.0,
"step": 765
},
{
"epoch": 1.5480283114256825,
"grad_norm": 0.19596439599990845,
"learning_rate": 0.00022000000000000003,
"loss": 0.2991,
"mean_token_accuracy": 0.8983559235930443,
"num_tokens": 52246858.0,
"step": 766
},
{
"epoch": 1.5500505561172901,
"grad_norm": 0.17947359383106232,
"learning_rate": 0.00021960730122981938,
"loss": 0.3053,
"mean_token_accuracy": 0.894125934690237,
"num_tokens": 52311538.0,
"step": 767
},
{
"epoch": 1.5520728008088978,
"grad_norm": 0.1566184163093567,
"learning_rate": 0.00021921460432874565,
"loss": 0.2471,
"mean_token_accuracy": 0.9079805836081505,
"num_tokens": 52377316.0,
"step": 768
},
{
"epoch": 1.5540950455005056,
"grad_norm": 0.1782991886138916,
"learning_rate": 0.0002188219111658767,
"loss": 0.293,
"mean_token_accuracy": 0.8960098177194595,
"num_tokens": 52439738.0,
"step": 769
},
{
"epoch": 1.5561172901921132,
"grad_norm": 0.1581069380044937,
"learning_rate": 0.0002184292236102927,
"loss": 0.2728,
"mean_token_accuracy": 0.901589822024107,
"num_tokens": 52511123.0,
"step": 770
},
{
"epoch": 1.558139534883721,
"grad_norm": 0.16994433104991913,
"learning_rate": 0.0002180365435310472,
"loss": 0.2735,
"mean_token_accuracy": 0.9033331945538521,
"num_tokens": 52576097.0,
"step": 771
},
{
"epoch": 1.5601617795753286,
"grad_norm": 0.1678851991891861,
"learning_rate": 0.00021764387279715806,
"loss": 0.2903,
"mean_token_accuracy": 0.8981217853724957,
"num_tokens": 52651544.0,
"step": 772
},
{
"epoch": 1.5621840242669363,
"grad_norm": 0.19909563660621643,
"learning_rate": 0.00021725121327759866,
"loss": 0.2981,
"mean_token_accuracy": 0.8940173611044884,
"num_tokens": 52710252.0,
"step": 773
},
{
"epoch": 1.5642062689585439,
"grad_norm": 0.15204082429409027,
"learning_rate": 0.00021685856684128897,
"loss": 0.2523,
"mean_token_accuracy": 0.9075472876429558,
"num_tokens": 52781084.0,
"step": 774
},
{
"epoch": 1.5662285136501517,
"grad_norm": 0.19516132771968842,
"learning_rate": 0.00021646593535708695,
"loss": 0.2984,
"mean_token_accuracy": 0.8923540487885475,
"num_tokens": 52844889.0,
"step": 775
},
{
"epoch": 1.5682507583417593,
"grad_norm": 0.16001375019550323,
"learning_rate": 0.00021607332069377902,
"loss": 0.2668,
"mean_token_accuracy": 0.898894976824522,
"num_tokens": 52910879.0,
"step": 776
},
{
"epoch": 1.5702730030333671,
"grad_norm": 0.1823982298374176,
"learning_rate": 0.00021568072472007185,
"loss": 0.301,
"mean_token_accuracy": 0.8939221948385239,
"num_tokens": 52970597.0,
"step": 777
},
{
"epoch": 1.5722952477249748,
"grad_norm": 0.17761389911174774,
"learning_rate": 0.0002152881493045829,
"loss": 0.2601,
"mean_token_accuracy": 0.9084571748971939,
"num_tokens": 53042768.0,
"step": 778
},
{
"epoch": 1.5743174924165824,
"grad_norm": 0.17410063743591309,
"learning_rate": 0.00021489559631583194,
"loss": 0.272,
"mean_token_accuracy": 0.8999650180339813,
"num_tokens": 53103091.0,
"step": 779
},
{
"epoch": 1.57633973710819,
"grad_norm": 0.15084944665431976,
"learning_rate": 0.00021450306762223198,
"loss": 0.2387,
"mean_token_accuracy": 0.9114542976021767,
"num_tokens": 53180173.0,
"step": 780
},
{
"epoch": 1.5783619817997978,
"grad_norm": 0.18222583830356598,
"learning_rate": 0.00021411056509208033,
"loss": 0.2994,
"mean_token_accuracy": 0.8931626752018929,
"num_tokens": 53243670.0,
"step": 781
},
{
"epoch": 1.5803842264914054,
"grad_norm": 0.19381971657276154,
"learning_rate": 0.0002137180905935499,
"loss": 0.3116,
"mean_token_accuracy": 0.8938373290002346,
"num_tokens": 53301834.0,
"step": 782
},
{
"epoch": 1.5824064711830133,
"grad_norm": 0.157192200422287,
"learning_rate": 0.00021332564599467997,
"loss": 0.2654,
"mean_token_accuracy": 0.9061449654400349,
"num_tokens": 53368342.0,
"step": 783
},
{
"epoch": 1.5844287158746209,
"grad_norm": 0.1753574013710022,
"learning_rate": 0.00021293323316336774,
"loss": 0.2793,
"mean_token_accuracy": 0.901081707328558,
"num_tokens": 53430874.0,
"step": 784
},
{
"epoch": 1.5864509605662285,
"grad_norm": 0.17201204597949982,
"learning_rate": 0.00021254085396735895,
"loss": 0.2965,
"mean_token_accuracy": 0.8940661884844303,
"num_tokens": 53503559.0,
"step": 785
},
{
"epoch": 1.588473205257836,
"grad_norm": 0.16792644560337067,
"learning_rate": 0.00021214851027423953,
"loss": 0.2853,
"mean_token_accuracy": 0.8981418162584305,
"num_tokens": 53570685.0,
"step": 786
},
{
"epoch": 1.590495449949444,
"grad_norm": 0.1627027541399002,
"learning_rate": 0.00021175620395142631,
"loss": 0.2726,
"mean_token_accuracy": 0.9035519734025002,
"num_tokens": 53641626.0,
"step": 787
},
{
"epoch": 1.5925176946410515,
"grad_norm": 0.1961667835712433,
"learning_rate": 0.00021136393686615814,
"loss": 0.2932,
"mean_token_accuracy": 0.8953234739601612,
"num_tokens": 53703211.0,
"step": 788
},
{
"epoch": 1.5945399393326594,
"grad_norm": 0.16637316346168518,
"learning_rate": 0.00021097171088548718,
"loss": 0.2643,
"mean_token_accuracy": 0.8997809141874313,
"num_tokens": 53774031.0,
"step": 789
},
{
"epoch": 1.596562184024267,
"grad_norm": 0.16356298327445984,
"learning_rate": 0.0002105795278762701,
"loss": 0.2812,
"mean_token_accuracy": 0.9001871235668659,
"num_tokens": 53842430.0,
"step": 790
},
{
"epoch": 1.5985844287158746,
"grad_norm": 0.15379726886749268,
"learning_rate": 0.00021018738970515885,
"loss": 0.2753,
"mean_token_accuracy": 0.9013938829302788,
"num_tokens": 53918815.0,
"step": 791
},
{
"epoch": 1.6006066734074822,
"grad_norm": 0.17770731449127197,
"learning_rate": 0.000209795298238592,
"loss": 0.2775,
"mean_token_accuracy": 0.9020545892417431,
"num_tokens": 53984486.0,
"step": 792
},
{
"epoch": 1.60262891809909,
"grad_norm": 0.18510940670967102,
"learning_rate": 0.00020940325534278596,
"loss": 0.3084,
"mean_token_accuracy": 0.8904885537922382,
"num_tokens": 54049329.0,
"step": 793
},
{
"epoch": 1.6046511627906976,
"grad_norm": 0.15585996210575104,
"learning_rate": 0.00020901126288372574,
"loss": 0.243,
"mean_token_accuracy": 0.9050154872238636,
"num_tokens": 54121377.0,
"step": 794
},
{
"epoch": 1.6066734074823055,
"grad_norm": 0.2045961320400238,
"learning_rate": 0.0002086193227271565,
"loss": 0.3026,
"mean_token_accuracy": 0.8903013169765472,
"num_tokens": 54181133.0,
"step": 795
},
{
"epoch": 1.608695652173913,
"grad_norm": 0.15141364932060242,
"learning_rate": 0.00020822743673857424,
"loss": 0.2622,
"mean_token_accuracy": 0.90499372407794,
"num_tokens": 54263363.0,
"step": 796
},
{
"epoch": 1.6107178968655207,
"grad_norm": 0.15345874428749084,
"learning_rate": 0.0002078356067832174,
"loss": 0.2547,
"mean_token_accuracy": 0.904791995882988,
"num_tokens": 54334487.0,
"step": 797
},
{
"epoch": 1.6127401415571283,
"grad_norm": 0.16000673174858093,
"learning_rate": 0.00020744383472605745,
"loss": 0.2731,
"mean_token_accuracy": 0.9041004255414009,
"num_tokens": 54403142.0,
"step": 798
},
{
"epoch": 1.6147623862487361,
"grad_norm": 0.16879165172576904,
"learning_rate": 0.0002070521224317905,
"loss": 0.2736,
"mean_token_accuracy": 0.9043679311871529,
"num_tokens": 54467728.0,
"step": 799
},
{
"epoch": 1.6167846309403437,
"grad_norm": 0.16487041115760803,
"learning_rate": 0.00020666047176482816,
"loss": 0.2929,
"mean_token_accuracy": 0.8993552401661873,
"num_tokens": 54537407.0,
"step": 800
},
{
"epoch": 1.6188068756319516,
"grad_norm": 0.17032210528850555,
"learning_rate": 0.00020626888458928858,
"loss": 0.2799,
"mean_token_accuracy": 0.8998575955629349,
"num_tokens": 54599784.0,
"step": 801
},
{
"epoch": 1.6208291203235592,
"grad_norm": 0.17334811389446259,
"learning_rate": 0.00020587736276898798,
"loss": 0.2606,
"mean_token_accuracy": 0.901070773601532,
"num_tokens": 54663420.0,
"step": 802
},
{
"epoch": 1.6228513650151668,
"grad_norm": 0.1741548776626587,
"learning_rate": 0.00020548590816743108,
"loss": 0.2823,
"mean_token_accuracy": 0.8988193459808826,
"num_tokens": 54727696.0,
"step": 803
},
{
"epoch": 1.6248736097067744,
"grad_norm": 0.1664174348115921,
"learning_rate": 0.00020509452264780325,
"loss": 0.2643,
"mean_token_accuracy": 0.9047059267759323,
"num_tokens": 54791606.0,
"step": 804
},
{
"epoch": 1.6268958543983822,
"grad_norm": 0.14816100895404816,
"learning_rate": 0.0002047032080729605,
"loss": 0.2449,
"mean_token_accuracy": 0.903932623565197,
"num_tokens": 54865039.0,
"step": 805
},
{
"epoch": 1.6289180990899899,
"grad_norm": 0.12588512897491455,
"learning_rate": 0.00020431196630542152,
"loss": 0.227,
"mean_token_accuracy": 0.9160388633608818,
"num_tokens": 54958620.0,
"step": 806
},
{
"epoch": 1.6309403437815977,
"grad_norm": 0.17510341107845306,
"learning_rate": 0.00020392079920735835,
"loss": 0.2963,
"mean_token_accuracy": 0.8962272480130196,
"num_tokens": 55024008.0,
"step": 807
},
{
"epoch": 1.6329625884732053,
"grad_norm": 0.1542372852563858,
"learning_rate": 0.00020352970864058757,
"loss": 0.2614,
"mean_token_accuracy": 0.9044002443552017,
"num_tokens": 55087163.0,
"step": 808
},
{
"epoch": 1.634984833164813,
"grad_norm": 0.16116388142108917,
"learning_rate": 0.00020313869646656162,
"loss": 0.2721,
"mean_token_accuracy": 0.9043215177953243,
"num_tokens": 55154699.0,
"step": 809
},
{
"epoch": 1.6370070778564205,
"grad_norm": 0.1448214203119278,
"learning_rate": 0.0002027477645463597,
"loss": 0.2598,
"mean_token_accuracy": 0.9074460677802563,
"num_tokens": 55224995.0,
"step": 810
},
{
"epoch": 1.6390293225480284,
"grad_norm": 0.17575567960739136,
"learning_rate": 0.00020235691474067912,
"loss": 0.2647,
"mean_token_accuracy": 0.9028755128383636,
"num_tokens": 55291465.0,
"step": 811
},
{
"epoch": 1.641051567239636,
"grad_norm": 0.1718558669090271,
"learning_rate": 0.0002019661489098263,
"loss": 0.2658,
"mean_token_accuracy": 0.9058180525898933,
"num_tokens": 55356793.0,
"step": 812
},
{
"epoch": 1.6430738119312438,
"grad_norm": 0.16898474097251892,
"learning_rate": 0.00020157546891370797,
"loss": 0.2868,
"mean_token_accuracy": 0.9008054211735725,
"num_tokens": 55428748.0,
"step": 813
},
{
"epoch": 1.6450960566228514,
"grad_norm": 0.1628302037715912,
"learning_rate": 0.00020118487661182241,
"loss": 0.2667,
"mean_token_accuracy": 0.9067884795367718,
"num_tokens": 55499100.0,
"step": 814
},
{
"epoch": 1.647118301314459,
"grad_norm": 0.18391703069210052,
"learning_rate": 0.00020079437386325032,
"loss": 0.3138,
"mean_token_accuracy": 0.8893741592764854,
"num_tokens": 55570834.0,
"step": 815
},
{
"epoch": 1.6491405460060666,
"grad_norm": 0.17336952686309814,
"learning_rate": 0.00020040396252664642,
"loss": 0.2778,
"mean_token_accuracy": 0.9026199728250504,
"num_tokens": 55640251.0,
"step": 816
},
{
"epoch": 1.6511627906976745,
"grad_norm": 0.14611810445785522,
"learning_rate": 0.00020001364446023002,
"loss": 0.2433,
"mean_token_accuracy": 0.9094121158123016,
"num_tokens": 55722284.0,
"step": 817
},
{
"epoch": 1.653185035389282,
"grad_norm": 0.15244677662849426,
"learning_rate": 0.0001996234215217768,
"loss": 0.2825,
"mean_token_accuracy": 0.8958746008574963,
"num_tokens": 55792901.0,
"step": 818
},
{
"epoch": 1.65520728008089,
"grad_norm": 0.17220915853977203,
"learning_rate": 0.00019923329556860954,
"loss": 0.2887,
"mean_token_accuracy": 0.8924598507583141,
"num_tokens": 55854966.0,
"step": 819
},
{
"epoch": 1.6572295247724975,
"grad_norm": 0.1598389595746994,
"learning_rate": 0.00019884326845758925,
"loss": 0.2718,
"mean_token_accuracy": 0.9044957980513573,
"num_tokens": 55929575.0,
"step": 820
},
{
"epoch": 1.6592517694641051,
"grad_norm": 0.1721997857093811,
"learning_rate": 0.0001984533420451068,
"loss": 0.2944,
"mean_token_accuracy": 0.8970884680747986,
"num_tokens": 55997255.0,
"step": 821
},
{
"epoch": 1.6612740141557127,
"grad_norm": 0.184437558054924,
"learning_rate": 0.0001980635181870735,
"loss": 0.2996,
"mean_token_accuracy": 0.8938624709844589,
"num_tokens": 56059407.0,
"step": 822
},
{
"epoch": 1.6632962588473206,
"grad_norm": 0.17128629982471466,
"learning_rate": 0.0001976737987389127,
"loss": 0.2999,
"mean_token_accuracy": 0.892108865082264,
"num_tokens": 56124709.0,
"step": 823
},
{
"epoch": 1.6653185035389282,
"grad_norm": 0.18039193749427795,
"learning_rate": 0.00019728418555555068,
"loss": 0.248,
"mean_token_accuracy": 0.8995288237929344,
"num_tokens": 56191016.0,
"step": 824
},
{
"epoch": 1.667340748230536,
"grad_norm": 0.16719485819339752,
"learning_rate": 0.00019689468049140802,
"loss": 0.2673,
"mean_token_accuracy": 0.9037236869335175,
"num_tokens": 56259334.0,
"step": 825
},
{
"epoch": 1.6693629929221436,
"grad_norm": 0.16847628355026245,
"learning_rate": 0.00019650528540039077,
"loss": 0.266,
"mean_token_accuracy": 0.9061383940279484,
"num_tokens": 56326477.0,
"step": 826
},
{
"epoch": 1.6713852376137512,
"grad_norm": 0.17857936024665833,
"learning_rate": 0.00019611600213588127,
"loss": 0.3023,
"mean_token_accuracy": 0.8900899365544319,
"num_tokens": 56386327.0,
"step": 827
},
{
"epoch": 1.6734074823053589,
"grad_norm": 0.18187786638736725,
"learning_rate": 0.0001957268325507299,
"loss": 0.3001,
"mean_token_accuracy": 0.8927515000104904,
"num_tokens": 56446400.0,
"step": 828
},
{
"epoch": 1.6754297269969667,
"grad_norm": 0.15920601785182953,
"learning_rate": 0.0001953377784972458,
"loss": 0.2834,
"mean_token_accuracy": 0.8984440118074417,
"num_tokens": 56516627.0,
"step": 829
},
{
"epoch": 1.6774519716885743,
"grad_norm": 0.16971920430660248,
"learning_rate": 0.00019494884182718827,
"loss": 0.2845,
"mean_token_accuracy": 0.8991547487676144,
"num_tokens": 56586404.0,
"step": 830
},
{
"epoch": 1.6794742163801821,
"grad_norm": 0.16059236228466034,
"learning_rate": 0.00019456002439175794,
"loss": 0.2658,
"mean_token_accuracy": 0.9038873426616192,
"num_tokens": 56657253.0,
"step": 831
},
{
"epoch": 1.6814964610717897,
"grad_norm": 0.16817672550678253,
"learning_rate": 0.00019417132804158777,
"loss": 0.2825,
"mean_token_accuracy": 0.8981058970093727,
"num_tokens": 56725926.0,
"step": 832
},
{
"epoch": 1.6835187057633973,
"grad_norm": 0.15651072561740875,
"learning_rate": 0.00019378275462673464,
"loss": 0.2683,
"mean_token_accuracy": 0.9055442661046982,
"num_tokens": 56794928.0,
"step": 833
},
{
"epoch": 1.685540950455005,
"grad_norm": 0.16662436723709106,
"learning_rate": 0.00019339430599667009,
"loss": 0.2795,
"mean_token_accuracy": 0.9005163908004761,
"num_tokens": 56861202.0,
"step": 834
},
{
"epoch": 1.6875631951466128,
"grad_norm": 0.15520507097244263,
"learning_rate": 0.0001930059840002719,
"loss": 0.2789,
"mean_token_accuracy": 0.9018525704741478,
"num_tokens": 56940546.0,
"step": 835
},
{
"epoch": 1.6895854398382204,
"grad_norm": 0.16705678403377533,
"learning_rate": 0.00019261779048581498,
"loss": 0.2817,
"mean_token_accuracy": 0.9004562273621559,
"num_tokens": 57010510.0,
"step": 836
},
{
"epoch": 1.6916076845298282,
"grad_norm": 0.17928999662399292,
"learning_rate": 0.00019222972730096281,
"loss": 0.2898,
"mean_token_accuracy": 0.8954050242900848,
"num_tokens": 57076063.0,
"step": 837
},
{
"epoch": 1.6936299292214358,
"grad_norm": 0.17176282405853271,
"learning_rate": 0.00019184179629275842,
"loss": 0.2784,
"mean_token_accuracy": 0.9002024792134762,
"num_tokens": 57139142.0,
"step": 838
},
{
"epoch": 1.6956521739130435,
"grad_norm": 0.2011646181344986,
"learning_rate": 0.00019145399930761592,
"loss": 0.33,
"mean_token_accuracy": 0.8854256272315979,
"num_tokens": 57196478.0,
"step": 839
},
{
"epoch": 1.697674418604651,
"grad_norm": 0.1873674988746643,
"learning_rate": 0.00019106633819131132,
"loss": 0.2956,
"mean_token_accuracy": 0.8927418142557144,
"num_tokens": 57257834.0,
"step": 840
},
{
"epoch": 1.699696663296259,
"grad_norm": 0.15767039358615875,
"learning_rate": 0.00019067881478897406,
"loss": 0.2603,
"mean_token_accuracy": 0.8997323326766491,
"num_tokens": 57332979.0,
"step": 841
},
{
"epoch": 1.7017189079878665,
"grad_norm": 0.1793752908706665,
"learning_rate": 0.00019029143094507803,
"loss": 0.2914,
"mean_token_accuracy": 0.8960652127861977,
"num_tokens": 57393796.0,
"step": 842
},
{
"epoch": 1.7037411526794743,
"grad_norm": 0.17136353254318237,
"learning_rate": 0.00018990418850343299,
"loss": 0.2831,
"mean_token_accuracy": 0.8985873088240623,
"num_tokens": 57461020.0,
"step": 843
},
{
"epoch": 1.705763397371082,
"grad_norm": 0.15511257946491241,
"learning_rate": 0.0001895170893071756,
"loss": 0.2648,
"mean_token_accuracy": 0.9044994860887527,
"num_tokens": 57534185.0,
"step": 844
},
{
"epoch": 1.7077856420626896,
"grad_norm": 0.16191929578781128,
"learning_rate": 0.00018913013519876066,
"loss": 0.2689,
"mean_token_accuracy": 0.9016621857881546,
"num_tokens": 57602240.0,
"step": 845
},
{
"epoch": 1.7098078867542972,
"grad_norm": 0.17678587138652802,
"learning_rate": 0.00018874332801995257,
"loss": 0.2791,
"mean_token_accuracy": 0.9009885340929031,
"num_tokens": 57665999.0,
"step": 846
},
{
"epoch": 1.7118301314459048,
"grad_norm": 0.14890553057193756,
"learning_rate": 0.0001883566696118162,
"loss": 0.2469,
"mean_token_accuracy": 0.9083396308124065,
"num_tokens": 57745250.0,
"step": 847
},
{
"epoch": 1.7138523761375126,
"grad_norm": 0.16399073600769043,
"learning_rate": 0.00018797016181470856,
"loss": 0.2699,
"mean_token_accuracy": 0.9010614044964314,
"num_tokens": 57820665.0,
"step": 848
},
{
"epoch": 1.7158746208291205,
"grad_norm": 0.17773596942424774,
"learning_rate": 0.00018758380646826943,
"loss": 0.2801,
"mean_token_accuracy": 0.9002369157969952,
"num_tokens": 57882848.0,
"step": 849
},
{
"epoch": 1.717896865520728,
"grad_norm": 0.17527812719345093,
"learning_rate": 0.00018719760541141347,
"loss": 0.285,
"mean_token_accuracy": 0.8988419659435749,
"num_tokens": 57956449.0,
"step": 850
},
{
"epoch": 1.7199191102123357,
"grad_norm": 0.17075812816619873,
"learning_rate": 0.00018681156048232063,
"loss": 0.2797,
"mean_token_accuracy": 0.8975733481347561,
"num_tokens": 58019620.0,
"step": 851
},
{
"epoch": 1.7219413549039433,
"grad_norm": 0.162892147898674,
"learning_rate": 0.00018642567351842776,
"loss": 0.3048,
"mean_token_accuracy": 0.8936393298208714,
"num_tokens": 58084770.0,
"step": 852
},
{
"epoch": 1.723963599595551,
"grad_norm": 0.1569058746099472,
"learning_rate": 0.0001860399463564201,
"loss": 0.2779,
"mean_token_accuracy": 0.90330421179533,
"num_tokens": 58149930.0,
"step": 853
},
{
"epoch": 1.7259858442871587,
"grad_norm": 0.15333376824855804,
"learning_rate": 0.00018565438083222193,
"loss": 0.2431,
"mean_token_accuracy": 0.9056979790329933,
"num_tokens": 58218764.0,
"step": 854
},
{
"epoch": 1.7280080889787666,
"grad_norm": 0.18095627427101135,
"learning_rate": 0.00018526897878098857,
"loss": 0.2914,
"mean_token_accuracy": 0.8964138776063919,
"num_tokens": 58280108.0,
"step": 855
},
{
"epoch": 1.7300303336703742,
"grad_norm": 0.17549115419387817,
"learning_rate": 0.00018488374203709694,
"loss": 0.2715,
"mean_token_accuracy": 0.9019583091139793,
"num_tokens": 58349603.0,
"step": 856
},
{
"epoch": 1.7320525783619818,
"grad_norm": 0.14637798070907593,
"learning_rate": 0.00018449867243413732,
"loss": 0.2479,
"mean_token_accuracy": 0.9110586978495121,
"num_tokens": 58423158.0,
"step": 857
},
{
"epoch": 1.7340748230535894,
"grad_norm": 0.18153415620326996,
"learning_rate": 0.00018411377180490454,
"loss": 0.2838,
"mean_token_accuracy": 0.8981715328991413,
"num_tokens": 58489878.0,
"step": 858
},
{
"epoch": 1.736097067745197,
"grad_norm": 0.14081305265426636,
"learning_rate": 0.00018372904198138895,
"loss": 0.2421,
"mean_token_accuracy": 0.9120564199984074,
"num_tokens": 58567119.0,
"step": 859
},
{
"epoch": 1.7381193124368048,
"grad_norm": 0.19423925876617432,
"learning_rate": 0.0001833444847947681,
"loss": 0.2827,
"mean_token_accuracy": 0.8960412628948689,
"num_tokens": 58629512.0,
"step": 860
},
{
"epoch": 1.7401415571284127,
"grad_norm": 0.1835591346025467,
"learning_rate": 0.00018296010207539775,
"loss": 0.3066,
"mean_token_accuracy": 0.8935861364006996,
"num_tokens": 58692056.0,
"step": 861
},
{
"epoch": 1.7421638018200203,
"grad_norm": 0.17017914354801178,
"learning_rate": 0.00018257589565280337,
"loss": 0.2839,
"mean_token_accuracy": 0.8955631256103516,
"num_tokens": 58757823.0,
"step": 862
},
{
"epoch": 1.744186046511628,
"grad_norm": 0.17654229700565338,
"learning_rate": 0.0001821918673556712,
"loss": 0.2856,
"mean_token_accuracy": 0.9003425352275372,
"num_tokens": 58820185.0,
"step": 863
},
{
"epoch": 1.7462082912032355,
"grad_norm": 0.18433596193790436,
"learning_rate": 0.00018180801901183967,
"loss": 0.276,
"mean_token_accuracy": 0.9058196842670441,
"num_tokens": 58888573.0,
"step": 864
},
{
"epoch": 1.7482305358948431,
"grad_norm": 0.16942624747753143,
"learning_rate": 0.0001814243524482909,
"loss": 0.2676,
"mean_token_accuracy": 0.9044988267123699,
"num_tokens": 58953010.0,
"step": 865
},
{
"epoch": 1.750252780586451,
"grad_norm": 0.1317698061466217,
"learning_rate": 0.0001810408694911415,
"loss": 0.2423,
"mean_token_accuracy": 0.9077907241880894,
"num_tokens": 59032037.0,
"step": 866
},
{
"epoch": 1.7522750252780588,
"grad_norm": 0.17676536738872528,
"learning_rate": 0.00018065757196563444,
"loss": 0.2834,
"mean_token_accuracy": 0.90084283426404,
"num_tokens": 59102101.0,
"step": 867
},
{
"epoch": 1.7542972699696664,
"grad_norm": 0.16460995376110077,
"learning_rate": 0.00018027446169612983,
"loss": 0.266,
"mean_token_accuracy": 0.8994225487112999,
"num_tokens": 59175507.0,
"step": 868
},
{
"epoch": 1.756319514661274,
"grad_norm": 0.15954379737377167,
"learning_rate": 0.0001798915405060968,
"loss": 0.2613,
"mean_token_accuracy": 0.9075300879776478,
"num_tokens": 59241915.0,
"step": 869
},
{
"epoch": 1.7583417593528816,
"grad_norm": 0.17243851721286774,
"learning_rate": 0.00017950881021810435,
"loss": 0.2653,
"mean_token_accuracy": 0.9034992009401321,
"num_tokens": 59305436.0,
"step": 870
},
{
"epoch": 1.7603640040444892,
"grad_norm": 0.17760290205478668,
"learning_rate": 0.00017912627265381285,
"loss": 0.2885,
"mean_token_accuracy": 0.8978960253298283,
"num_tokens": 59370395.0,
"step": 871
},
{
"epoch": 1.762386248736097,
"grad_norm": 0.17663156986236572,
"learning_rate": 0.00017874392963396552,
"loss": 0.2931,
"mean_token_accuracy": 0.8978605940937996,
"num_tokens": 59435634.0,
"step": 872
},
{
"epoch": 1.764408493427705,
"grad_norm": 0.17674268782138824,
"learning_rate": 0.00017836178297837938,
"loss": 0.2717,
"mean_token_accuracy": 0.9010186977684498,
"num_tokens": 59500074.0,
"step": 873
},
{
"epoch": 1.7664307381193125,
"grad_norm": 0.16433486342430115,
"learning_rate": 0.0001779798345059371,
"loss": 0.2598,
"mean_token_accuracy": 0.9028237722814083,
"num_tokens": 59574564.0,
"step": 874
},
{
"epoch": 1.7684529828109201,
"grad_norm": 0.15955936908721924,
"learning_rate": 0.0001775980860345778,
"loss": 0.2677,
"mean_token_accuracy": 0.9025723747909069,
"num_tokens": 59650978.0,
"step": 875
},
{
"epoch": 1.7704752275025277,
"grad_norm": 0.1750318855047226,
"learning_rate": 0.00017721653938128888,
"loss": 0.2866,
"mean_token_accuracy": 0.8999117016792297,
"num_tokens": 59714437.0,
"step": 876
},
{
"epoch": 1.7724974721941353,
"grad_norm": 0.15407449007034302,
"learning_rate": 0.00017683519636209707,
"loss": 0.2586,
"mean_token_accuracy": 0.9031764194369316,
"num_tokens": 59795096.0,
"step": 877
},
{
"epoch": 1.7745197168857432,
"grad_norm": 0.16260726749897003,
"learning_rate": 0.00017645405879205983,
"loss": 0.275,
"mean_token_accuracy": 0.9040297567844391,
"num_tokens": 59862394.0,
"step": 878
},
{
"epoch": 1.776541961577351,
"grad_norm": 0.16649970412254333,
"learning_rate": 0.0001760731284852568,
"loss": 0.278,
"mean_token_accuracy": 0.8974411375820637,
"num_tokens": 59932031.0,
"step": 879
},
{
"epoch": 1.7785642062689586,
"grad_norm": 0.1494332104921341,
"learning_rate": 0.0001756924072547813,
"loss": 0.2579,
"mean_token_accuracy": 0.905670553445816,
"num_tokens": 60011025.0,
"step": 880
},
{
"epoch": 1.7805864509605662,
"grad_norm": 0.18167705833911896,
"learning_rate": 0.00017531189691273106,
"loss": 0.2776,
"mean_token_accuracy": 0.8976808004081249,
"num_tokens": 60068820.0,
"step": 881
},
{
"epoch": 1.7826086956521738,
"grad_norm": 0.16186164319515228,
"learning_rate": 0.00017493159927020054,
"loss": 0.2811,
"mean_token_accuracy": 0.9016175977885723,
"num_tokens": 60130140.0,
"step": 882
},
{
"epoch": 1.7846309403437814,
"grad_norm": 0.17380307614803314,
"learning_rate": 0.0001745515161372716,
"loss": 0.2945,
"mean_token_accuracy": 0.8955324217677116,
"num_tokens": 60193073.0,
"step": 883
},
{
"epoch": 1.7866531850353893,
"grad_norm": 0.17945754528045654,
"learning_rate": 0.00017417164932300502,
"loss": 0.2722,
"mean_token_accuracy": 0.8948768936097622,
"num_tokens": 60255959.0,
"step": 884
},
{
"epoch": 1.7886754297269971,
"grad_norm": 0.16201643645763397,
"learning_rate": 0.00017379200063543225,
"loss": 0.2761,
"mean_token_accuracy": 0.8984379507601261,
"num_tokens": 60331653.0,
"step": 885
},
{
"epoch": 1.7906976744186047,
"grad_norm": 0.17004264891147614,
"learning_rate": 0.00017341257188154625,
"loss": 0.2785,
"mean_token_accuracy": 0.902726124972105,
"num_tokens": 60397891.0,
"step": 886
},
{
"epoch": 1.7927199191102123,
"grad_norm": 0.17423401772975922,
"learning_rate": 0.0001730333648672934,
"loss": 0.2663,
"mean_token_accuracy": 0.9040607661008835,
"num_tokens": 60463271.0,
"step": 887
},
{
"epoch": 1.79474216380182,
"grad_norm": 0.17113754153251648,
"learning_rate": 0.00017265438139756455,
"loss": 0.2754,
"mean_token_accuracy": 0.901301734149456,
"num_tokens": 60527757.0,
"step": 888
},
{
"epoch": 1.7967644084934276,
"grad_norm": 0.1624325066804886,
"learning_rate": 0.00017227562327618655,
"loss": 0.264,
"mean_token_accuracy": 0.8982259891927242,
"num_tokens": 60590938.0,
"step": 889
},
{
"epoch": 1.7987866531850354,
"grad_norm": 0.17364652454853058,
"learning_rate": 0.00017189709230591376,
"loss": 0.2768,
"mean_token_accuracy": 0.8977219946682453,
"num_tokens": 60666355.0,
"step": 890
},
{
"epoch": 1.8008088978766432,
"grad_norm": 0.15866470336914062,
"learning_rate": 0.00017151879028841935,
"loss": 0.2556,
"mean_token_accuracy": 0.9094675220549107,
"num_tokens": 60734511.0,
"step": 891
},
{
"epoch": 1.8028311425682508,
"grad_norm": 0.16959354281425476,
"learning_rate": 0.0001711407190242867,
"loss": 0.2827,
"mean_token_accuracy": 0.9035419821739197,
"num_tokens": 60800072.0,
"step": 892
},
{
"epoch": 1.8048533872598584,
"grad_norm": 0.13593734800815582,
"learning_rate": 0.00017076288031300086,
"loss": 0.2233,
"mean_token_accuracy": 0.9116230644285679,
"num_tokens": 60877569.0,
"step": 893
},
{
"epoch": 1.806875631951466,
"grad_norm": 0.159558966755867,
"learning_rate": 0.00017038527595294016,
"loss": 0.2713,
"mean_token_accuracy": 0.9025290682911873,
"num_tokens": 60946273.0,
"step": 894
},
{
"epoch": 1.8088978766430737,
"grad_norm": 0.14993025362491608,
"learning_rate": 0.00017000790774136744,
"loss": 0.2563,
"mean_token_accuracy": 0.906671367585659,
"num_tokens": 61021490.0,
"step": 895
},
{
"epoch": 1.8109201213346815,
"grad_norm": 0.16624176502227783,
"learning_rate": 0.00016963077747442147,
"loss": 0.285,
"mean_token_accuracy": 0.9001613892614841,
"num_tokens": 61087077.0,
"step": 896
},
{
"epoch": 1.8129423660262893,
"grad_norm": 0.18598856031894684,
"learning_rate": 0.00016925388694710857,
"loss": 0.2816,
"mean_token_accuracy": 0.8991341292858124,
"num_tokens": 61155366.0,
"step": 897
},
{
"epoch": 1.814964610717897,
"grad_norm": 0.15858127176761627,
"learning_rate": 0.00016887723795329395,
"loss": 0.259,
"mean_token_accuracy": 0.9013683348894119,
"num_tokens": 61227279.0,
"step": 898
},
{
"epoch": 1.8169868554095046,
"grad_norm": 0.17217408120632172,
"learning_rate": 0.00016850083228569327,
"loss": 0.3001,
"mean_token_accuracy": 0.8970577903091908,
"num_tokens": 61294506.0,
"step": 899
},
{
"epoch": 1.8190091001011122,
"grad_norm": 0.15035738050937653,
"learning_rate": 0.00016812467173586395,
"loss": 0.2645,
"mean_token_accuracy": 0.9000033251941204,
"num_tokens": 61365391.0,
"step": 900
},
{
"epoch": 1.8210313447927198,
"grad_norm": 0.17095452547073364,
"learning_rate": 0.0001677487580941968,
"loss": 0.2723,
"mean_token_accuracy": 0.9036833345890045,
"num_tokens": 61430318.0,
"step": 901
},
{
"epoch": 1.8230535894843276,
"grad_norm": 0.18995128571987152,
"learning_rate": 0.00016737309314990742,
"loss": 0.2963,
"mean_token_accuracy": 0.897097785025835,
"num_tokens": 61490667.0,
"step": 902
},
{
"epoch": 1.8250758341759354,
"grad_norm": 0.15838812291622162,
"learning_rate": 0.00016699767869102767,
"loss": 0.2597,
"mean_token_accuracy": 0.9020838551223278,
"num_tokens": 61566103.0,
"step": 903
},
{
"epoch": 1.827098078867543,
"grad_norm": 0.17972201108932495,
"learning_rate": 0.00016662251650439725,
"loss": 0.2853,
"mean_token_accuracy": 0.899272233247757,
"num_tokens": 61628595.0,
"step": 904
},
{
"epoch": 1.8291203235591507,
"grad_norm": 0.1463383138179779,
"learning_rate": 0.0001662476083756551,
"loss": 0.26,
"mean_token_accuracy": 0.9080706797540188,
"num_tokens": 61703786.0,
"step": 905
},
{
"epoch": 1.8311425682507583,
"grad_norm": 0.16255010664463043,
"learning_rate": 0.00016587295608923088,
"loss": 0.2805,
"mean_token_accuracy": 0.9013442508876324,
"num_tokens": 61776819.0,
"step": 906
},
{
"epoch": 1.8331648129423659,
"grad_norm": 0.17431674897670746,
"learning_rate": 0.0001654985614283366,
"loss": 0.2877,
"mean_token_accuracy": 0.9047906063497066,
"num_tokens": 61846922.0,
"step": 907
},
{
"epoch": 1.8351870576339737,
"grad_norm": 0.1731417030096054,
"learning_rate": 0.00016512442617495804,
"loss": 0.2809,
"mean_token_accuracy": 0.8943095356225967,
"num_tokens": 61913305.0,
"step": 908
},
{
"epoch": 1.8372093023255816,
"grad_norm": 0.17473085224628448,
"learning_rate": 0.00016475055210984641,
"loss": 0.2765,
"mean_token_accuracy": 0.9039146527647972,
"num_tokens": 61974613.0,
"step": 909
},
{
"epoch": 1.8392315470171892,
"grad_norm": 0.1697629690170288,
"learning_rate": 0.00016437694101250952,
"loss": 0.2672,
"mean_token_accuracy": 0.9050569906830788,
"num_tokens": 62042608.0,
"step": 910
},
{
"epoch": 1.8412537917087968,
"grad_norm": 0.1614944189786911,
"learning_rate": 0.00016400359466120366,
"loss": 0.2737,
"mean_token_accuracy": 0.9029634855687618,
"num_tokens": 62112444.0,
"step": 911
},
{
"epoch": 1.8432760364004044,
"grad_norm": 0.17687106132507324,
"learning_rate": 0.00016363051483292513,
"loss": 0.2648,
"mean_token_accuracy": 0.9044081643223763,
"num_tokens": 62181562.0,
"step": 912
},
{
"epoch": 1.845298281092012,
"grad_norm": 0.1807907372713089,
"learning_rate": 0.0001632577033034015,
"loss": 0.274,
"mean_token_accuracy": 0.9006009586155415,
"num_tokens": 62245198.0,
"step": 913
},
{
"epoch": 1.8473205257836198,
"grad_norm": 0.19810381531715393,
"learning_rate": 0.00016288516184708346,
"loss": 0.2893,
"mean_token_accuracy": 0.8972717076539993,
"num_tokens": 62308012.0,
"step": 914
},
{
"epoch": 1.8493427704752277,
"grad_norm": 0.15699312090873718,
"learning_rate": 0.00016251289223713616,
"loss": 0.2744,
"mean_token_accuracy": 0.9030490145087242,
"num_tokens": 62379728.0,
"step": 915
},
{
"epoch": 1.8513650151668353,
"grad_norm": 0.1709468960762024,
"learning_rate": 0.000162140896245431,
"loss": 0.2471,
"mean_token_accuracy": 0.9071713648736477,
"num_tokens": 62446402.0,
"step": 916
},
{
"epoch": 1.8533872598584429,
"grad_norm": 0.152323380112648,
"learning_rate": 0.00016176917564253679,
"loss": 0.231,
"mean_token_accuracy": 0.9163475334644318,
"num_tokens": 62521000.0,
"step": 917
},
{
"epoch": 1.8554095045500505,
"grad_norm": 0.1929645538330078,
"learning_rate": 0.00016139773219771186,
"loss": 0.2972,
"mean_token_accuracy": 0.8963135108351707,
"num_tokens": 62582288.0,
"step": 918
},
{
"epoch": 1.857431749241658,
"grad_norm": 0.14357374608516693,
"learning_rate": 0.00016102656767889522,
"loss": 0.2525,
"mean_token_accuracy": 0.905899915844202,
"num_tokens": 62659214.0,
"step": 919
},
{
"epoch": 1.859453993933266,
"grad_norm": 0.18147152662277222,
"learning_rate": 0.00016065568385269834,
"loss": 0.3062,
"mean_token_accuracy": 0.891651626676321,
"num_tokens": 62721172.0,
"step": 920
},
{
"epoch": 1.8614762386248738,
"grad_norm": 0.15798717737197876,
"learning_rate": 0.0001602850824843967,
"loss": 0.2441,
"mean_token_accuracy": 0.9044736139476299,
"num_tokens": 62790693.0,
"step": 921
},
{
"epoch": 1.8634984833164814,
"grad_norm": 0.20527228713035583,
"learning_rate": 0.00015991476533792125,
"loss": 0.2862,
"mean_token_accuracy": 0.9020938500761986,
"num_tokens": 62861978.0,
"step": 922
},
{
"epoch": 1.865520728008089,
"grad_norm": 0.17211146652698517,
"learning_rate": 0.00015954473417585042,
"loss": 0.278,
"mean_token_accuracy": 0.901647973805666,
"num_tokens": 62928176.0,
"step": 923
},
{
"epoch": 1.8675429726996966,
"grad_norm": 0.14656521379947662,
"learning_rate": 0.00015917499075940116,
"loss": 0.2436,
"mean_token_accuracy": 0.9071595072746277,
"num_tokens": 63008955.0,
"step": 924
},
{
"epoch": 1.8695652173913042,
"grad_norm": 0.16858512163162231,
"learning_rate": 0.000158805536848421,
"loss": 0.2838,
"mean_token_accuracy": 0.8976234942674637,
"num_tokens": 63077352.0,
"step": 925
},
{
"epoch": 1.871587462082912,
"grad_norm": 0.15110129117965698,
"learning_rate": 0.00015843637420137965,
"loss": 0.2491,
"mean_token_accuracy": 0.9075470231473446,
"num_tokens": 63155136.0,
"step": 926
},
{
"epoch": 1.8736097067745199,
"grad_norm": 0.16917841136455536,
"learning_rate": 0.00015806750457536016,
"loss": 0.2777,
"mean_token_accuracy": 0.9005607068538666,
"num_tokens": 63228469.0,
"step": 927
},
{
"epoch": 1.8756319514661275,
"grad_norm": 0.15289200842380524,
"learning_rate": 0.00015769892972605125,
"loss": 0.2535,
"mean_token_accuracy": 0.9035063087940216,
"num_tokens": 63299131.0,
"step": 928
},
{
"epoch": 1.877654196157735,
"grad_norm": 0.16520720720291138,
"learning_rate": 0.00015733065140773845,
"loss": 0.2742,
"mean_token_accuracy": 0.9034424312412739,
"num_tokens": 63370295.0,
"step": 929
},
{
"epoch": 1.8796764408493427,
"grad_norm": 0.15712064504623413,
"learning_rate": 0.00015696267137329584,
"loss": 0.2667,
"mean_token_accuracy": 0.9040120244026184,
"num_tokens": 63437736.0,
"step": 930
},
{
"epoch": 1.8816986855409503,
"grad_norm": 0.1603911817073822,
"learning_rate": 0.00015659499137417798,
"loss": 0.2507,
"mean_token_accuracy": 0.9087044671177864,
"num_tokens": 63509676.0,
"step": 931
},
{
"epoch": 1.8837209302325582,
"grad_norm": 0.16669879853725433,
"learning_rate": 0.00015622761316041114,
"loss": 0.278,
"mean_token_accuracy": 0.8977540507912636,
"num_tokens": 63576754.0,
"step": 932
},
{
"epoch": 1.885743174924166,
"grad_norm": 0.17182767391204834,
"learning_rate": 0.00015586053848058536,
"loss": 0.2526,
"mean_token_accuracy": 0.9016401395201683,
"num_tokens": 63643843.0,
"step": 933
},
{
"epoch": 1.8877654196157736,
"grad_norm": 0.17400912940502167,
"learning_rate": 0.00015549376908184596,
"loss": 0.282,
"mean_token_accuracy": 0.8970470912754536,
"num_tokens": 63712033.0,
"step": 934
},
{
"epoch": 1.8897876643073812,
"grad_norm": 0.16362541913986206,
"learning_rate": 0.00015512730670988508,
"loss": 0.2794,
"mean_token_accuracy": 0.9033955708146095,
"num_tokens": 63783615.0,
"step": 935
},
{
"epoch": 1.8918099089989888,
"grad_norm": 0.20413319766521454,
"learning_rate": 0.00015476115310893374,
"loss": 0.2973,
"mean_token_accuracy": 0.8986438475549221,
"num_tokens": 63837579.0,
"step": 936
},
{
"epoch": 1.8938321536905964,
"grad_norm": 0.173280730843544,
"learning_rate": 0.00015439531002175305,
"loss": 0.2614,
"mean_token_accuracy": 0.9053931087255478,
"num_tokens": 63904296.0,
"step": 937
},
{
"epoch": 1.8958543983822043,
"grad_norm": 0.16067558526992798,
"learning_rate": 0.00015402977918962653,
"loss": 0.2688,
"mean_token_accuracy": 0.905962623655796,
"num_tokens": 63982577.0,
"step": 938
},
{
"epoch": 1.897876643073812,
"grad_norm": 0.18021517992019653,
"learning_rate": 0.00015366456235235113,
"loss": 0.2935,
"mean_token_accuracy": 0.8951955139636993,
"num_tokens": 64038048.0,
"step": 939
},
{
"epoch": 1.8998988877654197,
"grad_norm": 0.14851278066635132,
"learning_rate": 0.0001532996612482295,
"loss": 0.2661,
"mean_token_accuracy": 0.9066961444914341,
"num_tokens": 64113768.0,
"step": 940
},
{
"epoch": 1.9019211324570273,
"grad_norm": 0.17288359999656677,
"learning_rate": 0.00015293507761406148,
"loss": 0.271,
"mean_token_accuracy": 0.9030660726130009,
"num_tokens": 64178434.0,
"step": 941
},
{
"epoch": 1.903943377148635,
"grad_norm": 0.16324573755264282,
"learning_rate": 0.00015257081318513583,
"loss": 0.274,
"mean_token_accuracy": 0.9019493535161018,
"num_tokens": 64249882.0,
"step": 942
},
{
"epoch": 1.9059656218402425,
"grad_norm": 0.15509222447872162,
"learning_rate": 0.0001522068696952221,
"loss": 0.2354,
"mean_token_accuracy": 0.9143304452300072,
"num_tokens": 64322937.0,
"step": 943
},
{
"epoch": 1.9079878665318504,
"grad_norm": 0.1547105610370636,
"learning_rate": 0.00015184324887656208,
"loss": 0.2553,
"mean_token_accuracy": 0.9079734869301319,
"num_tokens": 64393253.0,
"step": 944
},
{
"epoch": 1.910010111223458,
"grad_norm": 0.15001994371414185,
"learning_rate": 0.00015147995245986203,
"loss": 0.2549,
"mean_token_accuracy": 0.9065254330635071,
"num_tokens": 64470294.0,
"step": 945
},
{
"epoch": 1.9120323559150658,
"grad_norm": 0.17263031005859375,
"learning_rate": 0.00015111698217428385,
"loss": 0.2766,
"mean_token_accuracy": 0.9019508697092533,
"num_tokens": 64541359.0,
"step": 946
},
{
"epoch": 1.9140546006066734,
"grad_norm": 0.19937334954738617,
"learning_rate": 0.0001507543397474375,
"loss": 0.2893,
"mean_token_accuracy": 0.8960909508168697,
"num_tokens": 64601687.0,
"step": 947
},
{
"epoch": 1.916076845298281,
"grad_norm": 0.20299410820007324,
"learning_rate": 0.00015039202690537233,
"loss": 0.2875,
"mean_token_accuracy": 0.8969489298760891,
"num_tokens": 64662730.0,
"step": 948
},
{
"epoch": 1.9180990899898887,
"grad_norm": 0.17673259973526,
"learning_rate": 0.0001500300453725688,
"loss": 0.285,
"mean_token_accuracy": 0.8983747102320194,
"num_tokens": 64726699.0,
"step": 949
},
{
"epoch": 1.9201213346814965,
"grad_norm": 0.14203934371471405,
"learning_rate": 0.00014966839687193074,
"loss": 0.2413,
"mean_token_accuracy": 0.9119373075664043,
"num_tokens": 64804474.0,
"step": 950
},
{
"epoch": 1.922143579373104,
"grad_norm": 0.18115116655826569,
"learning_rate": 0.0001493070831247767,
"loss": 0.2618,
"mean_token_accuracy": 0.9050916060805321,
"num_tokens": 64867023.0,
"step": 951
},
{
"epoch": 1.924165824064712,
"grad_norm": 0.15658792853355408,
"learning_rate": 0.00014894610585083196,
"loss": 0.2539,
"mean_token_accuracy": 0.9065564014017582,
"num_tokens": 64933593.0,
"step": 952
},
{
"epoch": 1.9261880687563195,
"grad_norm": 0.18066135048866272,
"learning_rate": 0.00014858546676822023,
"loss": 0.2731,
"mean_token_accuracy": 0.9004339128732681,
"num_tokens": 64997732.0,
"step": 953
},
{
"epoch": 1.9282103134479271,
"grad_norm": 0.15237212181091309,
"learning_rate": 0.0001482251675934557,
"loss": 0.2476,
"mean_token_accuracy": 0.9087250605225563,
"num_tokens": 65080000.0,
"step": 954
},
{
"epoch": 1.9302325581395348,
"grad_norm": 0.18067006766796112,
"learning_rate": 0.00014786521004143467,
"loss": 0.2712,
"mean_token_accuracy": 0.9025260508060455,
"num_tokens": 65148696.0,
"step": 955
},
{
"epoch": 1.9322548028311426,
"grad_norm": 0.15837518870830536,
"learning_rate": 0.00014750559582542736,
"loss": 0.2606,
"mean_token_accuracy": 0.9080248959362507,
"num_tokens": 65223230.0,
"step": 956
},
{
"epoch": 1.9342770475227502,
"grad_norm": 0.16518649458885193,
"learning_rate": 0.00014714632665706985,
"loss": 0.2539,
"mean_token_accuracy": 0.9098630361258984,
"num_tokens": 65292846.0,
"step": 957
},
{
"epoch": 1.936299292214358,
"grad_norm": 0.18779224157333374,
"learning_rate": 0.000146787404246356,
"loss": 0.282,
"mean_token_accuracy": 0.8994725160300732,
"num_tokens": 65354948.0,
"step": 958
},
{
"epoch": 1.9383215369059656,
"grad_norm": 0.16804009675979614,
"learning_rate": 0.0001464288303016292,
"loss": 0.2521,
"mean_token_accuracy": 0.9077105298638344,
"num_tokens": 65425082.0,
"step": 959
},
{
"epoch": 1.9403437815975733,
"grad_norm": 0.17569729685783386,
"learning_rate": 0.00014607060652957414,
"loss": 0.2914,
"mean_token_accuracy": 0.8924459666013718,
"num_tokens": 65491402.0,
"step": 960
},
{
"epoch": 1.9423660262891809,
"grad_norm": 0.14672434329986572,
"learning_rate": 0.00014571273463520897,
"loss": 0.2628,
"mean_token_accuracy": 0.9076977856457233,
"num_tokens": 65563535.0,
"step": 961
},
{
"epoch": 1.9443882709807887,
"grad_norm": 0.1623447835445404,
"learning_rate": 0.00014535521632187703,
"loss": 0.2759,
"mean_token_accuracy": 0.9029062166810036,
"num_tokens": 65629601.0,
"step": 962
},
{
"epoch": 1.9464105156723963,
"grad_norm": 0.1764685958623886,
"learning_rate": 0.00014499805329123858,
"loss": 0.3043,
"mean_token_accuracy": 0.8929594941437244,
"num_tokens": 65689192.0,
"step": 963
},
{
"epoch": 1.9484327603640041,
"grad_norm": 0.1544012725353241,
"learning_rate": 0.000144641247243263,
"loss": 0.2664,
"mean_token_accuracy": 0.9056011252105236,
"num_tokens": 65761600.0,
"step": 964
},
{
"epoch": 1.9504550050556118,
"grad_norm": 0.17178235948085785,
"learning_rate": 0.00014428479987622055,
"loss": 0.2688,
"mean_token_accuracy": 0.9024265073239803,
"num_tokens": 65824048.0,
"step": 965
},
{
"epoch": 1.9524772497472194,
"grad_norm": 0.17977994680404663,
"learning_rate": 0.00014392871288667415,
"loss": 0.2762,
"mean_token_accuracy": 0.9031669199466705,
"num_tokens": 65889268.0,
"step": 966
},
{
"epoch": 1.954499494438827,
"grad_norm": 0.15329943597316742,
"learning_rate": 0.00014357298796947168,
"loss": 0.2841,
"mean_token_accuracy": 0.8999549075961113,
"num_tokens": 65961165.0,
"step": 967
},
{
"epoch": 1.9565217391304348,
"grad_norm": 0.17066965997219086,
"learning_rate": 0.00014321762681773762,
"loss": 0.2636,
"mean_token_accuracy": 0.9027245566248894,
"num_tokens": 66022951.0,
"step": 968
},
{
"epoch": 1.9585439838220424,
"grad_norm": 0.1542961746454239,
"learning_rate": 0.00014286263112286472,
"loss": 0.2441,
"mean_token_accuracy": 0.9134857915341854,
"num_tokens": 66099844.0,
"step": 969
},
{
"epoch": 1.9605662285136503,
"grad_norm": 0.17265184223651886,
"learning_rate": 0.00014250800257450684,
"loss": 0.2797,
"mean_token_accuracy": 0.9043730795383453,
"num_tokens": 66173153.0,
"step": 970
},
{
"epoch": 1.9625884732052579,
"grad_norm": 0.1839493066072464,
"learning_rate": 0.00014215374286057005,
"loss": 0.2908,
"mean_token_accuracy": 0.8951999023556709,
"num_tokens": 66234689.0,
"step": 971
},
{
"epoch": 1.9646107178968655,
"grad_norm": 0.15913142263889313,
"learning_rate": 0.00014179985366720495,
"loss": 0.2837,
"mean_token_accuracy": 0.9044655375182629,
"num_tokens": 66305941.0,
"step": 972
},
{
"epoch": 1.966632962588473,
"grad_norm": 0.13867108523845673,
"learning_rate": 0.0001414463366787984,
"loss": 0.216,
"mean_token_accuracy": 0.9169092550873756,
"num_tokens": 66381037.0,
"step": 973
},
{
"epoch": 1.968655207280081,
"grad_norm": 0.1802113801240921,
"learning_rate": 0.00014109319357796606,
"loss": 0.3038,
"mean_token_accuracy": 0.893009040504694,
"num_tokens": 66440797.0,
"step": 974
},
{
"epoch": 1.9706774519716885,
"grad_norm": 0.17021583020687103,
"learning_rate": 0.00014074042604554374,
"loss": 0.2733,
"mean_token_accuracy": 0.9027226865291595,
"num_tokens": 66505699.0,
"step": 975
},
{
"epoch": 1.9726996966632964,
"grad_norm": 0.19243739545345306,
"learning_rate": 0.00014038803576057985,
"loss": 0.3087,
"mean_token_accuracy": 0.8891540095210075,
"num_tokens": 66567155.0,
"step": 976
},
{
"epoch": 1.974721941354904,
"grad_norm": 0.15427738428115845,
"learning_rate": 0.00014003602440032693,
"loss": 0.3055,
"mean_token_accuracy": 0.8956649079918861,
"num_tokens": 66644385.0,
"step": 977
},
{
"epoch": 1.9767441860465116,
"grad_norm": 0.15167449414730072,
"learning_rate": 0.00013968439364023442,
"loss": 0.2612,
"mean_token_accuracy": 0.9073714017868042,
"num_tokens": 66714503.0,
"step": 978
},
{
"epoch": 1.9787664307381192,
"grad_norm": 0.1547987014055252,
"learning_rate": 0.00013933314515393995,
"loss": 0.2721,
"mean_token_accuracy": 0.9024667181074619,
"num_tokens": 66779572.0,
"step": 979
},
{
"epoch": 1.980788675429727,
"grad_norm": 0.14774559438228607,
"learning_rate": 0.0001389822806132617,
"loss": 0.2571,
"mean_token_accuracy": 0.907380323857069,
"num_tokens": 66855257.0,
"step": 980
},
{
"epoch": 1.9828109201213346,
"grad_norm": 0.16043910384178162,
"learning_rate": 0.00013863180168819048,
"loss": 0.2523,
"mean_token_accuracy": 0.9105048142373562,
"num_tokens": 66919243.0,
"step": 981
},
{
"epoch": 1.9848331648129425,
"grad_norm": 0.18005625903606415,
"learning_rate": 0.0001382817100468816,
"loss": 0.2914,
"mean_token_accuracy": 0.8974611833691597,
"num_tokens": 66987494.0,
"step": 982
},
{
"epoch": 1.98685540950455,
"grad_norm": 0.1641789674758911,
"learning_rate": 0.00013793200735564716,
"loss": 0.2826,
"mean_token_accuracy": 0.8971075974404812,
"num_tokens": 67053248.0,
"step": 983
},
{
"epoch": 1.9888776541961577,
"grad_norm": 0.15405279397964478,
"learning_rate": 0.00013758269527894778,
"loss": 0.2559,
"mean_token_accuracy": 0.9002925455570221,
"num_tokens": 67124156.0,
"step": 984
},
{
"epoch": 1.9908998988877653,
"grad_norm": 0.15380239486694336,
"learning_rate": 0.00013723377547938522,
"loss": 0.2569,
"mean_token_accuracy": 0.9101624749600887,
"num_tokens": 67196079.0,
"step": 985
},
{
"epoch": 1.9929221435793731,
"grad_norm": 0.16303087770938873,
"learning_rate": 0.00013688524961769395,
"loss": 0.2669,
"mean_token_accuracy": 0.9014462493360043,
"num_tokens": 67264659.0,
"step": 986
},
{
"epoch": 1.9949443882709808,
"grad_norm": 0.17709141969680786,
"learning_rate": 0.00013653711935273326,
"loss": 0.2831,
"mean_token_accuracy": 0.9000302441418171,
"num_tokens": 67326567.0,
"step": 987
},
{
"epoch": 1.9969666329625886,
"grad_norm": 0.16408245265483856,
"learning_rate": 0.00013618938634147996,
"loss": 0.2563,
"mean_token_accuracy": 0.9081169851124287,
"num_tokens": 67393706.0,
"step": 988
},
{
"epoch": 1.9989888776541962,
"grad_norm": 0.1415863037109375,
"learning_rate": 0.00013584205223901976,
"loss": 0.2386,
"mean_token_accuracy": 0.909894797950983,
"num_tokens": 67472677.0,
"step": 989
},
{
"epoch": 2.0,
"grad_norm": 0.19028227031230927,
"learning_rate": 0.00013549511869853973,
"loss": 0.2248,
"mean_token_accuracy": 0.9155159220099449,
"num_tokens": 67511282.0,
"step": 990
}
],
"logging_steps": 1,
"max_steps": 1485,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 8.243396605005267e+18,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}