Transformers
Safetensors
English
PCB
EDA
KiCAD
Hardware-Design
Schematic-Generation
LLM
Circuit-Design
Instructions to use microsoft/SchGen with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use microsoft/SchGen with Transformers:
```python
# Load model directly
from transformers import AutoModel

model = AutoModel.from_pretrained("microsoft/SchGen", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 2.0, | |
| "eval_steps": 500, | |
| "global_step": 990, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.0020222446916076846, | |
| "grad_norm": 3.467946767807007, | |
| "learning_rate": 0.0, | |
| "loss": 3.4468, | |
| "mean_token_accuracy": 0.4403058011084795, | |
| "num_tokens": 69017.0, | |
| "step": 1 | |
| }, | |
| { | |
| "epoch": 0.004044489383215369, | |
| "grad_norm": 3.8678574562072754, | |
| "learning_rate": 8.88888888888889e-06, | |
| "loss": 3.0508, | |
| "mean_token_accuracy": 0.470831586048007, | |
| "num_tokens": 150583.0, | |
| "step": 2 | |
| }, | |
| { | |
| "epoch": 0.006066734074823054, | |
| "grad_norm": 3.7103006839752197, | |
| "learning_rate": 1.777777777777778e-05, | |
| "loss": 3.285, | |
| "mean_token_accuracy": 0.45825996436178684, | |
| "num_tokens": 221144.0, | |
| "step": 3 | |
| }, | |
| { | |
| "epoch": 0.008088978766430738, | |
| "grad_norm": 3.591843605041504, | |
| "learning_rate": 2.6666666666666667e-05, | |
| "loss": 3.4107, | |
| "mean_token_accuracy": 0.44140205159783363, | |
| "num_tokens": 287737.0, | |
| "step": 4 | |
| }, | |
| { | |
| "epoch": 0.010111223458038422, | |
| "grad_norm": 3.940007209777832, | |
| "learning_rate": 3.555555555555556e-05, | |
| "loss": 3.0975, | |
| "mean_token_accuracy": 0.4831150006502867, | |
| "num_tokens": 362591.0, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.012133468149646108, | |
| "grad_norm": 3.8853604793548584, | |
| "learning_rate": 4.4444444444444447e-05, | |
| "loss": 3.1354, | |
| "mean_token_accuracy": 0.48446146585047245, | |
| "num_tokens": 432327.0, | |
| "step": 6 | |
| }, | |
| { | |
| "epoch": 0.014155712841253791, | |
| "grad_norm": 3.9134953022003174, | |
| "learning_rate": 5.333333333333333e-05, | |
| "loss": 3.215, | |
| "mean_token_accuracy": 0.47610872238874435, | |
| "num_tokens": 506671.0, | |
| "step": 7 | |
| }, | |
| { | |
| "epoch": 0.016177957532861477, | |
| "grad_norm": 4.14130973815918, | |
| "learning_rate": 6.222222222222222e-05, | |
| "loss": 3.0424, | |
| "mean_token_accuracy": 0.47477637231349945, | |
| "num_tokens": 577418.0, | |
| "step": 8 | |
| }, | |
| { | |
| "epoch": 0.01820020222446916, | |
| "grad_norm": 4.15872859954834, | |
| "learning_rate": 7.111111111111112e-05, | |
| "loss": 3.0563, | |
| "mean_token_accuracy": 0.49431027099490166, | |
| "num_tokens": 640014.0, | |
| "step": 9 | |
| }, | |
| { | |
| "epoch": 0.020222446916076844, | |
| "grad_norm": 3.9895355701446533, | |
| "learning_rate": 8e-05, | |
| "loss": 2.6808, | |
| "mean_token_accuracy": 0.5322843790054321, | |
| "num_tokens": 704272.0, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.022244691607684528, | |
| "grad_norm": 4.202198028564453, | |
| "learning_rate": 8.888888888888889e-05, | |
| "loss": 2.6339, | |
| "mean_token_accuracy": 0.5354921519756317, | |
| "num_tokens": 771361.0, | |
| "step": 11 | |
| }, | |
| { | |
| "epoch": 0.024266936299292215, | |
| "grad_norm": 4.070754051208496, | |
| "learning_rate": 9.777777777777778e-05, | |
| "loss": 2.2029, | |
| "mean_token_accuracy": 0.5880691334605217, | |
| "num_tokens": 846229.0, | |
| "step": 12 | |
| }, | |
| { | |
| "epoch": 0.0262891809908999, | |
| "grad_norm": 3.6637940406799316, | |
| "learning_rate": 0.00010666666666666667, | |
| "loss": 1.7795, | |
| "mean_token_accuracy": 0.6244243904948235, | |
| "num_tokens": 927862.0, | |
| "step": 13 | |
| }, | |
| { | |
| "epoch": 0.028311425682507583, | |
| "grad_norm": 3.9786410331726074, | |
| "learning_rate": 0.00011555555555555555, | |
| "loss": 1.9043, | |
| "mean_token_accuracy": 0.6317372992634773, | |
| "num_tokens": 988396.0, | |
| "step": 14 | |
| }, | |
| { | |
| "epoch": 0.030333670374115267, | |
| "grad_norm": 3.229816198348999, | |
| "learning_rate": 0.00012444444444444444, | |
| "loss": 1.63, | |
| "mean_token_accuracy": 0.65444141253829, | |
| "num_tokens": 1047670.0, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.032355915065722954, | |
| "grad_norm": 2.8272366523742676, | |
| "learning_rate": 0.00013333333333333334, | |
| "loss": 1.4858, | |
| "mean_token_accuracy": 0.6778117530047894, | |
| "num_tokens": 1113088.0, | |
| "step": 16 | |
| }, | |
| { | |
| "epoch": 0.034378159757330634, | |
| "grad_norm": 2.599519968032837, | |
| "learning_rate": 0.00014222222222222224, | |
| "loss": 1.323, | |
| "mean_token_accuracy": 0.688772302120924, | |
| "num_tokens": 1178886.0, | |
| "step": 17 | |
| }, | |
| { | |
| "epoch": 0.03640040444893832, | |
| "grad_norm": 2.801631212234497, | |
| "learning_rate": 0.0001511111111111111, | |
| "loss": 1.2173, | |
| "mean_token_accuracy": 0.7124413475394249, | |
| "num_tokens": 1248356.0, | |
| "step": 18 | |
| }, | |
| { | |
| "epoch": 0.03842264914054601, | |
| "grad_norm": 3.745363473892212, | |
| "learning_rate": 0.00016, | |
| "loss": 1.0959, | |
| "mean_token_accuracy": 0.7285233177244663, | |
| "num_tokens": 1324299.0, | |
| "step": 19 | |
| }, | |
| { | |
| "epoch": 0.04044489383215369, | |
| "grad_norm": 4.511194229125977, | |
| "learning_rate": 0.00016888888888888889, | |
| "loss": 1.1729, | |
| "mean_token_accuracy": 0.7189365439116955, | |
| "num_tokens": 1392035.0, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.042467138523761376, | |
| "grad_norm": 4.869667053222656, | |
| "learning_rate": 0.00017777777777777779, | |
| "loss": 0.965, | |
| "mean_token_accuracy": 0.7327957898378372, | |
| "num_tokens": 1474776.0, | |
| "step": 21 | |
| }, | |
| { | |
| "epoch": 0.044489383215369056, | |
| "grad_norm": 3.513063430786133, | |
| "learning_rate": 0.0001866666666666667, | |
| "loss": 0.958, | |
| "mean_token_accuracy": 0.7463030181825161, | |
| "num_tokens": 1546445.0, | |
| "step": 22 | |
| }, | |
| { | |
| "epoch": 0.046511627906976744, | |
| "grad_norm": 2.169617176055908, | |
| "learning_rate": 0.00019555555555555556, | |
| "loss": 0.9572, | |
| "mean_token_accuracy": 0.748451080173254, | |
| "num_tokens": 1614331.0, | |
| "step": 23 | |
| }, | |
| { | |
| "epoch": 0.04853387259858443, | |
| "grad_norm": 1.2484831809997559, | |
| "learning_rate": 0.00020444444444444443, | |
| "loss": 0.8834, | |
| "mean_token_accuracy": 0.7673822268843651, | |
| "num_tokens": 1679566.0, | |
| "step": 24 | |
| }, | |
| { | |
| "epoch": 0.05055611729019211, | |
| "grad_norm": 1.0600098371505737, | |
| "learning_rate": 0.00021333333333333333, | |
| "loss": 0.8514, | |
| "mean_token_accuracy": 0.7709708698093891, | |
| "num_tokens": 1741770.0, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.0525783619817998, | |
| "grad_norm": 1.095992922782898, | |
| "learning_rate": 0.00022222222222222223, | |
| "loss": 0.8617, | |
| "mean_token_accuracy": 0.7583519890904427, | |
| "num_tokens": 1806990.0, | |
| "step": 26 | |
| }, | |
| { | |
| "epoch": 0.054600606673407485, | |
| "grad_norm": 1.0006545782089233, | |
| "learning_rate": 0.0002311111111111111, | |
| "loss": 0.7725, | |
| "mean_token_accuracy": 0.7819164581596851, | |
| "num_tokens": 1872686.0, | |
| "step": 27 | |
| }, | |
| { | |
| "epoch": 0.056622851365015166, | |
| "grad_norm": 0.6671711802482605, | |
| "learning_rate": 0.00024, | |
| "loss": 0.6548, | |
| "mean_token_accuracy": 0.8015744872391224, | |
| "num_tokens": 1943614.0, | |
| "step": 28 | |
| }, | |
| { | |
| "epoch": 0.05864509605662285, | |
| "grad_norm": 0.47610151767730713, | |
| "learning_rate": 0.0002488888888888889, | |
| "loss": 0.6524, | |
| "mean_token_accuracy": 0.8063510619103909, | |
| "num_tokens": 2021034.0, | |
| "step": 29 | |
| }, | |
| { | |
| "epoch": 0.06066734074823053, | |
| "grad_norm": 0.5676872730255127, | |
| "learning_rate": 0.00025777777777777783, | |
| "loss": 0.7402, | |
| "mean_token_accuracy": 0.784897617995739, | |
| "num_tokens": 2087348.0, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.06268958543983821, | |
| "grad_norm": 0.6818390488624573, | |
| "learning_rate": 0.0002666666666666667, | |
| "loss": 0.6894, | |
| "mean_token_accuracy": 0.8017890304327011, | |
| "num_tokens": 2154170.0, | |
| "step": 31 | |
| }, | |
| { | |
| "epoch": 0.06471183013144591, | |
| "grad_norm": 0.5972866415977478, | |
| "learning_rate": 0.0002755555555555556, | |
| "loss": 0.612, | |
| "mean_token_accuracy": 0.8184943534433842, | |
| "num_tokens": 2229392.0, | |
| "step": 32 | |
| }, | |
| { | |
| "epoch": 0.06673407482305359, | |
| "grad_norm": 0.4275088608264923, | |
| "learning_rate": 0.0002844444444444445, | |
| "loss": 0.5885, | |
| "mean_token_accuracy": 0.8229578360915184, | |
| "num_tokens": 2290048.0, | |
| "step": 33 | |
| }, | |
| { | |
| "epoch": 0.06875631951466127, | |
| "grad_norm": 0.3523823618888855, | |
| "learning_rate": 0.0002933333333333333, | |
| "loss": 0.5766, | |
| "mean_token_accuracy": 0.82804736495018, | |
| "num_tokens": 2360740.0, | |
| "step": 34 | |
| }, | |
| { | |
| "epoch": 0.07077856420626896, | |
| "grad_norm": 0.45881009101867676, | |
| "learning_rate": 0.0003022222222222222, | |
| "loss": 0.6217, | |
| "mean_token_accuracy": 0.8134612888097763, | |
| "num_tokens": 2419828.0, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.07280080889787664, | |
| "grad_norm": 0.46817246079444885, | |
| "learning_rate": 0.0003111111111111111, | |
| "loss": 0.5311, | |
| "mean_token_accuracy": 0.8388609476387501, | |
| "num_tokens": 2483206.0, | |
| "step": 36 | |
| }, | |
| { | |
| "epoch": 0.07482305358948432, | |
| "grad_norm": 0.36155763268470764, | |
| "learning_rate": 0.00032, | |
| "loss": 0.5268, | |
| "mean_token_accuracy": 0.8369965106248856, | |
| "num_tokens": 2556908.0, | |
| "step": 37 | |
| }, | |
| { | |
| "epoch": 0.07684529828109202, | |
| "grad_norm": 0.36704790592193604, | |
| "learning_rate": 0.00032888888888888887, | |
| "loss": 0.5548, | |
| "mean_token_accuracy": 0.8294766061007977, | |
| "num_tokens": 2626172.0, | |
| "step": 38 | |
| }, | |
| { | |
| "epoch": 0.0788675429726997, | |
| "grad_norm": 0.3038175106048584, | |
| "learning_rate": 0.00033777777777777777, | |
| "loss": 0.5232, | |
| "mean_token_accuracy": 0.8495447933673859, | |
| "num_tokens": 2693541.0, | |
| "step": 39 | |
| }, | |
| { | |
| "epoch": 0.08088978766430738, | |
| "grad_norm": 0.30305811762809753, | |
| "learning_rate": 0.00034666666666666667, | |
| "loss": 0.4882, | |
| "mean_token_accuracy": 0.8428361192345619, | |
| "num_tokens": 2758471.0, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.08291203235591507, | |
| "grad_norm": 0.33424293994903564, | |
| "learning_rate": 0.00035555555555555557, | |
| "loss": 0.508, | |
| "mean_token_accuracy": 0.8437883704900742, | |
| "num_tokens": 2826093.0, | |
| "step": 41 | |
| }, | |
| { | |
| "epoch": 0.08493427704752275, | |
| "grad_norm": 0.3217228651046753, | |
| "learning_rate": 0.00036444444444444447, | |
| "loss": 0.5045, | |
| "mean_token_accuracy": 0.8461326025426388, | |
| "num_tokens": 2893222.0, | |
| "step": 42 | |
| }, | |
| { | |
| "epoch": 0.08695652173913043, | |
| "grad_norm": 0.26564908027648926, | |
| "learning_rate": 0.0003733333333333334, | |
| "loss": 0.5068, | |
| "mean_token_accuracy": 0.8433473333716393, | |
| "num_tokens": 2956663.0, | |
| "step": 43 | |
| }, | |
| { | |
| "epoch": 0.08897876643073811, | |
| "grad_norm": 0.25354474782943726, | |
| "learning_rate": 0.0003822222222222223, | |
| "loss": 0.4609, | |
| "mean_token_accuracy": 0.8593583293259144, | |
| "num_tokens": 3020245.0, | |
| "step": 44 | |
| }, | |
| { | |
| "epoch": 0.0910010111223458, | |
| "grad_norm": 0.31298667192459106, | |
| "learning_rate": 0.0003911111111111111, | |
| "loss": 0.4884, | |
| "mean_token_accuracy": 0.8503717556595802, | |
| "num_tokens": 3091022.0, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.09302325581395349, | |
| "grad_norm": 0.23926222324371338, | |
| "learning_rate": 0.0004, | |
| "loss": 0.4635, | |
| "mean_token_accuracy": 0.8578044883906841, | |
| "num_tokens": 3167731.0, | |
| "step": 46 | |
| }, | |
| { | |
| "epoch": 0.09504550050556117, | |
| "grad_norm": 0.23057548701763153, | |
| "learning_rate": 0.00039999957163192333, | |
| "loss": 0.4464, | |
| "mean_token_accuracy": 0.8583495616912842, | |
| "num_tokens": 3230183.0, | |
| "step": 47 | |
| }, | |
| { | |
| "epoch": 0.09706774519716886, | |
| "grad_norm": 0.22786663472652435, | |
| "learning_rate": 0.0003999982865297322, | |
| "loss": 0.4165, | |
| "mean_token_accuracy": 0.8637920096516609, | |
| "num_tokens": 3300798.0, | |
| "step": 48 | |
| }, | |
| { | |
| "epoch": 0.09908998988877654, | |
| "grad_norm": 0.27733081579208374, | |
| "learning_rate": 0.0003999961446995433, | |
| "loss": 0.4348, | |
| "mean_token_accuracy": 0.8584615886211395, | |
| "num_tokens": 3368808.0, | |
| "step": 49 | |
| }, | |
| { | |
| "epoch": 0.10111223458038422, | |
| "grad_norm": 0.2632873058319092, | |
| "learning_rate": 0.00039999314615155084, | |
| "loss": 0.4545, | |
| "mean_token_accuracy": 0.8571835160255432, | |
| "num_tokens": 3436471.0, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.10313447927199192, | |
| "grad_norm": 0.20401886105537415, | |
| "learning_rate": 0.000399989290900027, | |
| "loss": 0.426, | |
| "mean_token_accuracy": 0.8630774058401585, | |
| "num_tokens": 3504251.0, | |
| "step": 51 | |
| }, | |
| { | |
| "epoch": 0.1051567239635996, | |
| "grad_norm": 0.2126135528087616, | |
| "learning_rate": 0.0003999845789633213, | |
| "loss": 0.4209, | |
| "mean_token_accuracy": 0.8644996210932732, | |
| "num_tokens": 3569455.0, | |
| "step": 52 | |
| }, | |
| { | |
| "epoch": 0.10717896865520728, | |
| "grad_norm": 0.20767471194267273, | |
| "learning_rate": 0.00039997901036386093, | |
| "loss": 0.4312, | |
| "mean_token_accuracy": 0.8648513294756413, | |
| "num_tokens": 3633701.0, | |
| "step": 53 | |
| }, | |
| { | |
| "epoch": 0.10920121334681497, | |
| "grad_norm": 0.19368676841259003, | |
| "learning_rate": 0.0003999725851281504, | |
| "loss": 0.4219, | |
| "mean_token_accuracy": 0.8675987049937248, | |
| "num_tokens": 3700579.0, | |
| "step": 54 | |
| }, | |
| { | |
| "epoch": 0.11122345803842265, | |
| "grad_norm": 0.19997400045394897, | |
| "learning_rate": 0.0003999653032867717, | |
| "loss": 0.4305, | |
| "mean_token_accuracy": 0.8599656298756599, | |
| "num_tokens": 3766515.0, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.11324570273003033, | |
| "grad_norm": 0.19456814229488373, | |
| "learning_rate": 0.00039995716487438367, | |
| "loss": 0.4084, | |
| "mean_token_accuracy": 0.8680460080504417, | |
| "num_tokens": 3832179.0, | |
| "step": 56 | |
| }, | |
| { | |
| "epoch": 0.11526794742163801, | |
| "grad_norm": 0.19756172597408295, | |
| "learning_rate": 0.00039994816992972227, | |
| "loss": 0.4199, | |
| "mean_token_accuracy": 0.8612547963857651, | |
| "num_tokens": 3898904.0, | |
| "step": 57 | |
| }, | |
| { | |
| "epoch": 0.1172901921132457, | |
| "grad_norm": 0.1712576448917389, | |
| "learning_rate": 0.0003999383184956003, | |
| "loss": 0.36, | |
| "mean_token_accuracy": 0.879060622304678, | |
| "num_tokens": 3976416.0, | |
| "step": 58 | |
| }, | |
| { | |
| "epoch": 0.11931243680485339, | |
| "grad_norm": 0.20002008974552155, | |
| "learning_rate": 0.00039992761061890717, | |
| "loss": 0.4269, | |
| "mean_token_accuracy": 0.8589905127882957, | |
| "num_tokens": 4036526.0, | |
| "step": 59 | |
| }, | |
| { | |
| "epoch": 0.12133468149646107, | |
| "grad_norm": 0.1924401819705963, | |
| "learning_rate": 0.00039991604635060835, | |
| "loss": 0.4268, | |
| "mean_token_accuracy": 0.8678371347486973, | |
| "num_tokens": 4100376.0, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.12335692618806876, | |
| "grad_norm": 0.17639940977096558, | |
| "learning_rate": 0.00039990362574574586, | |
| "loss": 0.3919, | |
| "mean_token_accuracy": 0.8658471070230007, | |
| "num_tokens": 4165704.0, | |
| "step": 61 | |
| }, | |
| { | |
| "epoch": 0.12537917087967643, | |
| "grad_norm": 0.1817377358675003, | |
| "learning_rate": 0.00039989034886343724, | |
| "loss": 0.3735, | |
| "mean_token_accuracy": 0.8759783655405045, | |
| "num_tokens": 4234412.0, | |
| "step": 62 | |
| }, | |
| { | |
| "epoch": 0.12740141557128412, | |
| "grad_norm": 0.18214447796344757, | |
| "learning_rate": 0.00039987621576687585, | |
| "loss": 0.3454, | |
| "mean_token_accuracy": 0.8825861141085625, | |
| "num_tokens": 4307593.0, | |
| "step": 63 | |
| }, | |
| { | |
| "epoch": 0.12942366026289182, | |
| "grad_norm": 0.18159601092338562, | |
| "learning_rate": 0.0003998612265233302, | |
| "loss": 0.3672, | |
| "mean_token_accuracy": 0.8755885139107704, | |
| "num_tokens": 4376630.0, | |
| "step": 64 | |
| }, | |
| { | |
| "epoch": 0.13144590495449948, | |
| "grad_norm": 0.17050184309482574, | |
| "learning_rate": 0.00039984538120414363, | |
| "loss": 0.3333, | |
| "mean_token_accuracy": 0.8833661302924156, | |
| "num_tokens": 4449580.0, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.13346814964610718, | |
| "grad_norm": 0.20457544922828674, | |
| "learning_rate": 0.0003998286798847344, | |
| "loss": 0.4182, | |
| "mean_token_accuracy": 0.8619738966226578, | |
| "num_tokens": 4518076.0, | |
| "step": 66 | |
| }, | |
| { | |
| "epoch": 0.13549039433771487, | |
| "grad_norm": 0.196366086602211, | |
| "learning_rate": 0.00039981112264459486, | |
| "loss": 0.3386, | |
| "mean_token_accuracy": 0.8908565118908882, | |
| "num_tokens": 4581622.0, | |
| "step": 67 | |
| }, | |
| { | |
| "epoch": 0.13751263902932254, | |
| "grad_norm": 0.18182213604450226, | |
| "learning_rate": 0.00039979270956729115, | |
| "loss": 0.3999, | |
| "mean_token_accuracy": 0.8703116998076439, | |
| "num_tokens": 4646580.0, | |
| "step": 68 | |
| }, | |
| { | |
| "epoch": 0.13953488372093023, | |
| "grad_norm": 0.18271780014038086, | |
| "learning_rate": 0.0003997734407404631, | |
| "loss": 0.3504, | |
| "mean_token_accuracy": 0.8762697987258434, | |
| "num_tokens": 4716771.0, | |
| "step": 69 | |
| }, | |
| { | |
| "epoch": 0.14155712841253792, | |
| "grad_norm": 0.19590984284877777, | |
| "learning_rate": 0.0003997533162558233, | |
| "loss": 0.3753, | |
| "mean_token_accuracy": 0.8757792375981808, | |
| "num_tokens": 4789100.0, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.1435793731041456, | |
| "grad_norm": 0.23697857558727264, | |
| "learning_rate": 0.00039973233620915733, | |
| "loss": 0.4225, | |
| "mean_token_accuracy": 0.8598962388932705, | |
| "num_tokens": 4851640.0, | |
| "step": 71 | |
| }, | |
| { | |
| "epoch": 0.14560161779575329, | |
| "grad_norm": 0.19626037776470184, | |
| "learning_rate": 0.0003997105007003228, | |
| "loss": 0.3572, | |
| "mean_token_accuracy": 0.8849809169769287, | |
| "num_tokens": 4916098.0, | |
| "step": 72 | |
| }, | |
| { | |
| "epoch": 0.14762386248736098, | |
| "grad_norm": 0.20964385569095612, | |
| "learning_rate": 0.00039968780983324893, | |
| "loss": 0.3507, | |
| "mean_token_accuracy": 0.8796872869133949, | |
| "num_tokens": 4979744.0, | |
| "step": 73 | |
| }, | |
| { | |
| "epoch": 0.14964610717896865, | |
| "grad_norm": 0.18054573237895966, | |
| "learning_rate": 0.00039966426371593607, | |
| "loss": 0.3683, | |
| "mean_token_accuracy": 0.8814638741314411, | |
| "num_tokens": 5050714.0, | |
| "step": 74 | |
| }, | |
| { | |
| "epoch": 0.15166835187057634, | |
| "grad_norm": 0.16331350803375244, | |
| "learning_rate": 0.0003996398624604556, | |
| "loss": 0.3406, | |
| "mean_token_accuracy": 0.8873084634542465, | |
| "num_tokens": 5130559.0, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.15369059656218403, | |
| "grad_norm": 0.20746077597141266, | |
| "learning_rate": 0.0003996146061829487, | |
| "loss": 0.3762, | |
| "mean_token_accuracy": 0.8765941001474857, | |
| "num_tokens": 5199691.0, | |
| "step": 76 | |
| }, | |
| { | |
| "epoch": 0.1557128412537917, | |
| "grad_norm": 0.16679136455059052, | |
| "learning_rate": 0.0003995884950036263, | |
| "loss": 0.3691, | |
| "mean_token_accuracy": 0.8776806406676769, | |
| "num_tokens": 5277116.0, | |
| "step": 77 | |
| }, | |
| { | |
| "epoch": 0.1577350859453994, | |
| "grad_norm": 0.2159774750471115, | |
| "learning_rate": 0.00039956152904676835, | |
| "loss": 0.4017, | |
| "mean_token_accuracy": 0.8709179721772671, | |
| "num_tokens": 5343258.0, | |
| "step": 78 | |
| }, | |
| { | |
| "epoch": 0.1597573306370071, | |
| "grad_norm": 0.16525208950042725, | |
| "learning_rate": 0.00039953370844072333, | |
| "loss": 0.372, | |
| "mean_token_accuracy": 0.8759802021086216, | |
| "num_tokens": 5418084.0, | |
| "step": 79 | |
| }, | |
| { | |
| "epoch": 0.16177957532861476, | |
| "grad_norm": 0.18534427881240845, | |
| "learning_rate": 0.00039950503331790744, | |
| "loss": 0.4236, | |
| "mean_token_accuracy": 0.8610594123601913, | |
| "num_tokens": 5483557.0, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.16380182002022245, | |
| "grad_norm": 0.17822565138339996, | |
| "learning_rate": 0.000399475503814804, | |
| "loss": 0.347, | |
| "mean_token_accuracy": 0.883899986743927, | |
| "num_tokens": 5559324.0, | |
| "step": 81 | |
| }, | |
| { | |
| "epoch": 0.16582406471183014, | |
| "grad_norm": 0.16568556427955627, | |
| "learning_rate": 0.00039944512007196307, | |
| "loss": 0.3046, | |
| "mean_token_accuracy": 0.8883480541408062, | |
| "num_tokens": 5646732.0, | |
| "step": 82 | |
| }, | |
| { | |
| "epoch": 0.1678463094034378, | |
| "grad_norm": 0.20850011706352234, | |
| "learning_rate": 0.0003994138822340004, | |
| "loss": 0.3727, | |
| "mean_token_accuracy": 0.8808489926159382, | |
| "num_tokens": 5709555.0, | |
| "step": 83 | |
| }, | |
| { | |
| "epoch": 0.1698685540950455, | |
| "grad_norm": 0.19419965147972107, | |
| "learning_rate": 0.00039938179044959714, | |
| "loss": 0.3667, | |
| "mean_token_accuracy": 0.8805488795042038, | |
| "num_tokens": 5779149.0, | |
| "step": 84 | |
| }, | |
| { | |
| "epoch": 0.1718907987866532, | |
| "grad_norm": 0.21039818227291107, | |
| "learning_rate": 0.0003993488448714986, | |
| "loss": 0.3912, | |
| "mean_token_accuracy": 0.8791179358959198, | |
| "num_tokens": 5850163.0, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.17391304347826086, | |
| "grad_norm": 0.2167867124080658, | |
| "learning_rate": 0.00039931504565651424, | |
| "loss": 0.3571, | |
| "mean_token_accuracy": 0.8792387843132019, | |
| "num_tokens": 5916129.0, | |
| "step": 86 | |
| }, | |
| { | |
| "epoch": 0.17593528816986856, | |
| "grad_norm": 0.2154702991247177, | |
| "learning_rate": 0.0003992803929655162, | |
| "loss": 0.3868, | |
| "mean_token_accuracy": 0.8748185895383358, | |
| "num_tokens": 5979082.0, | |
| "step": 87 | |
| }, | |
| { | |
| "epoch": 0.17795753286147623, | |
| "grad_norm": 0.1713341772556305, | |
| "learning_rate": 0.00039924488696343915, | |
| "loss": 0.338, | |
| "mean_token_accuracy": 0.8834210820496082, | |
| "num_tokens": 6048831.0, | |
| "step": 88 | |
| }, | |
| { | |
| "epoch": 0.17997977755308392, | |
| "grad_norm": 0.20742323994636536, | |
| "learning_rate": 0.00039920852781927886, | |
| "loss": 0.3911, | |
| "mean_token_accuracy": 0.868148323148489, | |
| "num_tokens": 6114503.0, | |
| "step": 89 | |
| }, | |
| { | |
| "epoch": 0.1820020222446916, | |
| "grad_norm": 0.18235628306865692, | |
| "learning_rate": 0.0003991713157060922, | |
| "loss": 0.3169, | |
| "mean_token_accuracy": 0.8923499137163162, | |
| "num_tokens": 6184293.0, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.18402426693629928, | |
| "grad_norm": 0.18693064153194427, | |
| "learning_rate": 0.00039913325080099545, | |
| "loss": 0.3678, | |
| "mean_token_accuracy": 0.8744825124740601, | |
| "num_tokens": 6252712.0, | |
| "step": 91 | |
| }, | |
| { | |
| "epoch": 0.18604651162790697, | |
| "grad_norm": 0.19899111986160278, | |
| "learning_rate": 0.0003990943332851641, | |
| "loss": 0.3497, | |
| "mean_token_accuracy": 0.8849819526076317, | |
| "num_tokens": 6313767.0, | |
| "step": 92 | |
| }, | |
| { | |
| "epoch": 0.18806875631951467, | |
| "grad_norm": 0.19068098068237305, | |
| "learning_rate": 0.0003990545633438318, | |
| "loss": 0.3492, | |
| "mean_token_accuracy": 0.8846092559397221, | |
| "num_tokens": 6382110.0, | |
| "step": 93 | |
| }, | |
| { | |
| "epoch": 0.19009100101112233, | |
| "grad_norm": 0.19140516221523285, | |
| "learning_rate": 0.0003990139411662892, | |
| "loss": 0.3434, | |
| "mean_token_accuracy": 0.8847804144024849, | |
| "num_tokens": 6445880.0, | |
| "step": 94 | |
| }, | |
| { | |
| "epoch": 0.19211324570273003, | |
| "grad_norm": 0.22566284239292145, | |
| "learning_rate": 0.00039897246694588364, | |
| "loss": 0.3726, | |
| "mean_token_accuracy": 0.8737127743661404, | |
| "num_tokens": 6512190.0, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.19413549039433772, | |
| "grad_norm": 0.193269744515419, | |
| "learning_rate": 0.00039893014088001754, | |
| "loss": 0.3689, | |
| "mean_token_accuracy": 0.8768584616482258, | |
| "num_tokens": 6581328.0, | |
| "step": 96 | |
| }, | |
| { | |
| "epoch": 0.1961577350859454, | |
| "grad_norm": 0.19110015034675598, | |
| "learning_rate": 0.00039888696317014807, | |
| "loss": 0.3307, | |
| "mean_token_accuracy": 0.8812081180512905, | |
| "num_tokens": 6653124.0, | |
| "step": 97 | |
| }, | |
| { | |
| "epoch": 0.19817997977755308, | |
| "grad_norm": 0.18114197254180908, | |
| "learning_rate": 0.00039884293402178575, | |
| "loss": 0.3451, | |
| "mean_token_accuracy": 0.8798027820885181, | |
| "num_tokens": 6723465.0, | |
| "step": 98 | |
| }, | |
| { | |
| "epoch": 0.20020222446916078, | |
| "grad_norm": 0.19303397834300995, | |
| "learning_rate": 0.0003987980536444938, | |
| "loss": 0.334, | |
| "mean_token_accuracy": 0.8881032280623913, | |
| "num_tokens": 6801637.0, | |
| "step": 99 | |
| }, | |
| { | |
| "epoch": 0.20222446916076844, | |
| "grad_norm": 0.1839206963777542, | |
| "learning_rate": 0.0003987523222518868, | |
| "loss": 0.3344, | |
| "mean_token_accuracy": 0.8791452720761299, | |
| "num_tokens": 6879826.0, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.20424671385237614, | |
| "grad_norm": 0.1716805100440979, | |
| "learning_rate": 0.0003987057400616299, | |
| "loss": 0.3494, | |
| "mean_token_accuracy": 0.8803286664187908, | |
| "num_tokens": 6958940.0, | |
| "step": 101 | |
| }, | |
| { | |
| "epoch": 0.20626895854398383, | |
| "grad_norm": 0.218710795044899, | |
| "learning_rate": 0.000398658307295438, | |
| "loss": 0.3696, | |
| "mean_token_accuracy": 0.8783976249396801, | |
| "num_tokens": 7019640.0, | |
| "step": 102 | |
| }, | |
| { | |
| "epoch": 0.2082912032355915, | |
| "grad_norm": 0.2176671177148819, | |
| "learning_rate": 0.0003986100241790741, | |
| "loss": 0.3778, | |
| "mean_token_accuracy": 0.8742088116705418, | |
| "num_tokens": 7083893.0, | |
| "step": 103 | |
| }, | |
| { | |
| "epoch": 0.2103134479271992, | |
| "grad_norm": 0.20480629801750183, | |
| "learning_rate": 0.0003985608909423487, | |
| "loss": 0.3644, | |
| "mean_token_accuracy": 0.8779697194695473, | |
| "num_tokens": 7146243.0, | |
| "step": 104 | |
| }, | |
| { | |
| "epoch": 0.2123356926188069, | |
| "grad_norm": 0.21523724496364594, | |
| "learning_rate": 0.0003985109078191187, | |
| "loss": 0.3384, | |
| "mean_token_accuracy": 0.8801298663020134, | |
| "num_tokens": 7211820.0, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.21435793731041455, | |
| "grad_norm": 0.2035398781299591, | |
| "learning_rate": 0.00039846007504728593, | |
| "loss": 0.3553, | |
| "mean_token_accuracy": 0.8752279430627823, | |
| "num_tokens": 7280455.0, | |
| "step": 106 | |
| }, | |
| { | |
| "epoch": 0.21638018200202225, | |
| "grad_norm": 0.1565598100423813, | |
| "learning_rate": 0.00039840839286879636, | |
| "loss": 0.3034, | |
| "mean_token_accuracy": 0.8931353390216827, | |
| "num_tokens": 7357510.0, | |
| "step": 107 | |
| }, | |
| { | |
| "epoch": 0.21840242669362994, | |
| "grad_norm": 0.17082397639751434, | |
| "learning_rate": 0.00039835586152963884, | |
| "loss": 0.3135, | |
| "mean_token_accuracy": 0.883228026330471, | |
| "num_tokens": 7428821.0, | |
| "step": 108 | |
| }, | |
| { | |
| "epoch": 0.2204246713852376, | |
| "grad_norm": 0.18526601791381836, | |
| "learning_rate": 0.0003983024812798439, | |
| "loss": 0.3156, | |
| "mean_token_accuracy": 0.88564358279109, | |
| "num_tokens": 7494951.0, | |
| "step": 109 | |
| }, | |
| { | |
| "epoch": 0.2224469160768453, | |
| "grad_norm": 0.20190876722335815, | |
| "learning_rate": 0.0003982482523734827, | |
| "loss": 0.3393, | |
| "mean_token_accuracy": 0.8834404349327087, | |
| "num_tokens": 7558067.0, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.224469160768453, | |
| "grad_norm": 0.1943565011024475, | |
| "learning_rate": 0.00039819317506866543, | |
| "loss": 0.3582, | |
| "mean_token_accuracy": 0.8790641874074936, | |
| "num_tokens": 7630543.0, | |
| "step": 111 | |
| }, | |
| { | |
| "epoch": 0.22649140546006066, | |
| "grad_norm": 0.2155260592699051, | |
| "learning_rate": 0.00039813724962754066, | |
| "loss": 0.3514, | |
| "mean_token_accuracy": 0.8799824342131615, | |
| "num_tokens": 7693798.0, | |
| "step": 112 | |
| }, | |
| { | |
| "epoch": 0.22851365015166836, | |
| "grad_norm": 0.17986060678958893, | |
| "learning_rate": 0.00039808047631629363, | |
| "loss": 0.3361, | |
| "mean_token_accuracy": 0.8870190940797329, | |
| "num_tokens": 7763267.0, | |
| "step": 113 | |
| }, | |
| { | |
| "epoch": 0.23053589484327602, | |
| "grad_norm": 0.18999366462230682, | |
| "learning_rate": 0.00039802285540514504, | |
| "loss": 0.325, | |
| "mean_token_accuracy": 0.8898543640971184, | |
| "num_tokens": 7834437.0, | |
| "step": 114 | |
| }, | |
| { | |
| "epoch": 0.23255813953488372, | |
| "grad_norm": 0.20567375421524048, | |
| "learning_rate": 0.0003979643871683501, | |
| "loss": 0.3734, | |
| "mean_token_accuracy": 0.8742238134145737, | |
| "num_tokens": 7896274.0, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.2345803842264914, | |
| "grad_norm": 0.18579523265361786, | |
| "learning_rate": 0.000397905071884197, | |
| "loss": 0.3543, | |
| "mean_token_accuracy": 0.8827438056468964, | |
| "num_tokens": 7962304.0, | |
| "step": 116 | |
| }, | |
| { | |
| "epoch": 0.23660262891809908, | |
| "grad_norm": 0.1895459145307541, | |
| "learning_rate": 0.00039784490983500514, | |
| "loss": 0.2899, | |
| "mean_token_accuracy": 0.888210829347372, | |
| "num_tokens": 8037020.0, | |
| "step": 117 | |
| }, | |
| { | |
| "epoch": 0.23862487360970677, | |
| "grad_norm": 0.1934623420238495, | |
| "learning_rate": 0.0003977839013071248, | |
| "loss": 0.3172, | |
| "mean_token_accuracy": 0.8874295391142368, | |
| "num_tokens": 8106669.0, | |
| "step": 118 | |
| }, | |
| { | |
| "epoch": 0.24064711830131447, | |
| "grad_norm": 0.18337437510490417, | |
| "learning_rate": 0.0003977220465909348, | |
| "loss": 0.328, | |
| "mean_token_accuracy": 0.884034089744091, | |
| "num_tokens": 8174813.0, | |
| "step": 119 | |
| }, | |
| { | |
| "epoch": 0.24266936299292213, | |
| "grad_norm": 0.18985910713672638, | |
| "learning_rate": 0.00039765934598084176, | |
| "loss": 0.3396, | |
| "mean_token_accuracy": 0.8789964653551579, | |
| "num_tokens": 8247396.0, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.24469160768452983, | |
| "grad_norm": 0.20584100484848022, | |
| "learning_rate": 0.0003975957997752783, | |
| "loss": 0.3537, | |
| "mean_token_accuracy": 0.8752495422959328, | |
| "num_tokens": 8310521.0, | |
| "step": 121 | |
| }, | |
| { | |
| "epoch": 0.24671385237613752, | |
| "grad_norm": 0.20211565494537354, | |
| "learning_rate": 0.00039753140827670163, | |
| "loss": 0.3607, | |
| "mean_token_accuracy": 0.877599012106657, | |
| "num_tokens": 8374419.0, | |
| "step": 122 | |
| }, | |
| { | |
| "epoch": 0.2487360970677452, | |
| "grad_norm": 0.21102474629878998, | |
| "learning_rate": 0.00039746617179159274, | |
| "loss": 0.3411, | |
| "mean_token_accuracy": 0.8837038949131966, | |
| "num_tokens": 8436270.0, | |
| "step": 123 | |
| }, | |
| { | |
| "epoch": 0.25075834175935285, | |
| "grad_norm": 0.22104637324810028, | |
| "learning_rate": 0.000397400090630454, | |
| "loss": 0.3467, | |
| "mean_token_accuracy": 0.8838667124509811, | |
| "num_tokens": 8496108.0, | |
| "step": 124 | |
| }, | |
| { | |
| "epoch": 0.2527805864509606, | |
| "grad_norm": 0.23561948537826538, | |
| "learning_rate": 0.0003973331651078084, | |
| "loss": 0.3933, | |
| "mean_token_accuracy": 0.864571388810873, | |
| "num_tokens": 8558449.0, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.25480283114256824, | |
| "grad_norm": 0.17975358664989471, | |
| "learning_rate": 0.00039726539554219746, | |
| "loss": 0.3246, | |
| "mean_token_accuracy": 0.8921530395746231, | |
| "num_tokens": 8633747.0, | |
| "step": 126 | |
| }, | |
| { | |
| "epoch": 0.2568250758341759, | |
| "grad_norm": 0.19475312530994415, | |
| "learning_rate": 0.0003971967822561805, | |
| "loss": 0.359, | |
| "mean_token_accuracy": 0.878424908965826, | |
| "num_tokens": 8700730.0, | |
| "step": 127 | |
| }, | |
| { | |
| "epoch": 0.25884732052578363, | |
| "grad_norm": 0.23659245669841766, | |
| "learning_rate": 0.0003971273255763324, | |
| "loss": 0.3606, | |
| "mean_token_accuracy": 0.8830053992569447, | |
| "num_tokens": 8760014.0, | |
| "step": 128 | |
| }, | |
| { | |
| "epoch": 0.2608695652173913, | |
| "grad_norm": 0.19543145596981049, | |
| "learning_rate": 0.0003970570258332422, | |
| "loss": 0.3309, | |
| "mean_token_accuracy": 0.8853320479393005, | |
| "num_tokens": 8824736.0, | |
| "step": 129 | |
| }, | |
| { | |
| "epoch": 0.26289180990899896, | |
| "grad_norm": 0.2464882731437683, | |
| "learning_rate": 0.0003969858833615119, | |
| "loss": 0.3589, | |
| "mean_token_accuracy": 0.8793282993137836, | |
| "num_tokens": 8887323.0, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.2649140546006067, | |
| "grad_norm": 0.16774067282676697, | |
| "learning_rate": 0.0003969138984997542, | |
| "loss": 0.3198, | |
| "mean_token_accuracy": 0.8886825554072857, | |
| "num_tokens": 8965857.0, | |
| "step": 131 | |
| }, | |
| { | |
| "epoch": 0.26693629929221435, | |
| "grad_norm": 0.19399577379226685, | |
| "learning_rate": 0.00039684107159059174, | |
| "loss": 0.3468, | |
| "mean_token_accuracy": 0.8808378390967846, | |
| "num_tokens": 9039028.0, | |
| "step": 132 | |
| }, | |
| { | |
| "epoch": 0.268958543983822, | |
| "grad_norm": 0.1961926966905594, | |
| "learning_rate": 0.00039676740298065467, | |
| "loss": 0.3501, | |
| "mean_token_accuracy": 0.8791337199509144, | |
| "num_tokens": 9108645.0, | |
| "step": 133 | |
| }, | |
| { | |
| "epoch": 0.27098078867542974, | |
| "grad_norm": 0.16180327534675598, | |
| "learning_rate": 0.00039669289302057955, | |
| "loss": 0.3291, | |
| "mean_token_accuracy": 0.889164712280035, | |
| "num_tokens": 9182295.0, | |
| "step": 134 | |
| }, | |
| { | |
| "epoch": 0.2730030333670374, | |
| "grad_norm": 0.18792307376861572, | |
| "learning_rate": 0.00039661754206500723, | |
| "loss": 0.305, | |
| "mean_token_accuracy": 0.890954252332449, | |
| "num_tokens": 9253798.0, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.2750252780586451, | |
| "grad_norm": 0.2211407721042633, | |
| "learning_rate": 0.0003965413504725815, | |
| "loss": 0.3516, | |
| "mean_token_accuracy": 0.8829210363328457, | |
| "num_tokens": 9319632.0, | |
| "step": 136 | |
| }, | |
| { | |
| "epoch": 0.2770475227502528, | |
| "grad_norm": 0.15928597748279572, | |
| "learning_rate": 0.0003964643186059474, | |
| "loss": 0.3209, | |
| "mean_token_accuracy": 0.8902908116579056, | |
| "num_tokens": 9396460.0, | |
| "step": 137 | |
| }, | |
| { | |
| "epoch": 0.27906976744186046, | |
| "grad_norm": 0.25479844212532043, | |
| "learning_rate": 0.00039638644683174937, | |
| "loss": 0.3247, | |
| "mean_token_accuracy": 0.8880501836538315, | |
| "num_tokens": 9460466.0, | |
| "step": 138 | |
| }, | |
| { | |
| "epoch": 0.2810920121334681, | |
| "grad_norm": 0.17745117843151093, | |
| "learning_rate": 0.00039630773552062925, | |
| "loss": 0.3383, | |
| "mean_token_accuracy": 0.8863355927169323, | |
| "num_tokens": 9532155.0, | |
| "step": 139 | |
| }, | |
| { | |
| "epoch": 0.28311425682507585, | |
| "grad_norm": 0.22157195210456848, | |
| "learning_rate": 0.0003962281850472251, | |
| "loss": 0.3499, | |
| "mean_token_accuracy": 0.879049763083458, | |
| "num_tokens": 9590255.0, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.2851365015166835, | |
| "grad_norm": 0.1807304471731186, | |
| "learning_rate": 0.0003961477957901689, | |
| "loss": 0.3065, | |
| "mean_token_accuracy": 0.8949154578149319, | |
| "num_tokens": 9667027.0, | |
| "step": 141 | |
| }, | |
| { | |
| "epoch": 0.2871587462082912, | |
| "grad_norm": 0.23244738578796387, | |
| "learning_rate": 0.00039606656813208504, | |
| "loss": 0.3608, | |
| "mean_token_accuracy": 0.8768214285373688, | |
| "num_tokens": 9723117.0, | |
| "step": 142 | |
| }, | |
| { | |
| "epoch": 0.2891809908998989, | |
| "grad_norm": 0.18404552340507507, | |
| "learning_rate": 0.0003959845024595883, | |
| "loss": 0.2972, | |
| "mean_token_accuracy": 0.8935975506901741, | |
| "num_tokens": 9792714.0, | |
| "step": 143 | |
| }, | |
| { | |
| "epoch": 0.29120323559150657, | |
| "grad_norm": 0.21092693507671356, | |
| "learning_rate": 0.00039590159916328224, | |
| "loss": 0.3552, | |
| "mean_token_accuracy": 0.8813748992979527, | |
| "num_tokens": 9846790.0, | |
| "step": 144 | |
| }, | |
| { | |
| "epoch": 0.29322548028311424, | |
| "grad_norm": 0.18293221294879913, | |
| "learning_rate": 0.00039581785863775705, | |
| "loss": 0.3497, | |
| "mean_token_accuracy": 0.8868285343050957, | |
| "num_tokens": 9920682.0, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.29524772497472196, | |
| "grad_norm": 0.23161938786506653, | |
| "learning_rate": 0.00039573328128158803, | |
| "loss": 0.3671, | |
| "mean_token_accuracy": 0.8772343806922436, | |
| "num_tokens": 9989629.0, | |
| "step": 146 | |
| }, | |
| { | |
| "epoch": 0.2972699696663296, | |
| "grad_norm": 0.19797147810459137, | |
| "learning_rate": 0.0003956478674973333, | |
| "loss": 0.356, | |
| "mean_token_accuracy": 0.8782718777656555, | |
| "num_tokens": 10048794.0, | |
| "step": 147 | |
| }, | |
| { | |
| "epoch": 0.2992922143579373, | |
| "grad_norm": 0.18177340924739838, | |
| "learning_rate": 0.00039556161769153226, | |
| "loss": 0.3122, | |
| "mean_token_accuracy": 0.8886930793523788, | |
| "num_tokens": 10116701.0, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 0.301314459049545, | |
| "grad_norm": 0.24357731640338898, | |
| "learning_rate": 0.0003954745322747034, | |
| "loss": 0.344, | |
| "mean_token_accuracy": 0.8848157115280628, | |
| "num_tokens": 10176439.0, | |
| "step": 149 | |
| }, | |
| { | |
| "epoch": 0.3033367037411527, | |
| "grad_norm": 0.18051762878894806, | |
| "learning_rate": 0.00039538661166134236, | |
| "loss": 0.3134, | |
| "mean_token_accuracy": 0.8913725949823856, | |
| "num_tokens": 10248461.0, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.30535894843276035, | |
| "grad_norm": 0.20022518932819366, | |
| "learning_rate": 0.00039529785626992006, | |
| "loss": 0.3436, | |
| "mean_token_accuracy": 0.8848014548420906, | |
| "num_tokens": 10310254.0, | |
| "step": 151 | |
| }, | |
| { | |
| "epoch": 0.30738119312436807, | |
| "grad_norm": 0.23199647665023804, | |
| "learning_rate": 0.0003952082665228808, | |
| "loss": 0.3233, | |
| "mean_token_accuracy": 0.8871180489659309, | |
| "num_tokens": 10375248.0, | |
| "step": 152 | |
| }, | |
| { | |
| "epoch": 0.30940343781597573, | |
| "grad_norm": 0.18778662383556366, | |
| "learning_rate": 0.00039511784284663976, | |
| "loss": 0.3044, | |
| "mean_token_accuracy": 0.8951373845338821, | |
| "num_tokens": 10442606.0, | |
| "step": 153 | |
| }, | |
| { | |
| "epoch": 0.3114256825075834, | |
| "grad_norm": 0.17572450637817383, | |
| "learning_rate": 0.0003950265856715818, | |
| "loss": 0.3331, | |
| "mean_token_accuracy": 0.8889199234545231, | |
| "num_tokens": 10509923.0, | |
| "step": 154 | |
| }, | |
| { | |
| "epoch": 0.3134479271991911, | |
| "grad_norm": 0.16929855942726135, | |
| "learning_rate": 0.0003949344954320586, | |
| "loss": 0.348, | |
| "mean_token_accuracy": 0.8804797492921352, | |
| "num_tokens": 10579730.0, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.3154701718907988, | |
| "grad_norm": 0.17655323445796967, | |
| "learning_rate": 0.0003948415725663871, | |
| "loss": 0.3293, | |
| "mean_token_accuracy": 0.883028332144022, | |
| "num_tokens": 10648731.0, | |
| "step": 156 | |
| }, | |
| { | |
| "epoch": 0.31749241658240646, | |
| "grad_norm": 0.1909574717283249, | |
| "learning_rate": 0.00039474781751684706, | |
| "loss": 0.3183, | |
| "mean_token_accuracy": 0.8886212892830372, | |
| "num_tokens": 10713689.0, | |
| "step": 157 | |
| }, | |
| { | |
| "epoch": 0.3195146612740142, | |
| "grad_norm": 0.17727530002593994, | |
| "learning_rate": 0.00039465323072967936, | |
| "loss": 0.3237, | |
| "mean_token_accuracy": 0.8898195438086987, | |
| "num_tokens": 10785736.0, | |
| "step": 158 | |
| }, | |
| { | |
| "epoch": 0.32153690596562184, | |
| "grad_norm": 0.18001440167427063, | |
| "learning_rate": 0.00039455781265508355, | |
| "loss": 0.332, | |
| "mean_token_accuracy": 0.8871553801000118, | |
| "num_tokens": 10856647.0, | |
| "step": 159 | |
| }, | |
| { | |
| "epoch": 0.3235591506572295, | |
| "grad_norm": 0.19728383421897888, | |
| "learning_rate": 0.0003944615637472158, | |
| "loss": 0.3621, | |
| "mean_token_accuracy": 0.8775678239762783, | |
| "num_tokens": 10918872.0, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.32558139534883723, | |
| "grad_norm": 0.20538869500160217, | |
| "learning_rate": 0.00039436448446418683, | |
| "loss": 0.3633, | |
| "mean_token_accuracy": 0.8745956718921661, | |
| "num_tokens": 10981209.0, | |
| "step": 161 | |
| }, | |
| { | |
| "epoch": 0.3276036400404449, | |
| "grad_norm": 0.19733993709087372, | |
| "learning_rate": 0.00039426657526805937, | |
| "loss": 0.3201, | |
| "mean_token_accuracy": 0.8928566165268421, | |
| "num_tokens": 11047089.0, | |
| "step": 162 | |
| }, | |
| { | |
| "epoch": 0.32962588473205257, | |
| "grad_norm": 0.15281331539154053, | |
| "learning_rate": 0.0003941678366248468, | |
| "loss": 0.3003, | |
| "mean_token_accuracy": 0.8931796550750732, | |
| "num_tokens": 11122846.0, | |
| "step": 163 | |
| }, | |
| { | |
| "epoch": 0.3316481294236603, | |
| "grad_norm": 0.17711788415908813, | |
| "learning_rate": 0.00039406826900450977, | |
| "loss": 0.3127, | |
| "mean_token_accuracy": 0.892613273113966, | |
| "num_tokens": 11197993.0, | |
| "step": 164 | |
| }, | |
| { | |
| "epoch": 0.33367037411526795, | |
| "grad_norm": 0.2002251148223877, | |
| "learning_rate": 0.00039396787288095497, | |
| "loss": 0.3328, | |
| "mean_token_accuracy": 0.8890563920140266, | |
| "num_tokens": 11267855.0, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.3356926188068756, | |
| "grad_norm": 0.16182006895542145, | |
| "learning_rate": 0.00039386664873203227, | |
| "loss": 0.3251, | |
| "mean_token_accuracy": 0.8839607983827591, | |
| "num_tokens": 11344330.0, | |
| "step": 166 | |
| }, | |
| { | |
| "epoch": 0.33771486349848334, | |
| "grad_norm": 0.16747458279132843, | |
| "learning_rate": 0.00039376459703953284, | |
| "loss": 0.3249, | |
| "mean_token_accuracy": 0.8876189365983009, | |
| "num_tokens": 11418350.0, | |
| "step": 167 | |
| }, | |
| { | |
| "epoch": 0.339737108190091, | |
| "grad_norm": 0.1826547235250473, | |
| "learning_rate": 0.0003936617182891864, | |
| "loss": 0.3291, | |
| "mean_token_accuracy": 0.8888828568160534, | |
| "num_tokens": 11485723.0, | |
| "step": 168 | |
| }, | |
| { | |
| "epoch": 0.3417593528816987, | |
| "grad_norm": 0.18488235771656036, | |
| "learning_rate": 0.0003935580129706593, | |
| "loss": 0.3097, | |
| "mean_token_accuracy": 0.8907660692930222, | |
| "num_tokens": 11551678.0, | |
| "step": 169 | |
| }, | |
| { | |
| "epoch": 0.3437815975733064, | |
| "grad_norm": 0.23008394241333008, | |
| "learning_rate": 0.00039345348157755213, | |
| "loss": 0.3533, | |
| "mean_token_accuracy": 0.8763989768922329, | |
| "num_tokens": 11609063.0, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.34580384226491406, | |
| "grad_norm": 0.2060030996799469, | |
| "learning_rate": 0.0003933481246073973, | |
| "loss": 0.3399, | |
| "mean_token_accuracy": 0.8879686929285526, | |
| "num_tokens": 11673330.0, | |
| "step": 171 | |
| }, | |
| { | |
| "epoch": 0.34782608695652173, | |
| "grad_norm": 0.17570629715919495, | |
| "learning_rate": 0.0003932419425616565, | |
| "loss": 0.3454, | |
| "mean_token_accuracy": 0.8838200494647026, | |
| "num_tokens": 11740475.0, | |
| "step": 172 | |
| }, | |
| { | |
| "epoch": 0.34984833164812945, | |
| "grad_norm": 0.16710588335990906, | |
| "learning_rate": 0.0003931349359457187, | |
| "loss": 0.2969, | |
| "mean_token_accuracy": 0.899805661290884, | |
| "num_tokens": 11806954.0, | |
| "step": 173 | |
| }, | |
| { | |
| "epoch": 0.3518705763397371, | |
| "grad_norm": 0.20197796821594238, | |
| "learning_rate": 0.0003930271052688974, | |
| "loss": 0.3525, | |
| "mean_token_accuracy": 0.8779477626085281, | |
| "num_tokens": 11870286.0, | |
| "step": 174 | |
| }, | |
| { | |
| "epoch": 0.3538928210313448, | |
| "grad_norm": 0.17107857763767242, | |
| "learning_rate": 0.0003929184510444284, | |
| "loss": 0.3266, | |
| "mean_token_accuracy": 0.8888569958508015, | |
| "num_tokens": 11947117.0, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.35591506572295245, | |
| "grad_norm": 0.17827239632606506, | |
| "learning_rate": 0.0003928089737894672, | |
| "loss": 0.3252, | |
| "mean_token_accuracy": 0.8897545039653778, | |
| "num_tokens": 12009582.0, | |
| "step": 176 | |
| }, | |
| { | |
| "epoch": 0.3579373104145602, | |
| "grad_norm": 0.22990773618221283, | |
| "learning_rate": 0.00039269867402508675, | |
| "loss": 0.3549, | |
| "mean_token_accuracy": 0.8815719597041607, | |
| "num_tokens": 12072827.0, | |
| "step": 177 | |
| }, | |
| { | |
| "epoch": 0.35995955510616784, | |
| "grad_norm": 0.19108358025550842, | |
| "learning_rate": 0.00039258755227627475, | |
| "loss": 0.3549, | |
| "mean_token_accuracy": 0.8812212906777859, | |
| "num_tokens": 12141736.0, | |
| "step": 178 | |
| }, | |
| { | |
| "epoch": 0.3619817997977755, | |
| "grad_norm": 0.19387130439281464, | |
| "learning_rate": 0.0003924756090719314, | |
| "loss": 0.3057, | |
| "mean_token_accuracy": 0.8937871865928173, | |
| "num_tokens": 12212850.0, | |
| "step": 179 | |
| }, | |
| { | |
| "epoch": 0.3640040444893832, | |
| "grad_norm": 0.19616757333278656, | |
| "learning_rate": 0.0003923628449448666, | |
| "loss": 0.3337, | |
| "mean_token_accuracy": 0.8879410326480865, | |
| "num_tokens": 12278676.0, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.3660262891809909, | |
| "grad_norm": 0.19950613379478455, | |
| "learning_rate": 0.0003922492604317976, | |
| "loss": 0.333, | |
| "mean_token_accuracy": 0.8837904818356037, | |
| "num_tokens": 12344019.0, | |
| "step": 181 | |
| }, | |
| { | |
| "epoch": 0.36804853387259856, | |
| "grad_norm": 0.18320327997207642, | |
| "learning_rate": 0.0003921348560733464, | |
| "loss": 0.3379, | |
| "mean_token_accuracy": 0.8864001519978046, | |
| "num_tokens": 12414279.0, | |
| "step": 182 | |
| }, | |
| { | |
| "epoch": 0.3700707785642063, | |
| "grad_norm": 0.19148240983486176, | |
| "learning_rate": 0.0003920196324140371, | |
| "loss": 0.3438, | |
| "mean_token_accuracy": 0.8869296424090862, | |
| "num_tokens": 12481557.0, | |
| "step": 183 | |
| }, | |
| { | |
| "epoch": 0.37209302325581395, | |
| "grad_norm": 0.16867059469223022, | |
| "learning_rate": 0.00039190359000229364, | |
| "loss": 0.3347, | |
| "mean_token_accuracy": 0.8817239366471767, | |
| "num_tokens": 12552783.0, | |
| "step": 184 | |
| }, | |
| { | |
| "epoch": 0.3741152679474216, | |
| "grad_norm": 0.20269234478473663, | |
| "learning_rate": 0.0003917867293904365, | |
| "loss": 0.3599, | |
| "mean_token_accuracy": 0.8779093511402607, | |
| "num_tokens": 12611751.0, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.37613751263902934, | |
| "grad_norm": 0.1963576078414917, | |
| "learning_rate": 0.0003916690511346809, | |
| "loss": 0.3219, | |
| "mean_token_accuracy": 0.8882619775831699, | |
| "num_tokens": 12674136.0, | |
| "step": 186 | |
| }, | |
| { | |
| "epoch": 0.378159757330637, | |
| "grad_norm": 0.1874200403690338, | |
| "learning_rate": 0.0003915505557951335, | |
| "loss": 0.2945, | |
| "mean_token_accuracy": 0.8926714062690735, | |
| "num_tokens": 12754627.0, | |
| "step": 187 | |
| }, | |
| { | |
| "epoch": 0.38018200202224467, | |
| "grad_norm": 0.21084272861480713, | |
| "learning_rate": 0.0003914312439357901, | |
| "loss": 0.3492, | |
| "mean_token_accuracy": 0.8815909698605537, | |
| "num_tokens": 12812878.0, | |
| "step": 188 | |
| }, | |
| { | |
| "epoch": 0.3822042467138524, | |
| "grad_norm": 0.21426641941070557, | |
| "learning_rate": 0.00039131111612453293, | |
| "loss": 0.3226, | |
| "mean_token_accuracy": 0.8860650397837162, | |
| "num_tokens": 12876950.0, | |
| "step": 189 | |
| }, | |
| { | |
| "epoch": 0.38422649140546006, | |
| "grad_norm": 0.1843956857919693, | |
| "learning_rate": 0.0003911901729331277, | |
| "loss": 0.3012, | |
| "mean_token_accuracy": 0.8955246210098267, | |
| "num_tokens": 12940008.0, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.3862487360970677, | |
| "grad_norm": 0.16776444017887115, | |
| "learning_rate": 0.00039106841493722103, | |
| "loss": 0.2915, | |
| "mean_token_accuracy": 0.8939312994480133, | |
| "num_tokens": 13011277.0, | |
| "step": 191 | |
| }, | |
| { | |
| "epoch": 0.38827098078867545, | |
| "grad_norm": 0.21435709297657013, | |
| "learning_rate": 0.0003909458427163379, | |
| "loss": 0.3297, | |
| "mean_token_accuracy": 0.8883927799761295, | |
| "num_tokens": 13076795.0, | |
| "step": 192 | |
| }, | |
| { | |
| "epoch": 0.3902932254802831, | |
| "grad_norm": 0.18475346267223358, | |
| "learning_rate": 0.00039082245685387855, | |
| "loss": 0.3322, | |
| "mean_token_accuracy": 0.8888528421521187, | |
| "num_tokens": 13142952.0, | |
| "step": 193 | |
| }, | |
| { | |
| "epoch": 0.3923154701718908, | |
| "grad_norm": 0.19243639707565308, | |
| "learning_rate": 0.00039069825793711587, | |
| "loss": 0.3213, | |
| "mean_token_accuracy": 0.8921789862215519, | |
| "num_tokens": 13211022.0, | |
| "step": 194 | |
| }, | |
| { | |
| "epoch": 0.3943377148634985, | |
| "grad_norm": 0.1858910322189331, | |
| "learning_rate": 0.0003905732465571928, | |
| "loss": 0.3179, | |
| "mean_token_accuracy": 0.8920286670327187, | |
| "num_tokens": 13276701.0, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.39635995955510617, | |
| "grad_norm": 0.20470379292964935, | |
| "learning_rate": 0.0003904474233091191, | |
| "loss": 0.3189, | |
| "mean_token_accuracy": 0.8954358175396919, | |
| "num_tokens": 13344684.0, | |
| "step": 196 | |
| }, | |
| { | |
| "epoch": 0.39838220424671383, | |
| "grad_norm": 0.18819299340248108, | |
| "learning_rate": 0.00039032078879176865, | |
| "loss": 0.3447, | |
| "mean_token_accuracy": 0.8849571086466312, | |
| "num_tokens": 13409885.0, | |
| "step": 197 | |
| }, | |
| { | |
| "epoch": 0.40040444893832156, | |
| "grad_norm": 0.17828333377838135, | |
| "learning_rate": 0.00039019334360787706, | |
| "loss": 0.324, | |
| "mean_token_accuracy": 0.8868827521800995, | |
| "num_tokens": 13473352.0, | |
| "step": 198 | |
| }, | |
| { | |
| "epoch": 0.4024266936299292, | |
| "grad_norm": 0.18609419465065002, | |
| "learning_rate": 0.0003900650883640381, | |
| "loss": 0.285, | |
| "mean_token_accuracy": 0.8940243273973465, | |
| "num_tokens": 13540264.0, | |
| "step": 199 | |
| }, | |
| { | |
| "epoch": 0.4044489383215369, | |
| "grad_norm": 0.18147540092468262, | |
| "learning_rate": 0.0003899360236707012, | |
| "loss": 0.3077, | |
| "mean_token_accuracy": 0.8837394788861275, | |
| "num_tokens": 13610806.0, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.4064711830131446, | |
| "grad_norm": 0.19080513715744019, | |
| "learning_rate": 0.00038980615014216853, | |
| "loss": 0.3241, | |
| "mean_token_accuracy": 0.8904240913689137, | |
| "num_tokens": 13669371.0, | |
| "step": 201 | |
| }, | |
| { | |
| "epoch": 0.4084934277047523, | |
| "grad_norm": 0.16377419233322144, | |
| "learning_rate": 0.00038967546839659215, | |
| "loss": 0.3149, | |
| "mean_token_accuracy": 0.8902618512511253, | |
| "num_tokens": 13745941.0, | |
| "step": 202 | |
| }, | |
| { | |
| "epoch": 0.41051567239635994, | |
| "grad_norm": 0.19631735980510712, | |
| "learning_rate": 0.00038954397905597063, | |
| "loss": 0.3459, | |
| "mean_token_accuracy": 0.8863471113145351, | |
| "num_tokens": 13818760.0, | |
| "step": 203 | |
| }, | |
| { | |
| "epoch": 0.41253791708796766, | |
| "grad_norm": 0.18512631952762604, | |
| "learning_rate": 0.00038941168274614677, | |
| "loss": 0.3168, | |
| "mean_token_accuracy": 0.8905623555183411, | |
| "num_tokens": 13889651.0, | |
| "step": 204 | |
| }, | |
| { | |
| "epoch": 0.41456016177957533, | |
| "grad_norm": 0.21926718950271606, | |
| "learning_rate": 0.00038927858009680394, | |
| "loss": 0.3006, | |
| "mean_token_accuracy": 0.8961901552975178, | |
| "num_tokens": 13949554.0, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.416582406471183, | |
| "grad_norm": 0.17943674325942993, | |
| "learning_rate": 0.0003891446717414635, | |
| "loss": 0.3066, | |
| "mean_token_accuracy": 0.8922952748835087, | |
| "num_tokens": 14021083.0, | |
| "step": 206 | |
| }, | |
| { | |
| "epoch": 0.4186046511627907, | |
| "grad_norm": 0.1913203001022339, | |
| "learning_rate": 0.0003890099583174819, | |
| "loss": 0.3209, | |
| "mean_token_accuracy": 0.8909911513328552, | |
| "num_tokens": 14093782.0, | |
| "step": 207 | |
| }, | |
| { | |
| "epoch": 0.4206268958543984, | |
| "grad_norm": 0.19808340072631836, | |
| "learning_rate": 0.0003888744404660472, | |
| "loss": 0.3338, | |
| "mean_token_accuracy": 0.884627778083086, | |
| "num_tokens": 14159998.0, | |
| "step": 208 | |
| }, | |
| { | |
| "epoch": 0.42264914054600605, | |
| "grad_norm": 0.17383399605751038, | |
| "learning_rate": 0.0003887381188321762, | |
| "loss": 0.3153, | |
| "mean_token_accuracy": 0.8930625729262829, | |
| "num_tokens": 14232551.0, | |
| "step": 209 | |
| }, | |
| { | |
| "epoch": 0.4246713852376138, | |
| "grad_norm": 0.17710869014263153, | |
| "learning_rate": 0.0003886009940647116, | |
| "loss": 0.3296, | |
| "mean_token_accuracy": 0.8819810189306736, | |
| "num_tokens": 14298797.0, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.42669362992922144, | |
| "grad_norm": 0.1701733022928238, | |
| "learning_rate": 0.0003884630668163186, | |
| "loss": 0.3043, | |
| "mean_token_accuracy": 0.8916125111281872, | |
| "num_tokens": 14367264.0, | |
| "step": 211 | |
| }, | |
| { | |
| "epoch": 0.4287158746208291, | |
| "grad_norm": 0.16983942687511444, | |
| "learning_rate": 0.0003883243377434821, | |
| "loss": 0.336, | |
| "mean_token_accuracy": 0.8859187439084053, | |
| "num_tokens": 14434405.0, | |
| "step": 212 | |
| }, | |
| { | |
| "epoch": 0.43073811931243683, | |
| "grad_norm": 0.22955253720283508, | |
| "learning_rate": 0.0003881848075065032, | |
| "loss": 0.331, | |
| "mean_token_accuracy": 0.89054074883461, | |
| "num_tokens": 14504097.0, | |
| "step": 213 | |
| }, | |
| { | |
| "epoch": 0.4327603640040445, | |
| "grad_norm": 0.1674816757440567, | |
| "learning_rate": 0.0003880444767694963, | |
| "loss": 0.3292, | |
| "mean_token_accuracy": 0.8852434195578098, | |
| "num_tokens": 14576012.0, | |
| "step": 214 | |
| }, | |
| { | |
| "epoch": 0.43478260869565216, | |
| "grad_norm": 0.23137012124061584, | |
| "learning_rate": 0.00038790334620038606, | |
| "loss": 0.3293, | |
| "mean_token_accuracy": 0.8874834440648556, | |
| "num_tokens": 14641442.0, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.4368048533872599, | |
| "grad_norm": 0.1810149997472763, | |
| "learning_rate": 0.00038776141647090375, | |
| "loss": 0.3359, | |
| "mean_token_accuracy": 0.8845292665064335, | |
| "num_tokens": 14701016.0, | |
| "step": 216 | |
| }, | |
| { | |
| "epoch": 0.43882709807886755, | |
| "grad_norm": 0.19873689115047455, | |
| "learning_rate": 0.00038761868825658465, | |
| "loss": 0.3275, | |
| "mean_token_accuracy": 0.8850444070994854, | |
| "num_tokens": 14762543.0, | |
| "step": 217 | |
| }, | |
| { | |
| "epoch": 0.4408493427704752, | |
| "grad_norm": 0.16571380198001862, | |
| "learning_rate": 0.00038747516223676447, | |
| "loss": 0.3097, | |
| "mean_token_accuracy": 0.8963964283466339, | |
| "num_tokens": 14837183.0, | |
| "step": 218 | |
| }, | |
| { | |
| "epoch": 0.44287158746208294, | |
| "grad_norm": 0.18150104582309723, | |
| "learning_rate": 0.00038733083909457607, | |
| "loss": 0.3066, | |
| "mean_token_accuracy": 0.891868706792593, | |
| "num_tokens": 14909675.0, | |
| "step": 219 | |
| }, | |
| { | |
| "epoch": 0.4448938321536906, | |
| "grad_norm": 0.2008552849292755, | |
| "learning_rate": 0.00038718571951694636, | |
| "loss": 0.3397, | |
| "mean_token_accuracy": 0.881518941372633, | |
| "num_tokens": 14974075.0, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.44691607684529827, | |
| "grad_norm": 0.20857571065425873, | |
| "learning_rate": 0.00038703980419459323, | |
| "loss": 0.3251, | |
| "mean_token_accuracy": 0.8877891451120377, | |
| "num_tokens": 15044109.0, | |
| "step": 221 | |
| }, | |
| { | |
| "epoch": 0.448938321536906, | |
| "grad_norm": 0.1777462363243103, | |
| "learning_rate": 0.00038689309382202174, | |
| "loss": 0.3017, | |
| "mean_token_accuracy": 0.8944090716540813, | |
| "num_tokens": 15114045.0, | |
| "step": 222 | |
| }, | |
| { | |
| "epoch": 0.45096056622851366, | |
| "grad_norm": 0.16797004640102386, | |
| "learning_rate": 0.0003867455890975213, | |
| "loss": 0.2901, | |
| "mean_token_accuracy": 0.8903030268847942, | |
| "num_tokens": 15184412.0, | |
| "step": 223 | |
| }, | |
| { | |
| "epoch": 0.4529828109201213, | |
| "grad_norm": 0.26226508617401123, | |
| "learning_rate": 0.00038659729072316193, | |
| "loss": 0.356, | |
| "mean_token_accuracy": 0.8832045011222363, | |
| "num_tokens": 15245581.0, | |
| "step": 224 | |
| }, | |
| { | |
| "epoch": 0.455005055611729, | |
| "grad_norm": 0.16607579588890076, | |
| "learning_rate": 0.00038644819940479146, | |
| "loss": 0.3148, | |
| "mean_token_accuracy": 0.8910624943673611, | |
| "num_tokens": 15315013.0, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.4570273003033367, | |
| "grad_norm": 0.15852072834968567, | |
| "learning_rate": 0.00038629831585203163, | |
| "loss": 0.2908, | |
| "mean_token_accuracy": 0.8945996090769768, | |
| "num_tokens": 15398701.0, | |
| "step": 226 | |
| }, | |
| { | |
| "epoch": 0.4590495449949444, | |
| "grad_norm": 0.17531050741672516, | |
| "learning_rate": 0.000386147640778275, | |
| "loss": 0.2748, | |
| "mean_token_accuracy": 0.9027018919587135, | |
| "num_tokens": 15490499.0, | |
| "step": 227 | |
| }, | |
| { | |
| "epoch": 0.46107178968655205, | |
| "grad_norm": 0.16767503321170807, | |
| "learning_rate": 0.00038599617490068134, | |
| "loss": 0.3044, | |
| "mean_token_accuracy": 0.8939338177442551, | |
| "num_tokens": 15556168.0, | |
| "step": 228 | |
| }, | |
| { | |
| "epoch": 0.46309403437815977, | |
| "grad_norm": 0.211036816239357, | |
| "learning_rate": 0.0003858439189401747, | |
| "loss": 0.3207, | |
| "mean_token_accuracy": 0.8899048455059528, | |
| "num_tokens": 15622005.0, | |
| "step": 229 | |
| }, | |
| { | |
| "epoch": 0.46511627906976744, | |
| "grad_norm": 0.16442608833312988, | |
| "learning_rate": 0.0003856908736214393, | |
| "loss": 0.3191, | |
| "mean_token_accuracy": 0.8901388570666313, | |
| "num_tokens": 15693753.0, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.4671385237613751, | |
| "grad_norm": 0.15724638104438782, | |
| "learning_rate": 0.0003855370396729166, | |
| "loss": 0.2836, | |
| "mean_token_accuracy": 0.9016358070075512, | |
| "num_tokens": 15763494.0, | |
| "step": 231 | |
| }, | |
| { | |
| "epoch": 0.4691607684529828, | |
| "grad_norm": 0.18976381421089172, | |
| "learning_rate": 0.0003853824178268017, | |
| "loss": 0.3205, | |
| "mean_token_accuracy": 0.8904677703976631, | |
| "num_tokens": 15833863.0, | |
| "step": 232 | |
| }, | |
| { | |
| "epoch": 0.4711830131445905, | |
| "grad_norm": 0.14951825141906738, | |
| "learning_rate": 0.00038522700881903966, | |
| "loss": 0.2525, | |
| "mean_token_accuracy": 0.8994054794311523, | |
| "num_tokens": 15911573.0, | |
| "step": 233 | |
| }, | |
| { | |
| "epoch": 0.47320525783619816, | |
| "grad_norm": 0.19189335405826569, | |
| "learning_rate": 0.0003850708133893223, | |
| "loss": 0.3223, | |
| "mean_token_accuracy": 0.8889148533344269, | |
| "num_tokens": 15973006.0, | |
| "step": 234 | |
| }, | |
| { | |
| "epoch": 0.4752275025278059, | |
| "grad_norm": 0.15495674312114716, | |
| "learning_rate": 0.0003849138322810845, | |
| "loss": 0.3025, | |
| "mean_token_accuracy": 0.8922797180712223, | |
| "num_tokens": 16044921.0, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.47724974721941354, | |
| "grad_norm": 0.1728491634130478, | |
| "learning_rate": 0.00038475606624150055, | |
| "loss": 0.3094, | |
| "mean_token_accuracy": 0.8931614607572556, | |
| "num_tokens": 16116096.0, | |
| "step": 236 | |
| }, | |
| { | |
| "epoch": 0.4792719919110212, | |
| "grad_norm": 0.1603267937898636, | |
| "learning_rate": 0.0003845975160214808, | |
| "loss": 0.3235, | |
| "mean_token_accuracy": 0.8852398991584778, | |
| "num_tokens": 16184529.0, | |
| "step": 237 | |
| }, | |
| { | |
| "epoch": 0.48129423660262893, | |
| "grad_norm": 0.16991828382015228, | |
| "learning_rate": 0.00038443818237566814, | |
| "loss": 0.2902, | |
| "mean_token_accuracy": 0.892944622784853, | |
| "num_tokens": 16253758.0, | |
| "step": 238 | |
| }, | |
| { | |
| "epoch": 0.4833164812942366, | |
| "grad_norm": 0.17524850368499756, | |
| "learning_rate": 0.0003842780660624343, | |
| "loss": 0.3227, | |
| "mean_token_accuracy": 0.8884528502821922, | |
| "num_tokens": 16320041.0, | |
| "step": 239 | |
| }, | |
| { | |
| "epoch": 0.48533872598584427, | |
| "grad_norm": 0.18329283595085144, | |
| "learning_rate": 0.00038411716784387596, | |
| "loss": 0.313, | |
| "mean_token_accuracy": 0.8975342884659767, | |
| "num_tokens": 16392051.0, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.487360970677452, | |
| "grad_norm": 0.1628371924161911, | |
| "learning_rate": 0.00038395548848581165, | |
| "loss": 0.2817, | |
| "mean_token_accuracy": 0.9011796675622463, | |
| "num_tokens": 16462030.0, | |
| "step": 241 | |
| }, | |
| { | |
| "epoch": 0.48938321536905965, | |
| "grad_norm": 0.18072479963302612, | |
| "learning_rate": 0.0003837930287577778, | |
| "loss": 0.3041, | |
| "mean_token_accuracy": 0.8932337760925293, | |
| "num_tokens": 16532493.0, | |
| "step": 242 | |
| }, | |
| { | |
| "epoch": 0.4914054600606673, | |
| "grad_norm": 0.2059275507926941, | |
| "learning_rate": 0.000383629789433025, | |
| "loss": 0.3281, | |
| "mean_token_accuracy": 0.8870198056101799, | |
| "num_tokens": 16590133.0, | |
| "step": 243 | |
| }, | |
| { | |
| "epoch": 0.49342770475227504, | |
| "grad_norm": 0.19713951647281647, | |
| "learning_rate": 0.00038346577128851465, | |
| "loss": 0.3235, | |
| "mean_token_accuracy": 0.8893256969749928, | |
| "num_tokens": 16655566.0, | |
| "step": 244 | |
| }, | |
| { | |
| "epoch": 0.4954499494438827, | |
| "grad_norm": 0.16556710004806519, | |
| "learning_rate": 0.00038330097510491483, | |
| "loss": 0.3148, | |
| "mean_token_accuracy": 0.8895911388099194, | |
| "num_tokens": 16718728.0, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.4974721941354904, | |
| "grad_norm": 0.1870684027671814, | |
| "learning_rate": 0.000383135401666597, | |
| "loss": 0.329, | |
| "mean_token_accuracy": 0.8862268440425396, | |
| "num_tokens": 16776165.0, | |
| "step": 246 | |
| }, | |
| { | |
| "epoch": 0.4994944388270981, | |
| "grad_norm": 0.18431027233600616, | |
| "learning_rate": 0.00038296905176163197, | |
| "loss": 0.3143, | |
| "mean_token_accuracy": 0.8902600333094597, | |
| "num_tokens": 16835743.0, | |
| "step": 247 | |
| }, | |
| { | |
| "epoch": 0.5015166835187057, | |
| "grad_norm": 0.18272148072719574, | |
| "learning_rate": 0.0003828019261817863, | |
| "loss": 0.3243, | |
| "mean_token_accuracy": 0.8864033743739128, | |
| "num_tokens": 16899775.0, | |
| "step": 248 | |
| }, | |
| { | |
| "epoch": 0.5035389282103134, | |
| "grad_norm": 0.1712082177400589, | |
| "learning_rate": 0.0003826340257225184, | |
| "loss": 0.324, | |
| "mean_token_accuracy": 0.8914847373962402, | |
| "num_tokens": 16972506.0, | |
| "step": 249 | |
| }, | |
| { | |
| "epoch": 0.5055611729019212, | |
| "grad_norm": 0.18500936031341553, | |
| "learning_rate": 0.00038246535118297497, | |
| "loss": 0.3006, | |
| "mean_token_accuracy": 0.8903259225189686, | |
| "num_tokens": 17036215.0, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.5075834175935288, | |
| "grad_norm": 0.19614791870117188, | |
| "learning_rate": 0.00038229590336598694, | |
| "loss": 0.3176, | |
| "mean_token_accuracy": 0.8885915465652943, | |
| "num_tokens": 17099060.0, | |
| "step": 251 | |
| }, | |
| { | |
| "epoch": 0.5096056622851365, | |
| "grad_norm": 0.20587585866451263, | |
| "learning_rate": 0.0003821256830780658, | |
| "loss": 0.3252, | |
| "mean_token_accuracy": 0.8900357261300087, | |
| "num_tokens": 17160737.0, | |
| "step": 252 | |
| }, | |
| { | |
| "epoch": 0.5116279069767442, | |
| "grad_norm": 0.16274958848953247, | |
| "learning_rate": 0.0003819546911293999, | |
| "loss": 0.3065, | |
| "mean_token_accuracy": 0.8940119668841362, | |
| "num_tokens": 17228903.0, | |
| "step": 253 | |
| }, | |
| { | |
| "epoch": 0.5136501516683518, | |
| "grad_norm": 0.16572465002536774, | |
| "learning_rate": 0.0003817829283338501, | |
| "loss": 0.3011, | |
| "mean_token_accuracy": 0.8989259153604507, | |
| "num_tokens": 17309457.0, | |
| "step": 254 | |
| }, | |
| { | |
| "epoch": 0.5156723963599595, | |
| "grad_norm": 0.21092504262924194, | |
| "learning_rate": 0.0003816103955089464, | |
| "loss": 0.3645, | |
| "mean_token_accuracy": 0.8738524205982685, | |
| "num_tokens": 17371710.0, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.5176946410515673, | |
| "grad_norm": 0.1776529848575592, | |
| "learning_rate": 0.0003814370934758839, | |
| "loss": 0.3413, | |
| "mean_token_accuracy": 0.8868374638259411, | |
| "num_tokens": 17445132.0, | |
| "step": 256 | |
| }, | |
| { | |
| "epoch": 0.5197168857431749, | |
| "grad_norm": 0.1718549132347107, | |
| "learning_rate": 0.0003812630230595188, | |
| "loss": 0.3472, | |
| "mean_token_accuracy": 0.8835309036076069, | |
| "num_tokens": 17511865.0, | |
| "step": 257 | |
| }, | |
| { | |
| "epoch": 0.5217391304347826, | |
| "grad_norm": 0.17998023331165314, | |
| "learning_rate": 0.0003810881850883645, | |
| "loss": 0.35, | |
| "mean_token_accuracy": 0.8810900300741196, | |
| "num_tokens": 17579299.0, | |
| "step": 258 | |
| }, | |
| { | |
| "epoch": 0.5237613751263903, | |
| "grad_norm": 0.15693029761314392, | |
| "learning_rate": 0.0003809125803945878, | |
| "loss": 0.2853, | |
| "mean_token_accuracy": 0.8982386291027069, | |
| "num_tokens": 17651748.0, | |
| "step": 259 | |
| }, | |
| { | |
| "epoch": 0.5257836198179979, | |
| "grad_norm": 0.18085962533950806, | |
| "learning_rate": 0.00038073620981400467, | |
| "loss": 0.2962, | |
| "mean_token_accuracy": 0.898784764111042, | |
| "num_tokens": 17716859.0, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.5278058645096056, | |
| "grad_norm": 0.2137887328863144, | |
| "learning_rate": 0.00038055907418607654, | |
| "loss": 0.3485, | |
| "mean_token_accuracy": 0.8793986700475216, | |
| "num_tokens": 17776486.0, | |
| "step": 261 | |
| }, | |
| { | |
| "epoch": 0.5298281092012134, | |
| "grad_norm": 0.169187992811203, | |
| "learning_rate": 0.0003803811743539062, | |
| "loss": 0.3093, | |
| "mean_token_accuracy": 0.8887566514313221, | |
| "num_tokens": 17844621.0, | |
| "step": 262 | |
| }, | |
| { | |
| "epoch": 0.531850353892821, | |
| "grad_norm": 0.1435088813304901, | |
| "learning_rate": 0.0003802025111642338, | |
| "loss": 0.2623, | |
| "mean_token_accuracy": 0.9051036462187767, | |
| "num_tokens": 17930557.0, | |
| "step": 263 | |
| }, | |
| { | |
| "epoch": 0.5338725985844287, | |
| "grad_norm": 0.1761457622051239, | |
| "learning_rate": 0.00038002308546743256, | |
| "loss": 0.3008, | |
| "mean_token_accuracy": 0.8946518003940582, | |
| "num_tokens": 17999603.0, | |
| "step": 264 | |
| }, | |
| { | |
| "epoch": 0.5358948432760364, | |
| "grad_norm": 0.17682158946990967, | |
| "learning_rate": 0.0003798428981175053, | |
| "loss": 0.3043, | |
| "mean_token_accuracy": 0.8925192318856716, | |
| "num_tokens": 18071957.0, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.537917087967644, | |
| "grad_norm": 0.18640998005867004, | |
| "learning_rate": 0.0003796619499720799, | |
| "loss": 0.3145, | |
| "mean_token_accuracy": 0.8919526562094688, | |
| "num_tokens": 18141501.0, | |
| "step": 266 | |
| }, | |
| { | |
| "epoch": 0.5399393326592518, | |
| "grad_norm": 0.1694413125514984, | |
| "learning_rate": 0.0003794802418924054, | |
| "loss": 0.299, | |
| "mean_token_accuracy": 0.8966234587132931, | |
| "num_tokens": 18215962.0, | |
| "step": 267 | |
| }, | |
| { | |
| "epoch": 0.5419615773508595, | |
| "grad_norm": 0.1720503866672516, | |
| "learning_rate": 0.00037929777474334756, | |
| "loss": 0.3269, | |
| "mean_token_accuracy": 0.8884270638227463, | |
| "num_tokens": 18282357.0, | |
| "step": 268 | |
| }, | |
| { | |
| "epoch": 0.5439838220424671, | |
| "grad_norm": 0.19224666059017181, | |
| "learning_rate": 0.0003791145493933855, | |
| "loss": 0.3477, | |
| "mean_token_accuracy": 0.8821601495146751, | |
| "num_tokens": 18347587.0, | |
| "step": 269 | |
| }, | |
| { | |
| "epoch": 0.5460060667340748, | |
| "grad_norm": 0.1664774864912033, | |
| "learning_rate": 0.0003789305667146069, | |
| "loss": 0.3009, | |
| "mean_token_accuracy": 0.8948215469717979, | |
| "num_tokens": 18415368.0, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.5480283114256825, | |
| "grad_norm": 0.18322114646434784, | |
| "learning_rate": 0.0003787458275827039, | |
| "loss": 0.3195, | |
| "mean_token_accuracy": 0.8908861018717289, | |
| "num_tokens": 18482285.0, | |
| "step": 271 | |
| }, | |
| { | |
| "epoch": 0.5500505561172901, | |
| "grad_norm": 0.20983459055423737, | |
| "learning_rate": 0.00037856033287696943, | |
| "loss": 0.2945, | |
| "mean_token_accuracy": 0.8964951671659946, | |
| "num_tokens": 18540330.0, | |
| "step": 272 | |
| }, | |
| { | |
| "epoch": 0.5520728008088979, | |
| "grad_norm": 0.1815643161535263, | |
| "learning_rate": 0.00037837408348029235, | |
| "loss": 0.3159, | |
| "mean_token_accuracy": 0.8929238878190517, | |
| "num_tokens": 18604976.0, | |
| "step": 273 | |
| }, | |
| { | |
| "epoch": 0.5540950455005056, | |
| "grad_norm": 0.2073771208524704, | |
| "learning_rate": 0.00037818708027915376, | |
| "loss": 0.3244, | |
| "mean_token_accuracy": 0.8876978568732738, | |
| "num_tokens": 18672322.0, | |
| "step": 274 | |
| }, | |
| { | |
| "epoch": 0.5561172901921132, | |
| "grad_norm": 0.19434937834739685, | |
| "learning_rate": 0.00037799932416362266, | |
| "loss": 0.3111, | |
| "mean_token_accuracy": 0.8910202607512474, | |
| "num_tokens": 18735221.0, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.5581395348837209, | |
| "grad_norm": 0.15825523436069489, | |
| "learning_rate": 0.00037781081602735145, | |
| "loss": 0.2758, | |
| "mean_token_accuracy": 0.8941913619637489, | |
| "num_tokens": 18815168.0, | |
| "step": 276 | |
| }, | |
| { | |
| "epoch": 0.5601617795753286, | |
| "grad_norm": 0.16185039281845093, | |
| "learning_rate": 0.00037762155676757196, | |
| "loss": 0.2978, | |
| "mean_token_accuracy": 0.89651133492589, | |
| "num_tokens": 18884062.0, | |
| "step": 277 | |
| }, | |
| { | |
| "epoch": 0.5621840242669363, | |
| "grad_norm": 0.18850262463092804, | |
| "learning_rate": 0.00037743154728509123, | |
| "loss": 0.3109, | |
| "mean_token_accuracy": 0.8866820931434631, | |
| "num_tokens": 18948236.0, | |
| "step": 278 | |
| }, | |
| { | |
| "epoch": 0.564206268958544, | |
| "grad_norm": 0.1736079454421997, | |
| "learning_rate": 0.00037724078848428707, | |
| "loss": 0.28, | |
| "mean_token_accuracy": 0.9002561867237091, | |
| "num_tokens": 19017663.0, | |
| "step": 279 | |
| }, | |
| { | |
| "epoch": 0.5662285136501517, | |
| "grad_norm": 0.15573325753211975, | |
| "learning_rate": 0.0003770492812731035, | |
| "loss": 0.3072, | |
| "mean_token_accuracy": 0.891198180615902, | |
| "num_tokens": 19089120.0, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.5682507583417593, | |
| "grad_norm": 0.18526090681552887, | |
| "learning_rate": 0.0003768570265630471, | |
| "loss": 0.3305, | |
| "mean_token_accuracy": 0.8860407620668411, | |
| "num_tokens": 19154650.0, | |
| "step": 281 | |
| }, | |
| { | |
| "epoch": 0.570273003033367, | |
| "grad_norm": 0.1691296249628067, | |
| "learning_rate": 0.00037666402526918195, | |
| "loss": 0.3188, | |
| "mean_token_accuracy": 0.8919213153421879, | |
| "num_tokens": 19224445.0, | |
| "step": 282 | |
| }, | |
| { | |
| "epoch": 0.5722952477249748, | |
| "grad_norm": 0.17496982216835022, | |
| "learning_rate": 0.0003764702783101259, | |
| "loss": 0.3032, | |
| "mean_token_accuracy": 0.8902747184038162, | |
| "num_tokens": 19298006.0, | |
| "step": 283 | |
| }, | |
| { | |
| "epoch": 0.5743174924165824, | |
| "grad_norm": 0.14839443564414978, | |
| "learning_rate": 0.00037627578660804565, | |
| "loss": 0.2734, | |
| "mean_token_accuracy": 0.8967320993542671, | |
| "num_tokens": 19374661.0, | |
| "step": 284 | |
| }, | |
| { | |
| "epoch": 0.5763397371081901, | |
| "grad_norm": 0.1658451408147812, | |
| "learning_rate": 0.0003760805510886527, | |
| "loss": 0.2999, | |
| "mean_token_accuracy": 0.8904417157173157, | |
| "num_tokens": 19450524.0, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.5783619817997978, | |
| "grad_norm": 0.19472143054008484, | |
| "learning_rate": 0.000375884572681199, | |
| "loss": 0.3083, | |
| "mean_token_accuracy": 0.8959350101649761, | |
| "num_tokens": 19516098.0, | |
| "step": 286 | |
| }, | |
| { | |
| "epoch": 0.5803842264914054, | |
| "grad_norm": 0.17645469307899475, | |
| "learning_rate": 0.0003756878523184721, | |
| "loss": 0.3232, | |
| "mean_token_accuracy": 0.8930424600839615, | |
| "num_tokens": 19586030.0, | |
| "step": 287 | |
| }, | |
| { | |
| "epoch": 0.5824064711830131, | |
| "grad_norm": 0.16119012236595154, | |
| "learning_rate": 0.0003754903909367912, | |
| "loss": 0.2305, | |
| "mean_token_accuracy": 0.9079996608197689, | |
| "num_tokens": 19652723.0, | |
| "step": 288 | |
| }, | |
| { | |
| "epoch": 0.5844287158746209, | |
| "grad_norm": 0.1650022268295288, | |
| "learning_rate": 0.00037529218947600254, | |
| "loss": 0.2913, | |
| "mean_token_accuracy": 0.8961706385016441, | |
| "num_tokens": 19721013.0, | |
| "step": 289 | |
| }, | |
| { | |
| "epoch": 0.5864509605662285, | |
| "grad_norm": 0.1751680225133896, | |
| "learning_rate": 0.00037509324887947465, | |
| "loss": 0.2996, | |
| "mean_token_accuracy": 0.8925143517553806, | |
| "num_tokens": 19785808.0, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.5884732052578362, | |
| "grad_norm": 0.14274518191814423, | |
| "learning_rate": 0.0003748935700940942, | |
| "loss": 0.3009, | |
| "mean_token_accuracy": 0.8944595381617546, | |
| "num_tokens": 19864767.0, | |
| "step": 291 | |
| }, | |
| { | |
| "epoch": 0.5904954499494439, | |
| "grad_norm": 0.19173842668533325, | |
| "learning_rate": 0.00037469315407026154, | |
| "loss": 0.3189, | |
| "mean_token_accuracy": 0.8890005201101303, | |
| "num_tokens": 19934782.0, | |
| "step": 292 | |
| }, | |
| { | |
| "epoch": 0.5925176946410515, | |
| "grad_norm": 0.16339226067066193, | |
| "learning_rate": 0.0003744920017618856, | |
| "loss": 0.2984, | |
| "mean_token_accuracy": 0.891924075782299, | |
| "num_tokens": 20004350.0, | |
| "step": 293 | |
| }, | |
| { | |
| "epoch": 0.5945399393326593, | |
| "grad_norm": 0.1845332533121109, | |
| "learning_rate": 0.0003742901141263802, | |
| "loss": 0.3233, | |
| "mean_token_accuracy": 0.8917621746659279, | |
| "num_tokens": 20073462.0, | |
| "step": 294 | |
| }, | |
| { | |
| "epoch": 0.596562184024267, | |
| "grad_norm": 0.18664658069610596, | |
| "learning_rate": 0.00037408749212465895, | |
| "loss": 0.3168, | |
| "mean_token_accuracy": 0.8909800015389919, | |
| "num_tokens": 20135987.0, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.5985844287158746, | |
| "grad_norm": 0.17890143394470215, | |
| "learning_rate": 0.0003738841367211304, | |
| "loss": 0.2679, | |
| "mean_token_accuracy": 0.9026052355766296, | |
| "num_tokens": 20206699.0, | |
| "step": 296 | |
| }, | |
| { | |
| "epoch": 0.6006066734074823, | |
| "grad_norm": 0.18279992043972015, | |
| "learning_rate": 0.0003736800488836944, | |
| "loss": 0.2929, | |
| "mean_token_accuracy": 0.8961853981018066, | |
| "num_tokens": 20272267.0, | |
| "step": 297 | |
| }, | |
| { | |
| "epoch": 0.60262891809909, | |
| "grad_norm": 0.2653316557407379, | |
| "learning_rate": 0.00037347522958373664, | |
| "loss": 0.286, | |
| "mean_token_accuracy": 0.8971174284815788, | |
| "num_tokens": 20343534.0, | |
| "step": 298 | |
| }, | |
| { | |
| "epoch": 0.6046511627906976, | |
| "grad_norm": 0.15699949860572815, | |
| "learning_rate": 0.00037326967979612425, | |
| "loss": 0.2861, | |
| "mean_token_accuracy": 0.9003230258822441, | |
| "num_tokens": 20420939.0, | |
| "step": 299 | |
| }, | |
| { | |
| "epoch": 0.6066734074823054, | |
| "grad_norm": 0.17641445994377136, | |
| "learning_rate": 0.0003730634004992013, | |
| "loss": 0.3051, | |
| "mean_token_accuracy": 0.8907876797020435, | |
| "num_tokens": 20488068.0, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.6086956521739131, | |
| "grad_norm": 0.1636650264263153, | |
| "learning_rate": 0.0003728563926747842, | |
| "loss": 0.2928, | |
| "mean_token_accuracy": 0.8949981555342674, | |
| "num_tokens": 20560510.0, | |
| "step": 301 | |
| }, | |
| { | |
| "epoch": 0.6107178968655207, | |
| "grad_norm": 0.18622446060180664, | |
| "learning_rate": 0.0003726486573081567, | |
| "loss": 0.3156, | |
| "mean_token_accuracy": 0.8932462483644485, | |
| "num_tokens": 20627926.0, | |
| "step": 302 | |
| }, | |
| { | |
| "epoch": 0.6127401415571284, | |
| "grad_norm": 0.18102477490901947, | |
| "learning_rate": 0.00037244019538806546, | |
| "loss": 0.2859, | |
| "mean_token_accuracy": 0.897308062762022, | |
| "num_tokens": 20695635.0, | |
| "step": 303 | |
| }, | |
| { | |
| "epoch": 0.6147623862487361, | |
| "grad_norm": 0.19487911462783813, | |
| "learning_rate": 0.00037223100790671526, | |
| "loss": 0.3232, | |
| "mean_token_accuracy": 0.8873684406280518, | |
| "num_tokens": 20764073.0, | |
| "step": 304 | |
| }, | |
| { | |
| "epoch": 0.6167846309403437, | |
| "grad_norm": 0.16768330335617065, | |
| "learning_rate": 0.0003720210958597642, | |
| "loss": 0.2856, | |
| "mean_token_accuracy": 0.8974824510514736, | |
| "num_tokens": 20834156.0, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.6188068756319515, | |
| "grad_norm": 0.17184442281723022, | |
| "learning_rate": 0.00037181046024631944, | |
| "loss": 0.3167, | |
| "mean_token_accuracy": 0.8905413933098316, | |
| "num_tokens": 20906046.0, | |
| "step": 306 | |
| }, | |
| { | |
| "epoch": 0.6208291203235592, | |
| "grad_norm": 0.17979033291339874, | |
| "learning_rate": 0.0003715991020689316, | |
| "loss": 0.3166, | |
| "mean_token_accuracy": 0.8910835459828377, | |
| "num_tokens": 20969038.0, | |
| "step": 307 | |
| }, | |
| { | |
| "epoch": 0.6228513650151668, | |
| "grad_norm": 0.16872760653495789, | |
| "learning_rate": 0.0003713870223335907, | |
| "loss": 0.3023, | |
| "mean_token_accuracy": 0.8999812118709087, | |
| "num_tokens": 21054878.0, | |
| "step": 308 | |
| }, | |
| { | |
| "epoch": 0.6248736097067745, | |
| "grad_norm": 0.17098355293273926, | |
| "learning_rate": 0.00037117422204972094, | |
| "loss": 0.2918, | |
| "mean_token_accuracy": 0.9006133303046227, | |
| "num_tokens": 21120211.0, | |
| "step": 309 | |
| }, | |
| { | |
| "epoch": 0.6268958543983822, | |
| "grad_norm": 0.19943217933177948, | |
| "learning_rate": 0.00037096070223017634, | |
| "loss": 0.2992, | |
| "mean_token_accuracy": 0.8970108516514301, | |
| "num_tokens": 21193385.0, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.6289180990899899, | |
| "grad_norm": 0.19835074245929718, | |
| "learning_rate": 0.0003707464638912354, | |
| "loss": 0.2987, | |
| "mean_token_accuracy": 0.8971699252724648, | |
| "num_tokens": 21258335.0, | |
| "step": 311 | |
| }, | |
| { | |
| "epoch": 0.6309403437815976, | |
| "grad_norm": 0.1647316962480545, | |
| "learning_rate": 0.0003705315080525967, | |
| "loss": 0.2877, | |
| "mean_token_accuracy": 0.8915503136813641, | |
| "num_tokens": 21328815.0, | |
| "step": 312 | |
| }, | |
| { | |
| "epoch": 0.6329625884732053, | |
| "grad_norm": 0.18789348006248474, | |
| "learning_rate": 0.00037031583573737375, | |
| "loss": 0.2973, | |
| "mean_token_accuracy": 0.8956909030675888, | |
| "num_tokens": 21408498.0, | |
| "step": 313 | |
| }, | |
| { | |
| "epoch": 0.6349848331648129, | |
| "grad_norm": 0.23517835140228271, | |
| "learning_rate": 0.0003700994479720903, | |
| "loss": 0.3022, | |
| "mean_token_accuracy": 0.8944514766335487, | |
| "num_tokens": 21477506.0, | |
| "step": 314 | |
| }, | |
| { | |
| "epoch": 0.6370070778564206, | |
| "grad_norm": 0.1805562973022461, | |
| "learning_rate": 0.00036988234578667526, | |
| "loss": 0.313, | |
| "mean_token_accuracy": 0.892850112169981, | |
| "num_tokens": 21543808.0, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.6390293225480284, | |
| "grad_norm": 0.2823885679244995, | |
| "learning_rate": 0.0003696645302144582, | |
| "loss": 0.3397, | |
| "mean_token_accuracy": 0.8829572051763535, | |
| "num_tokens": 21607431.0, | |
| "step": 316 | |
| }, | |
| { | |
| "epoch": 0.641051567239636, | |
| "grad_norm": 0.19618524610996246, | |
| "learning_rate": 0.00036944600229216375, | |
| "loss": 0.3164, | |
| "mean_token_accuracy": 0.8882573507726192, | |
| "num_tokens": 21675489.0, | |
| "step": 317 | |
| }, | |
| { | |
| "epoch": 0.6430738119312437, | |
| "grad_norm": 0.19782759249210358, | |
| "learning_rate": 0.00036922676305990753, | |
| "loss": 0.3211, | |
| "mean_token_accuracy": 0.8908263929188251, | |
| "num_tokens": 21739400.0, | |
| "step": 318 | |
| }, | |
| { | |
| "epoch": 0.6450960566228514, | |
| "grad_norm": 0.20694133639335632, | |
| "learning_rate": 0.00036900681356119043, | |
| "loss": 0.2927, | |
| "mean_token_accuracy": 0.8931123651564121, | |
| "num_tokens": 21807454.0, | |
| "step": 319 | |
| }, | |
| { | |
| "epoch": 0.647118301314459, | |
| "grad_norm": 0.16246715188026428, | |
| "learning_rate": 0.00036878615484289395, | |
| "loss": 0.3095, | |
| "mean_token_accuracy": 0.8925521671772003, | |
| "num_tokens": 21883534.0, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.6491405460060667, | |
| "grad_norm": 0.1689622849225998, | |
| "learning_rate": 0.0003685647879552755, | |
| "loss": 0.3198, | |
| "mean_token_accuracy": 0.8910107761621475, | |
| "num_tokens": 21954057.0, | |
| "step": 321 | |
| }, | |
| { | |
| "epoch": 0.6511627906976745, | |
| "grad_norm": 0.21298348903656006, | |
| "learning_rate": 0.0003683427139519628, | |
| "loss": 0.3098, | |
| "mean_token_accuracy": 0.8946363367140293, | |
| "num_tokens": 22024559.0, | |
| "step": 322 | |
| }, | |
| { | |
| "epoch": 0.6531850353892821, | |
| "grad_norm": 0.20307037234306335, | |
| "learning_rate": 0.00036811993388994945, | |
| "loss": 0.3042, | |
| "mean_token_accuracy": 0.8996872641146183, | |
| "num_tokens": 22083005.0, | |
| "step": 323 | |
| }, | |
| { | |
| "epoch": 0.6552072800808898, | |
| "grad_norm": 0.19622348248958588, | |
| "learning_rate": 0.00036789644882958953, | |
| "loss": 0.3106, | |
| "mean_token_accuracy": 0.8917652256786823, | |
| "num_tokens": 22153882.0, | |
| "step": 324 | |
| }, | |
| { | |
| "epoch": 0.6572295247724975, | |
| "grad_norm": 0.2048502266407013, | |
| "learning_rate": 0.00036767225983459247, | |
| "loss": 0.3072, | |
| "mean_token_accuracy": 0.892122782766819, | |
| "num_tokens": 22223638.0, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.6592517694641051, | |
| "grad_norm": 0.17371125519275665, | |
| "learning_rate": 0.00036744736797201855, | |
| "loss": 0.2818, | |
| "mean_token_accuracy": 0.9024628438055515, | |
| "num_tokens": 22287424.0, | |
| "step": 326 | |
| }, | |
| { | |
| "epoch": 0.6612740141557129, | |
| "grad_norm": 0.1815844625234604, | |
| "learning_rate": 0.0003672217743122732, | |
| "loss": 0.3064, | |
| "mean_token_accuracy": 0.8945932053029537, | |
| "num_tokens": 22349842.0, | |
| "step": 327 | |
| }, | |
| { | |
| "epoch": 0.6632962588473206, | |
| "grad_norm": 0.16366587579250336, | |
| "learning_rate": 0.00036699547992910227, | |
| "loss": 0.2836, | |
| "mean_token_accuracy": 0.8982814475893974, | |
| "num_tokens": 22436553.0, | |
| "step": 328 | |
| }, | |
| { | |
| "epoch": 0.6653185035389282, | |
| "grad_norm": 0.1992887556552887, | |
| "learning_rate": 0.00036676848589958663, | |
| "loss": 0.325, | |
| "mean_token_accuracy": 0.8879561647772789, | |
| "num_tokens": 22493823.0, | |
| "step": 329 | |
| }, | |
| { | |
| "epoch": 0.6673407482305359, | |
| "grad_norm": 0.17708779871463776, | |
| "learning_rate": 0.0003665407933041375, | |
| "loss": 0.3325, | |
| "mean_token_accuracy": 0.8939349353313446, | |
| "num_tokens": 22563840.0, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.6693629929221436, | |
| "grad_norm": 0.2144147753715515, | |
| "learning_rate": 0.00036631240322649076, | |
| "loss": 0.3335, | |
| "mean_token_accuracy": 0.8810755871236324, | |
| "num_tokens": 22624256.0, | |
| "step": 331 | |
| }, | |
| { | |
| "epoch": 0.6713852376137512, | |
| "grad_norm": 0.16541875898838043, | |
| "learning_rate": 0.0003660833167537022, | |
| "loss": 0.3275, | |
| "mean_token_accuracy": 0.8926926329731941, | |
| "num_tokens": 22694170.0, | |
| "step": 332 | |
| }, | |
| { | |
| "epoch": 0.673407482305359, | |
| "grad_norm": 0.1698412150144577, | |
| "learning_rate": 0.00036585353497614224, | |
| "loss": 0.3066, | |
| "mean_token_accuracy": 0.8967249467968941, | |
| "num_tokens": 22768633.0, | |
| "step": 333 | |
| }, | |
| { | |
| "epoch": 0.6754297269969667, | |
| "grad_norm": 0.1821826696395874, | |
| "learning_rate": 0.00036562305898749054, | |
| "loss": 0.3208, | |
| "mean_token_accuracy": 0.886600024998188, | |
| "num_tokens": 22837600.0, | |
| "step": 334 | |
| }, | |
| { | |
| "epoch": 0.6774519716885743, | |
| "grad_norm": 0.1860353797674179, | |
| "learning_rate": 0.00036539188988473113, | |
| "loss": 0.3098, | |
| "mean_token_accuracy": 0.8903123624622822, | |
| "num_tokens": 22896567.0, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.679474216380182, | |
| "grad_norm": 0.14535972476005554, | |
| "learning_rate": 0.0003651600287681469, | |
| "loss": 0.2686, | |
| "mean_token_accuracy": 0.9052710346877575, | |
| "num_tokens": 22973060.0, | |
| "step": 336 | |
| }, | |
| { | |
| "epoch": 0.6814964610717897, | |
| "grad_norm": 0.19355034828186035, | |
| "learning_rate": 0.0003649274767413145, | |
| "loss": 0.2877, | |
| "mean_token_accuracy": 0.8924892544746399, | |
| "num_tokens": 23043913.0, | |
| "step": 337 | |
| }, | |
| { | |
| "epoch": 0.6835187057633973, | |
| "grad_norm": 0.185837984085083, | |
| "learning_rate": 0.00036469423491109913, | |
| "loss": 0.2982, | |
| "mean_token_accuracy": 0.8957228772342205, | |
| "num_tokens": 23114457.0, | |
| "step": 338 | |
| }, | |
| { | |
| "epoch": 0.6855409504550051, | |
| "grad_norm": 0.18406859040260315, | |
| "learning_rate": 0.0003644603043876492, | |
| "loss": 0.3066, | |
| "mean_token_accuracy": 0.8950929716229439, | |
| "num_tokens": 23181146.0, | |
| "step": 339 | |
| }, | |
| { | |
| "epoch": 0.6875631951466128, | |
| "grad_norm": 0.19715051352977753, | |
| "learning_rate": 0.00036422568628439095, | |
| "loss": 0.307, | |
| "mean_token_accuracy": 0.8925964459776878, | |
| "num_tokens": 23245269.0, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.6895854398382204, | |
| "grad_norm": 0.18601888418197632, | |
| "learning_rate": 0.0003639903817180233, | |
| "loss": 0.2756, | |
| "mean_token_accuracy": 0.8994149342179298, | |
| "num_tokens": 23321463.0, | |
| "step": 341 | |
| }, | |
| { | |
| "epoch": 0.6916076845298281, | |
| "grad_norm": 0.18005859851837158, | |
| "learning_rate": 0.0003637543918085127, | |
| "loss": 0.2958, | |
| "mean_token_accuracy": 0.8977576531469822, | |
| "num_tokens": 23385849.0, | |
| "step": 342 | |
| }, | |
| { | |
| "epoch": 0.6936299292214358, | |
| "grad_norm": 0.19051752984523773, | |
| "learning_rate": 0.00036351771767908727, | |
| "loss": 0.3074, | |
| "mean_token_accuracy": 0.8928764685988426, | |
| "num_tokens": 23456847.0, | |
| "step": 343 | |
| }, | |
| { | |
| "epoch": 0.6956521739130435, | |
| "grad_norm": 0.20482131838798523, | |
| "learning_rate": 0.0003632803604562319, | |
| "loss": 0.3029, | |
| "mean_token_accuracy": 0.8909181989729404, | |
| "num_tokens": 23526730.0, | |
| "step": 344 | |
| }, | |
| { | |
| "epoch": 0.6976744186046512, | |
| "grad_norm": 0.17358487844467163, | |
| "learning_rate": 0.00036304232126968295, | |
| "loss": 0.2844, | |
| "mean_token_accuracy": 0.898325003683567, | |
| "num_tokens": 23588681.0, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.6996966632962589, | |
| "grad_norm": 0.1700018048286438, | |
| "learning_rate": 0.00036280360125242234, | |
| "loss": 0.2733, | |
| "mean_token_accuracy": 0.9010062254965305, | |
| "num_tokens": 23664445.0, | |
| "step": 346 | |
| }, | |
| { | |
| "epoch": 0.7017189079878665, | |
| "grad_norm": 0.193056121468544, | |
| "learning_rate": 0.0003625642015406727, | |
| "loss": 0.3102, | |
| "mean_token_accuracy": 0.8916714228689671, | |
| "num_tokens": 23738941.0, | |
| "step": 347 | |
| }, | |
| { | |
| "epoch": 0.7037411526794742, | |
| "grad_norm": 0.19169779121875763, | |
| "learning_rate": 0.0003623241232738919, | |
| "loss": 0.2957, | |
| "mean_token_accuracy": 0.8949874453246593, | |
| "num_tokens": 23801979.0, | |
| "step": 348 | |
| }, | |
| { | |
| "epoch": 0.7057633973710818, | |
| "grad_norm": 0.16655734181404114, | |
| "learning_rate": 0.00036208336759476704, | |
| "loss": 0.2937, | |
| "mean_token_accuracy": 0.896770391613245, | |
| "num_tokens": 23868193.0, | |
| "step": 349 | |
| }, | |
| { | |
| "epoch": 0.7077856420626896, | |
| "grad_norm": 0.15496356785297394, | |
| "learning_rate": 0.0003618419356492099, | |
| "loss": 0.2871, | |
| "mean_token_accuracy": 0.9015951566398144, | |
| "num_tokens": 23947204.0, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.7098078867542973, | |
| "grad_norm": 0.160264790058136, | |
| "learning_rate": 0.00036159982858635105, | |
| "loss": 0.2825, | |
| "mean_token_accuracy": 0.9006201699376106, | |
| "num_tokens": 24021149.0, | |
| "step": 351 | |
| }, | |
| { | |
| "epoch": 0.7118301314459049, | |
| "grad_norm": 0.16146975755691528, | |
| "learning_rate": 0.00036135704755853407, | |
| "loss": 0.2757, | |
| "mean_token_accuracy": 0.9038827978074551, | |
| "num_tokens": 24092549.0, | |
| "step": 352 | |
| }, | |
| { | |
| "epoch": 0.7138523761375126, | |
| "grad_norm": 0.20805270969867706, | |
| "learning_rate": 0.0003611135937213106, | |
| "loss": 0.3267, | |
| "mean_token_accuracy": 0.8861317448318005, | |
| "num_tokens": 24157474.0, | |
| "step": 353 | |
| }, | |
| { | |
| "epoch": 0.7158746208291203, | |
| "grad_norm": 0.16421623528003693, | |
| "learning_rate": 0.0003608694682334345, | |
| "loss": 0.2935, | |
| "mean_token_accuracy": 0.8962382674217224, | |
| "num_tokens": 24230461.0, | |
| "step": 354 | |
| }, | |
| { | |
| "epoch": 0.717896865520728, | |
| "grad_norm": 0.1796526312828064, | |
| "learning_rate": 0.0003606246722568566, | |
| "loss": 0.2841, | |
| "mean_token_accuracy": 0.8999650441110134, | |
| "num_tokens": 24296781.0, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.7199191102123357, | |
| "grad_norm": 0.18790611624717712, | |
| "learning_rate": 0.0003603792069567187, | |
| "loss": 0.3496, | |
| "mean_token_accuracy": 0.8827480934560299, | |
| "num_tokens": 24361770.0, | |
| "step": 356 | |
| }, | |
| { | |
| "epoch": 0.7219413549039434, | |
| "grad_norm": 0.16473916172981262, | |
| "learning_rate": 0.00036013307350134884, | |
| "loss": 0.314, | |
| "mean_token_accuracy": 0.8960560448467731, | |
| "num_tokens": 24432956.0, | |
| "step": 357 | |
| }, | |
| { | |
| "epoch": 0.723963599595551, | |
| "grad_norm": 0.17466352880001068, | |
| "learning_rate": 0.0003598862730622548, | |
| "loss": 0.3113, | |
| "mean_token_accuracy": 0.8914259672164917, | |
| "num_tokens": 24499417.0, | |
| "step": 358 | |
| }, | |
| { | |
| "epoch": 0.7259858442871587, | |
| "grad_norm": 0.17617358267307281, | |
| "learning_rate": 0.0003596388068141191, | |
| "loss": 0.2961, | |
| "mean_token_accuracy": 0.893797617405653, | |
| "num_tokens": 24567238.0, | |
| "step": 359 | |
| }, | |
| { | |
| "epoch": 0.7280080889787665, | |
| "grad_norm": 0.18195107579231262, | |
| "learning_rate": 0.0003593906759347934, | |
| "loss": 0.318, | |
| "mean_token_accuracy": 0.8848773874342442, | |
| "num_tokens": 24634769.0, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.7300303336703741, | |
| "grad_norm": 0.16119951009750366, | |
| "learning_rate": 0.00035914188160529267, | |
| "loss": 0.2863, | |
| "mean_token_accuracy": 0.89824278652668, | |
| "num_tokens": 24701688.0, | |
| "step": 361 | |
| }, | |
| { | |
| "epoch": 0.7320525783619818, | |
| "grad_norm": 0.1530333161354065, | |
| "learning_rate": 0.00035889242500978966, | |
| "loss": 0.2737, | |
| "mean_token_accuracy": 0.901515819132328, | |
| "num_tokens": 24778487.0, | |
| "step": 362 | |
| }, | |
| { | |
| "epoch": 0.7340748230535895, | |
| "grad_norm": 0.172471821308136, | |
| "learning_rate": 0.0003586423073356092, | |
| "loss": 0.3, | |
| "mean_token_accuracy": 0.8986642919480801, | |
| "num_tokens": 24844779.0, | |
| "step": 363 | |
| }, | |
| { | |
| "epoch": 0.7360970677451971, | |
| "grad_norm": 0.1733032464981079, | |
| "learning_rate": 0.00035839152977322275, | |
| "loss": 0.2899, | |
| "mean_token_accuracy": 0.8977354988455772, | |
| "num_tokens": 24909088.0, | |
| "step": 364 | |
| }, | |
| { | |
| "epoch": 0.7381193124368048, | |
| "grad_norm": 0.16756588220596313, | |
| "learning_rate": 0.00035814009351624256, | |
| "loss": 0.2923, | |
| "mean_token_accuracy": 0.897175993770361, | |
| "num_tokens": 24978294.0, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.7401415571284126, | |
| "grad_norm": 0.1823996752500534, | |
| "learning_rate": 0.00035788799976141605, | |
| "loss": 0.3227, | |
| "mean_token_accuracy": 0.8896390423178673, | |
| "num_tokens": 25043171.0, | |
| "step": 366 | |
| }, | |
| { | |
| "epoch": 0.7421638018200202, | |
| "grad_norm": 0.18004441261291504, | |
| "learning_rate": 0.0003576352497086201, | |
| "loss": 0.2954, | |
| "mean_token_accuracy": 0.8963689431548119, | |
| "num_tokens": 25113306.0, | |
| "step": 367 | |
| }, | |
| { | |
| "epoch": 0.7441860465116279, | |
| "grad_norm": 0.19010895490646362, | |
| "learning_rate": 0.0003573818445608552, | |
| "loss": 0.3013, | |
| "mean_token_accuracy": 0.8936556875705719, | |
| "num_tokens": 25178407.0, | |
| "step": 368 | |
| }, | |
| { | |
| "epoch": 0.7462082912032356, | |
| "grad_norm": 0.2009873390197754, | |
| "learning_rate": 0.0003571277855242401, | |
| "loss": 0.3204, | |
| "mean_token_accuracy": 0.8890100382268429, | |
| "num_tokens": 25236571.0, | |
| "step": 369 | |
| }, | |
| { | |
| "epoch": 0.7482305358948432, | |
| "grad_norm": 0.17589393258094788, | |
| "learning_rate": 0.00035687307380800556, | |
| "loss": 0.3046, | |
| "mean_token_accuracy": 0.8946997821331024, | |
| "num_tokens": 25298545.0, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.750252780586451, | |
| "grad_norm": 0.1642550528049469, | |
| "learning_rate": 0.00035661771062448915, | |
| "loss": 0.2808, | |
| "mean_token_accuracy": 0.8977020867168903, | |
| "num_tokens": 25371496.0, | |
| "step": 371 | |
| }, | |
| { | |
| "epoch": 0.7522750252780587, | |
| "grad_norm": 0.178288072347641, | |
| "learning_rate": 0.00035636169718912894, | |
| "loss": 0.3122, | |
| "mean_token_accuracy": 0.8912137039005756, | |
| "num_tokens": 25434070.0, | |
| "step": 372 | |
| }, | |
| { | |
| "epoch": 0.7542972699696663, | |
| "grad_norm": 0.1830630898475647, | |
| "learning_rate": 0.0003561050347204581, | |
| "loss": 0.3156, | |
| "mean_token_accuracy": 0.8928086012601852, | |
| "num_tokens": 25499661.0, | |
| "step": 373 | |
| }, | |
| { | |
| "epoch": 0.756319514661274, | |
| "grad_norm": 0.15954959392547607, | |
| "learning_rate": 0.000355847724440099, | |
| "loss": 0.281, | |
| "mean_token_accuracy": 0.896581944078207, | |
| "num_tokens": 25577844.0, | |
| "step": 374 | |
| }, | |
| { | |
| "epoch": 0.7583417593528817, | |
| "grad_norm": 0.200165256857872, | |
| "learning_rate": 0.00035558976757275716, | |
| "loss": 0.3191, | |
| "mean_token_accuracy": 0.8899872414767742, | |
| "num_tokens": 25638524.0, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.7603640040444893, | |
| "grad_norm": 0.1939467191696167, | |
| "learning_rate": 0.00035533116534621596, | |
| "loss": 0.3107, | |
| "mean_token_accuracy": 0.8947361186146736, | |
| "num_tokens": 25704939.0, | |
| "step": 376 | |
| }, | |
| { | |
| "epoch": 0.7623862487360971, | |
| "grad_norm": 0.16760645806789398, | |
| "learning_rate": 0.0003550719189913302, | |
| "loss": 0.2895, | |
| "mean_token_accuracy": 0.9010074771940708, | |
| "num_tokens": 25773040.0, | |
| "step": 377 | |
| }, | |
| { | |
| "epoch": 0.7644084934277048, | |
| "grad_norm": 0.17111922800540924, | |
| "learning_rate": 0.0003548120297420204, | |
| "loss": 0.2941, | |
| "mean_token_accuracy": 0.8943174667656422, | |
| "num_tokens": 25841353.0, | |
| "step": 378 | |
| }, | |
| { | |
| "epoch": 0.7664307381193124, | |
| "grad_norm": 0.19698713719844818, | |
| "learning_rate": 0.00035455149883526746, | |
| "loss": 0.3089, | |
| "mean_token_accuracy": 0.8988425992429256, | |
| "num_tokens": 25908894.0, | |
| "step": 379 | |
| }, | |
| { | |
| "epoch": 0.7684529828109201, | |
| "grad_norm": 0.19156275689601898, | |
| "learning_rate": 0.00035429032751110596, | |
| "loss": 0.2904, | |
| "mean_token_accuracy": 0.8982725702226162, | |
| "num_tokens": 25976883.0, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.7704752275025278, | |
| "grad_norm": 0.17211389541625977, | |
| "learning_rate": 0.00035402851701261874, | |
| "loss": 0.2999, | |
| "mean_token_accuracy": 0.8920269943773746, | |
| "num_tokens": 26045757.0, | |
| "step": 381 | |
| }, | |
| { | |
| "epoch": 0.7724974721941354, | |
| "grad_norm": 0.17306530475616455, | |
| "learning_rate": 0.000353766068585931, | |
| "loss": 0.301, | |
| "mean_token_accuracy": 0.8918648697435856, | |
| "num_tokens": 26118719.0, | |
| "step": 382 | |
| }, | |
| { | |
| "epoch": 0.7745197168857432, | |
| "grad_norm": 0.17627696692943573, | |
| "learning_rate": 0.00035350298348020407, | |
| "loss": 0.2979, | |
| "mean_token_accuracy": 0.8935811407864094, | |
| "num_tokens": 26183890.0, | |
| "step": 383 | |
| }, | |
| { | |
| "epoch": 0.7765419615773509, | |
| "grad_norm": 0.16283521056175232, | |
| "learning_rate": 0.0003532392629476298, | |
| "loss": 0.2819, | |
| "mean_token_accuracy": 0.895574290305376, | |
| "num_tokens": 26254712.0, | |
| "step": 384 | |
| }, | |
| { | |
| "epoch": 0.7785642062689585, | |
| "grad_norm": 0.18045000731945038, | |
| "learning_rate": 0.00035297490824342436, | |
| "loss": 0.307, | |
| "mean_token_accuracy": 0.8899386301636696, | |
| "num_tokens": 26317196.0, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.7805864509605662, | |
| "grad_norm": 0.15806086361408234, | |
| "learning_rate": 0.0003527099206258223, | |
| "loss": 0.289, | |
| "mean_token_accuracy": 0.8989690914750099, | |
| "num_tokens": 26385704.0, | |
| "step": 386 | |
| }, | |
| { | |
| "epoch": 0.782608695652174, | |
| "grad_norm": 0.17871202528476715, | |
| "learning_rate": 0.0003524443013560709, | |
| "loss": 0.2968, | |
| "mean_token_accuracy": 0.8961369805037975, | |
| "num_tokens": 26453865.0, | |
| "step": 387 | |
| }, | |
| { | |
| "epoch": 0.7846309403437816, | |
| "grad_norm": 0.17596516013145447, | |
| "learning_rate": 0.0003521780516984234, | |
| "loss": 0.2849, | |
| "mean_token_accuracy": 0.8956369571387768, | |
| "num_tokens": 26519337.0, | |
| "step": 388 | |
| }, | |
| { | |
| "epoch": 0.7866531850353893, | |
| "grad_norm": 0.1952444314956665, | |
| "learning_rate": 0.00035191117292013394, | |
| "loss": 0.3073, | |
| "mean_token_accuracy": 0.8928476311266422, | |
| "num_tokens": 26590979.0, | |
| "step": 389 | |
| }, | |
| { | |
| "epoch": 0.788675429726997, | |
| "grad_norm": 0.16196580231189728, | |
| "learning_rate": 0.00035164366629145073, | |
| "loss": 0.2858, | |
| "mean_token_accuracy": 0.8969371728599072, | |
| "num_tokens": 26662280.0, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.7906976744186046, | |
| "grad_norm": 0.18022611737251282, | |
| "learning_rate": 0.0003513755330856104, | |
| "loss": 0.2996, | |
| "mean_token_accuracy": 0.8949360400438309, | |
| "num_tokens": 26735704.0, | |
| "step": 391 | |
| }, | |
| { | |
| "epoch": 0.7927199191102123, | |
| "grad_norm": 0.1670723408460617, | |
| "learning_rate": 0.000351106774578832, | |
| "loss": 0.3023, | |
| "mean_token_accuracy": 0.8980297967791557, | |
| "num_tokens": 26806733.0, | |
| "step": 392 | |
| }, | |
| { | |
| "epoch": 0.7947421638018201, | |
| "grad_norm": 0.16242116689682007, | |
| "learning_rate": 0.0003508373920503108, | |
| "loss": 0.2683, | |
| "mean_token_accuracy": 0.8985998295247555, | |
| "num_tokens": 26873233.0, | |
| "step": 393 | |
| }, | |
| { | |
| "epoch": 0.7967644084934277, | |
| "grad_norm": 0.15898491442203522, | |
| "learning_rate": 0.00035056738678221176, | |
| "loss": 0.2938, | |
| "mean_token_accuracy": 0.8989557921886444, | |
| "num_tokens": 26949546.0, | |
| "step": 394 | |
| }, | |
| { | |
| "epoch": 0.7987866531850354, | |
| "grad_norm": 0.1636972278356552, | |
| "learning_rate": 0.00035029676005966445, | |
| "loss": 0.2884, | |
| "mean_token_accuracy": 0.8981003984808922, | |
| "num_tokens": 27014513.0, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.8008088978766431, | |
| "grad_norm": 0.1949148327112198, | |
| "learning_rate": 0.000350025513170756, | |
| "loss": 0.3172, | |
| "mean_token_accuracy": 0.8922760672867298, | |
| "num_tokens": 27076549.0, | |
| "step": 396 | |
| }, | |
| { | |
| "epoch": 0.8028311425682507, | |
| "grad_norm": 0.18752135336399078, | |
| "learning_rate": 0.0003497536474065254, | |
| "loss": 0.3197, | |
| "mean_token_accuracy": 0.8879435993731022, | |
| "num_tokens": 27143261.0, | |
| "step": 397 | |
| }, | |
| { | |
| "epoch": 0.8048533872598584, | |
| "grad_norm": 0.18382735550403595, | |
| "learning_rate": 0.0003494811640609572, | |
| "loss": 0.3165, | |
| "mean_token_accuracy": 0.8949453271925449, | |
| "num_tokens": 27208188.0, | |
| "step": 398 | |
| }, | |
| { | |
| "epoch": 0.8068756319514662, | |
| "grad_norm": 0.1782997101545334, | |
| "learning_rate": 0.0003492080644309756, | |
| "loss": 0.3018, | |
| "mean_token_accuracy": 0.8956249915063381, | |
| "num_tokens": 27279349.0, | |
| "step": 399 | |
| }, | |
| { | |
| "epoch": 0.8088978766430738, | |
| "grad_norm": 0.16625821590423584, | |
| "learning_rate": 0.0003489343498164378, | |
| "loss": 0.2909, | |
| "mean_token_accuracy": 0.8978218026459217, | |
| "num_tokens": 27349491.0, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.8109201213346815, | |
| "grad_norm": 0.2034144103527069, | |
| "learning_rate": 0.0003486600215201284, | |
| "loss": 0.3205, | |
| "mean_token_accuracy": 0.8883098587393761, | |
| "num_tokens": 27425145.0, | |
| "step": 401 | |
| }, | |
| { | |
| "epoch": 0.8129423660262892, | |
| "grad_norm": 0.18235254287719727, | |
| "learning_rate": 0.0003483850808477527, | |
| "loss": 0.3142, | |
| "mean_token_accuracy": 0.8946905098855495, | |
| "num_tokens": 27493953.0, | |
| "step": 402 | |
| }, | |
| { | |
| "epoch": 0.8149646107178968, | |
| "grad_norm": 0.16972221434116364, | |
| "learning_rate": 0.00034810952910793085, | |
| "loss": 0.3183, | |
| "mean_token_accuracy": 0.886278223246336, | |
| "num_tokens": 27559794.0, | |
| "step": 403 | |
| }, | |
| { | |
| "epoch": 0.8169868554095046, | |
| "grad_norm": 0.17891989648342133, | |
| "learning_rate": 0.00034783336761219137, | |
| "loss": 0.2848, | |
| "mean_token_accuracy": 0.8995977118611336, | |
| "num_tokens": 27629989.0, | |
| "step": 404 | |
| }, | |
| { | |
| "epoch": 0.8190091001011123, | |
| "grad_norm": 0.1790463924407959, | |
| "learning_rate": 0.0003475565976749651, | |
| "loss": 0.3109, | |
| "mean_token_accuracy": 0.8868453428149223, | |
| "num_tokens": 27688846.0, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.8210313447927199, | |
| "grad_norm": 0.1789504438638687, | |
| "learning_rate": 0.00034727922061357855, | |
| "loss": 0.3284, | |
| "mean_token_accuracy": 0.8879125751554966, | |
| "num_tokens": 27755235.0, | |
| "step": 406 | |
| }, | |
| { | |
| "epoch": 0.8230535894843276, | |
| "grad_norm": 0.19450780749320984, | |
| "learning_rate": 0.0003470012377482484, | |
| "loss": 0.3079, | |
| "mean_token_accuracy": 0.8906297236680984, | |
| "num_tokens": 27819736.0, | |
| "step": 407 | |
| }, | |
| { | |
| "epoch": 0.8250758341759353, | |
| "grad_norm": 0.21135565638542175, | |
| "learning_rate": 0.0003467226504020743, | |
| "loss": 0.3314, | |
| "mean_token_accuracy": 0.8855904154479504, | |
| "num_tokens": 27878648.0, | |
| "step": 408 | |
| }, | |
| { | |
| "epoch": 0.8270980788675429, | |
| "grad_norm": 0.1756933629512787, | |
| "learning_rate": 0.0003464434599010333, | |
| "loss": 0.3045, | |
| "mean_token_accuracy": 0.8893042095005512, | |
| "num_tokens": 27937967.0, | |
| "step": 409 | |
| }, | |
| { | |
| "epoch": 0.8291203235591507, | |
| "grad_norm": 0.1893833428621292, | |
| "learning_rate": 0.0003461636675739732, | |
| "loss": 0.3089, | |
| "mean_token_accuracy": 0.8921520821750164, | |
| "num_tokens": 28003500.0, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.8311425682507584, | |
| "grad_norm": 0.19579611718654633, | |
| "learning_rate": 0.0003458832747526061, | |
| "loss": 0.2954, | |
| "mean_token_accuracy": 0.8962360806763172, | |
| "num_tokens": 28060691.0, | |
| "step": 411 | |
| }, | |
| { | |
| "epoch": 0.833164812942366, | |
| "grad_norm": 0.19954101741313934, | |
| "learning_rate": 0.0003456022827715025, | |
| "loss": 0.3057, | |
| "mean_token_accuracy": 0.8955631367862225, | |
| "num_tokens": 28119842.0, | |
| "step": 412 | |
| }, | |
| { | |
| "epoch": 0.8351870576339737, | |
| "grad_norm": 0.17535583674907684, | |
| "learning_rate": 0.0003453206929680844, | |
| "loss": 0.3181, | |
| "mean_token_accuracy": 0.8896914720535278, | |
| "num_tokens": 28189519.0, | |
| "step": 413 | |
| }, | |
| { | |
| "epoch": 0.8372093023255814, | |
| "grad_norm": 0.2034400850534439, | |
| "learning_rate": 0.0003450385066826195, | |
| "loss": 0.3132, | |
| "mean_token_accuracy": 0.8903135284781456, | |
| "num_tokens": 28256532.0, | |
| "step": 414 | |
| }, | |
| { | |
| "epoch": 0.839231547017189, | |
| "grad_norm": 0.18071752786636353, | |
| "learning_rate": 0.0003447557252582145, | |
| "loss": 0.3229, | |
| "mean_token_accuracy": 0.891409307718277, | |
| "num_tokens": 28320211.0, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.8412537917087968, | |
| "grad_norm": 0.17119021713733673, | |
| "learning_rate": 0.00034447235004080853, | |
| "loss": 0.3096, | |
| "mean_token_accuracy": 0.8913502097129822, | |
| "num_tokens": 28384204.0, | |
| "step": 416 | |
| }, | |
| { | |
| "epoch": 0.8432760364004045, | |
| "grad_norm": 0.17320208251476288, | |
| "learning_rate": 0.0003441883823791671, | |
| "loss": 0.2935, | |
| "mean_token_accuracy": 0.8983162231743336, | |
| "num_tokens": 28454515.0, | |
| "step": 417 | |
| }, | |
| { | |
| "epoch": 0.8452982810920121, | |
| "grad_norm": 0.17323511838912964, | |
| "learning_rate": 0.0003439038236248757, | |
| "loss": 0.3053, | |
| "mean_token_accuracy": 0.8946337774395943, | |
| "num_tokens": 28524571.0, | |
| "step": 418 | |
| }, | |
| { | |
| "epoch": 0.8473205257836198, | |
| "grad_norm": 0.19488638639450073, | |
| "learning_rate": 0.00034361867513233303, | |
| "loss": 0.3131, | |
| "mean_token_accuracy": 0.8917714729905128, | |
| "num_tokens": 28583638.0, | |
| "step": 419 | |
| }, | |
| { | |
| "epoch": 0.8493427704752275, | |
| "grad_norm": 0.14881743490695953, | |
| "learning_rate": 0.00034333293825874464, | |
| "loss": 0.2561, | |
| "mean_token_accuracy": 0.9055963829159737, | |
| "num_tokens": 28668101.0, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.8513650151668352, | |
| "grad_norm": 0.17198774218559265, | |
| "learning_rate": 0.0003430466143641168, | |
| "loss": 0.3071, | |
| "mean_token_accuracy": 0.8936148509383202, | |
| "num_tokens": 28739207.0, | |
| "step": 421 | |
| }, | |
| { | |
| "epoch": 0.8533872598584429, | |
| "grad_norm": 0.18449024856090546, | |
| "learning_rate": 0.00034275970481124977, | |
| "loss": 0.276, | |
| "mean_token_accuracy": 0.9006006754934788, | |
| "num_tokens": 28803993.0, | |
| "step": 422 | |
| }, | |
| { | |
| "epoch": 0.8554095045500506, | |
| "grad_norm": 0.1962573230266571, | |
| "learning_rate": 0.0003424722109657311, | |
| "loss": 0.3059, | |
| "mean_token_accuracy": 0.8941029235720634, | |
| "num_tokens": 28870609.0, | |
| "step": 423 | |
| }, | |
| { | |
| "epoch": 0.8574317492416582, | |
| "grad_norm": 0.1821158230304718, | |
| "learning_rate": 0.00034218413419592953, | |
| "loss": 0.2905, | |
| "mean_token_accuracy": 0.890890721231699, | |
| "num_tokens": 28943383.0, | |
| "step": 424 | |
| }, | |
| { | |
| "epoch": 0.8594539939332659, | |
| "grad_norm": 0.18370237946510315, | |
| "learning_rate": 0.00034189547587298836, | |
| "loss": 0.3166, | |
| "mean_token_accuracy": 0.8925870470702648, | |
| "num_tokens": 29009040.0, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.8614762386248737, | |
| "grad_norm": 0.17396995425224304, | |
| "learning_rate": 0.00034160623737081885, | |
| "loss": 0.3011, | |
| "mean_token_accuracy": 0.8914640247821808, | |
| "num_tokens": 29074553.0, | |
| "step": 426 | |
| }, | |
| { | |
| "epoch": 0.8634984833164813, | |
| "grad_norm": 0.19026698172092438, | |
| "learning_rate": 0.00034131642006609365, | |
| "loss": 0.3249, | |
| "mean_token_accuracy": 0.8910115286707878, | |
| "num_tokens": 29147204.0, | |
| "step": 427 | |
| }, | |
| { | |
| "epoch": 0.865520728008089, | |
| "grad_norm": 0.1589595079421997, | |
| "learning_rate": 0.00034102602533824027, | |
| "loss": 0.2785, | |
| "mean_token_accuracy": 0.903257142752409, | |
| "num_tokens": 29218571.0, | |
| "step": 428 | |
| }, | |
| { | |
| "epoch": 0.8675429726996967, | |
| "grad_norm": 0.1674802154302597, | |
| "learning_rate": 0.00034073505456943463, | |
| "loss": 0.2977, | |
| "mean_token_accuracy": 0.8929527476429939, | |
| "num_tokens": 29287185.0, | |
| "step": 429 | |
| }, | |
| { | |
| "epoch": 0.8695652173913043, | |
| "grad_norm": 0.17129530012607574, | |
| "learning_rate": 0.0003404435091445945, | |
| "loss": 0.2769, | |
| "mean_token_accuracy": 0.8992316760122776, | |
| "num_tokens": 29355908.0, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.871587462082912, | |
| "grad_norm": 0.1718977391719818, | |
| "learning_rate": 0.00034015139045137253, | |
| "loss": 0.3137, | |
| "mean_token_accuracy": 0.8935650922358036, | |
| "num_tokens": 29421396.0, | |
| "step": 431 | |
| }, | |
| { | |
| "epoch": 0.8736097067745198, | |
| "grad_norm": 0.17011679708957672, | |
| "learning_rate": 0.00033985869988015016, | |
| "loss": 0.2855, | |
| "mean_token_accuracy": 0.8953105248510838, | |
| "num_tokens": 29493294.0, | |
| "step": 432 | |
| }, | |
| { | |
| "epoch": 0.8756319514661274, | |
| "grad_norm": 0.1868988573551178, | |
| "learning_rate": 0.0003395654388240307, | |
| "loss": 0.3196, | |
| "mean_token_accuracy": 0.8894085213541985, | |
| "num_tokens": 29555484.0, | |
| "step": 433 | |
| }, | |
| { | |
| "epoch": 0.8776541961577351, | |
| "grad_norm": 0.15462960302829742, | |
| "learning_rate": 0.0003392716086788328, | |
| "loss": 0.2957, | |
| "mean_token_accuracy": 0.8983679711818695, | |
| "num_tokens": 29623656.0, | |
| "step": 434 | |
| }, | |
| { | |
| "epoch": 0.8796764408493428, | |
| "grad_norm": 0.16427457332611084, | |
| "learning_rate": 0.0003389772108430835, | |
| "loss": 0.2979, | |
| "mean_token_accuracy": 0.8941413648426533, | |
| "num_tokens": 29690023.0, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.8816986855409504, | |
| "grad_norm": 0.1687782257795334, | |
| "learning_rate": 0.00033868224671801243, | |
| "loss": 0.2573, | |
| "mean_token_accuracy": 0.9024456590414047, | |
| "num_tokens": 29756579.0, | |
| "step": 436 | |
| }, | |
| { | |
| "epoch": 0.8837209302325582, | |
| "grad_norm": 0.1602339744567871, | |
| "learning_rate": 0.00033838671770754393, | |
| "loss": 0.2829, | |
| "mean_token_accuracy": 0.9009444527328014, | |
| "num_tokens": 29823974.0, | |
| "step": 437 | |
| }, | |
| { | |
| "epoch": 0.8857431749241659, | |
| "grad_norm": 0.17867590487003326, | |
| "learning_rate": 0.00033809062521829135, | |
| "loss": 0.3058, | |
| "mean_token_accuracy": 0.8952712267637253, | |
| "num_tokens": 29896076.0, | |
| "step": 438 | |
| }, | |
| { | |
| "epoch": 0.8877654196157735, | |
| "grad_norm": 0.20030587911605835, | |
| "learning_rate": 0.0003377939706595499, | |
| "loss": 0.3275, | |
| "mean_token_accuracy": 0.8882710337638855, | |
| "num_tokens": 29959878.0, | |
| "step": 439 | |
| }, | |
| { | |
| "epoch": 0.8897876643073812, | |
| "grad_norm": 0.18861141800880432, | |
| "learning_rate": 0.00033749675544329007, | |
| "loss": 0.2941, | |
| "mean_token_accuracy": 0.894235398620367, | |
| "num_tokens": 30029419.0, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.8918099089989889, | |
| "grad_norm": 0.17503049969673157, | |
| "learning_rate": 0.0003371989809841508, | |
| "loss": 0.2796, | |
| "mean_token_accuracy": 0.8981444463133812, | |
| "num_tokens": 30099980.0, | |
| "step": 441 | |
| }, | |
| { | |
| "epoch": 0.8938321536905965, | |
| "grad_norm": 0.17344842851161957, | |
| "learning_rate": 0.00033690064869943304, | |
| "loss": 0.2806, | |
| "mean_token_accuracy": 0.9028143547475338, | |
| "num_tokens": 30160123.0, | |
| "step": 442 | |
| }, | |
| { | |
| "epoch": 0.8958543983822043, | |
| "grad_norm": 0.21486879885196686, | |
| "learning_rate": 0.00033660176000909256, | |
| "loss": 0.3017, | |
| "mean_token_accuracy": 0.8955220691859722, | |
| "num_tokens": 30221040.0, | |
| "step": 443 | |
| }, | |
| { | |
| "epoch": 0.897876643073812, | |
| "grad_norm": 0.16732099652290344, | |
| "learning_rate": 0.0003363023163357335, | |
| "loss": 0.3038, | |
| "mean_token_accuracy": 0.8961573019623756, | |
| "num_tokens": 30288318.0, | |
| "step": 444 | |
| }, | |
| { | |
| "epoch": 0.8998988877654196, | |
| "grad_norm": 0.17499873042106628, | |
| "learning_rate": 0.00033600231910460153, | |
| "loss": 0.2942, | |
| "mean_token_accuracy": 0.8975008726119995, | |
| "num_tokens": 30351020.0, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.9019211324570273, | |
| "grad_norm": 0.18658067286014557, | |
| "learning_rate": 0.0003357017697435771, | |
| "loss": 0.2997, | |
| "mean_token_accuracy": 0.8956367336213589, | |
| "num_tokens": 30425559.0, | |
| "step": 446 | |
| }, | |
| { | |
| "epoch": 0.9039433771486349, | |
| "grad_norm": 0.19921845197677612, | |
| "learning_rate": 0.0003354006696831685, | |
| "loss": 0.321, | |
| "mean_token_accuracy": 0.8870183601975441, | |
| "num_tokens": 30487225.0, | |
| "step": 447 | |
| }, | |
| { | |
| "epoch": 0.9059656218402427, | |
| "grad_norm": 0.15201924741268158, | |
| "learning_rate": 0.00033509902035650527, | |
| "loss": 0.2805, | |
| "mean_token_accuracy": 0.8986309170722961, | |
| "num_tokens": 30566969.0, | |
| "step": 448 | |
| }, | |
| { | |
| "epoch": 0.9079878665318504, | |
| "grad_norm": 0.14417074620723724, | |
| "learning_rate": 0.00033479682319933124, | |
| "loss": 0.2746, | |
| "mean_token_accuracy": 0.9016837328672409, | |
| "num_tokens": 30657549.0, | |
| "step": 449 | |
| }, | |
| { | |
| "epoch": 0.910010111223458, | |
| "grad_norm": 0.20164437592029572, | |
| "learning_rate": 0.00033449407964999755, | |
| "loss": 0.307, | |
| "mean_token_accuracy": 0.8908158242702484, | |
| "num_tokens": 30719396.0, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.9120323559150657, | |
| "grad_norm": 0.15949569642543793, | |
| "learning_rate": 0.0003341907911494562, | |
| "loss": 0.2813, | |
| "mean_token_accuracy": 0.8971740826964378, | |
| "num_tokens": 30796942.0, | |
| "step": 451 | |
| }, | |
| { | |
| "epoch": 0.9140546006066734, | |
| "grad_norm": 0.18862098455429077, | |
| "learning_rate": 0.0003338869591412529, | |
| "loss": 0.3339, | |
| "mean_token_accuracy": 0.8874437399208546, | |
| "num_tokens": 30858913.0, | |
| "step": 452 | |
| }, | |
| { | |
| "epoch": 0.916076845298281, | |
| "grad_norm": 0.19091889262199402, | |
| "learning_rate": 0.0003335825850715203, | |
| "loss": 0.3099, | |
| "mean_token_accuracy": 0.8915912732481956, | |
| "num_tokens": 30923946.0, | |
| "step": 453 | |
| }, | |
| { | |
| "epoch": 0.9180990899898888, | |
| "grad_norm": 0.17616930603981018, | |
| "learning_rate": 0.0003332776703889708, | |
| "loss": 0.302, | |
| "mean_token_accuracy": 0.8977428935468197, | |
| "num_tokens": 30991635.0, | |
| "step": 454 | |
| }, | |
| { | |
| "epoch": 0.9201213346814965, | |
| "grad_norm": 0.16347502171993256, | |
| "learning_rate": 0.00033297221654489026, | |
| "loss": 0.2968, | |
| "mean_token_accuracy": 0.8974283151328564, | |
| "num_tokens": 31065527.0, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.9221435793731041, | |
| "grad_norm": 0.15494075417518616, | |
| "learning_rate": 0.0003326662249931307, | |
| "loss": 0.2745, | |
| "mean_token_accuracy": 0.9003672078251839, | |
| "num_tokens": 31139389.0, | |
| "step": 456 | |
| }, | |
| { | |
| "epoch": 0.9241658240647118, | |
| "grad_norm": 0.14488424360752106, | |
| "learning_rate": 0.0003323596971901032, | |
| "loss": 0.2315, | |
| "mean_token_accuracy": 0.9032083451747894, | |
| "num_tokens": 31211644.0, | |
| "step": 457 | |
| }, | |
| { | |
| "epoch": 0.9261880687563195, | |
| "grad_norm": 0.17343585193157196, | |
| "learning_rate": 0.0003320526345947716, | |
| "loss": 0.2834, | |
| "mean_token_accuracy": 0.8971737772226334, | |
| "num_tokens": 31281551.0, | |
| "step": 458 | |
| }, | |
| { | |
| "epoch": 0.9282103134479271, | |
| "grad_norm": 0.20809240639209747, | |
| "learning_rate": 0.0003317450386686447, | |
| "loss": 0.3392, | |
| "mean_token_accuracy": 0.8834185339510441, | |
| "num_tokens": 31339866.0, | |
| "step": 459 | |
| }, | |
| { | |
| "epoch": 0.9302325581395349, | |
| "grad_norm": 0.1745264083147049, | |
| "learning_rate": 0.00033143691087577016, | |
| "loss": 0.3135, | |
| "mean_token_accuracy": 0.8907811567187309, | |
| "num_tokens": 31397435.0, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.9322548028311426, | |
| "grad_norm": 0.19855932891368866, | |
| "learning_rate": 0.00033112825268272693, | |
| "loss": 0.2874, | |
| "mean_token_accuracy": 0.9011034667491913, | |
| "num_tokens": 31477769.0, | |
| "step": 461 | |
| }, | |
| { | |
| "epoch": 0.9342770475227502, | |
| "grad_norm": 0.18550598621368408, | |
| "learning_rate": 0.0003308190655586185, | |
| "loss": 0.3026, | |
| "mean_token_accuracy": 0.8910555392503738, | |
| "num_tokens": 31543808.0, | |
| "step": 462 | |
| }, | |
| { | |
| "epoch": 0.9362992922143579, | |
| "grad_norm": 0.17249254882335663, | |
| "learning_rate": 0.000330509350975066, | |
| "loss": 0.2988, | |
| "mean_token_accuracy": 0.8944742307066917, | |
| "num_tokens": 31608876.0, | |
| "step": 463 | |
| }, | |
| { | |
| "epoch": 0.9383215369059656, | |
| "grad_norm": 0.15075324475765228, | |
| "learning_rate": 0.0003301991104062009, | |
| "loss": 0.272, | |
| "mean_token_accuracy": 0.90623002871871, | |
| "num_tokens": 31680601.0, | |
| "step": 464 | |
| }, | |
| { | |
| "epoch": 0.9403437815975733, | |
| "grad_norm": 0.18637825548648834, | |
| "learning_rate": 0.00032988834532865827, | |
| "loss": 0.3234, | |
| "mean_token_accuracy": 0.8885620683431625, | |
| "num_tokens": 31747402.0, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.942366026289181, | |
| "grad_norm": 0.1554325670003891, | |
| "learning_rate": 0.0003295770572215697, | |
| "loss": 0.2836, | |
| "mean_token_accuracy": 0.9002716057002544, | |
| "num_tokens": 31818720.0, | |
| "step": 466 | |
| }, | |
| { | |
| "epoch": 0.9443882709807887, | |
| "grad_norm": 0.17428986728191376, | |
| "learning_rate": 0.00032926524756655615, | |
| "loss": 0.2917, | |
| "mean_token_accuracy": 0.8964979350566864, | |
| "num_tokens": 31891824.0, | |
| "step": 467 | |
| }, | |
| { | |
| "epoch": 0.9464105156723963, | |
| "grad_norm": 0.16667652130126953, | |
| "learning_rate": 0.000328952917847721, | |
| "loss": 0.2742, | |
| "mean_token_accuracy": 0.901694979518652, | |
| "num_tokens": 31967670.0, | |
| "step": 468 | |
| }, | |
| { | |
| "epoch": 0.948432760364004, | |
| "grad_norm": 0.17575259506702423, | |
| "learning_rate": 0.00032864006955164287, | |
| "loss": 0.3164, | |
| "mean_token_accuracy": 0.8907586932182312, | |
| "num_tokens": 32033261.0, | |
| "step": 469 | |
| }, | |
| { | |
| "epoch": 0.9504550050556118, | |
| "grad_norm": 0.17919106781482697, | |
| "learning_rate": 0.0003283267041673687, | |
| "loss": 0.303, | |
| "mean_token_accuracy": 0.8939293213188648, | |
| "num_tokens": 32096462.0, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.9524772497472194, | |
| "grad_norm": 0.18951061367988586, | |
| "learning_rate": 0.0003280128231864066, | |
| "loss": 0.3249, | |
| "mean_token_accuracy": 0.8879147619009018, | |
| "num_tokens": 32157870.0, | |
| "step": 471 | |
| }, | |
| { | |
| "epoch": 0.9544994944388271, | |
| "grad_norm": 0.1526096761226654, | |
| "learning_rate": 0.0003276984281027186, | |
| "loss": 0.2505, | |
| "mean_token_accuracy": 0.9095052257180214, | |
| "num_tokens": 32236445.0, | |
| "step": 472 | |
| }, | |
| { | |
| "epoch": 0.9565217391304348, | |
| "grad_norm": 0.16995003819465637, | |
| "learning_rate": 0.00032738352041271395, | |
| "loss": 0.3174, | |
| "mean_token_accuracy": 0.8889270462095737, | |
| "num_tokens": 32304171.0, | |
| "step": 473 | |
| }, | |
| { | |
| "epoch": 0.9585439838220424, | |
| "grad_norm": 0.16517885029315948, | |
| "learning_rate": 0.0003270681016152414, | |
| "loss": 0.3144, | |
| "mean_token_accuracy": 0.8923964686691761, | |
| "num_tokens": 32372702.0, | |
| "step": 474 | |
| }, | |
| { | |
| "epoch": 0.9605662285136501, | |
| "grad_norm": 0.18384018540382385, | |
| "learning_rate": 0.00032675217321158264, | |
| "loss": 0.2903, | |
| "mean_token_accuracy": 0.8964046128094196, | |
| "num_tokens": 32442132.0, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.9625884732052579, | |
| "grad_norm": 0.1601627767086029, | |
| "learning_rate": 0.0003264357367054449, | |
| "loss": 0.2766, | |
| "mean_token_accuracy": 0.9007900506258011, | |
| "num_tokens": 32514430.0, | |
| "step": 476 | |
| }, | |
| { | |
| "epoch": 0.9646107178968655, | |
| "grad_norm": 0.18358251452445984, | |
| "learning_rate": 0.00032611879360295345, | |
| "loss": 0.2927, | |
| "mean_token_accuracy": 0.8977400958538055, | |
| "num_tokens": 32579788.0, | |
| "step": 477 | |
| }, | |
| { | |
| "epoch": 0.9666329625884732, | |
| "grad_norm": 0.2047470211982727, | |
| "learning_rate": 0.0003258013454126452, | |
| "loss": 0.3131, | |
| "mean_token_accuracy": 0.8929316326975822, | |
| "num_tokens": 32642283.0, | |
| "step": 478 | |
| }, | |
| { | |
| "epoch": 0.9686552072800809, | |
| "grad_norm": 0.1662026345729828, | |
| "learning_rate": 0.0003254833936454609, | |
| "loss": 0.2841, | |
| "mean_token_accuracy": 0.8985595107078552, | |
| "num_tokens": 32709386.0, | |
| "step": 479 | |
| }, | |
| { | |
| "epoch": 0.9706774519716885, | |
| "grad_norm": 0.1934393048286438, | |
| "learning_rate": 0.00032516493981473826, | |
| "loss": 0.2869, | |
| "mean_token_accuracy": 0.8976165167987347, | |
| "num_tokens": 32778573.0, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.9726996966632963, | |
| "grad_norm": 0.1651667058467865, | |
| "learning_rate": 0.0003248459854362044, | |
| "loss": 0.2993, | |
| "mean_token_accuracy": 0.893569964915514, | |
| "num_tokens": 32853785.0, | |
| "step": 481 | |
| }, | |
| { | |
| "epoch": 0.974721941354904, | |
| "grad_norm": 0.18779976665973663, | |
| "learning_rate": 0.00032452653202796915, | |
| "loss": 0.3223, | |
| "mean_token_accuracy": 0.8855483829975128, | |
| "num_tokens": 32917542.0, | |
| "step": 482 | |
| }, | |
| { | |
| "epoch": 0.9767441860465116, | |
| "grad_norm": 0.14583131670951843, | |
| "learning_rate": 0.00032420658111051746, | |
| "loss": 0.2772, | |
| "mean_token_accuracy": 0.8998262621462345, | |
| "num_tokens": 32987391.0, | |
| "step": 483 | |
| }, | |
| { | |
| "epoch": 0.9787664307381193, | |
| "grad_norm": 0.23910751938819885, | |
| "learning_rate": 0.00032388613420670213, | |
| "loss": 0.3257, | |
| "mean_token_accuracy": 0.8845948688685894, | |
| "num_tokens": 33053804.0, | |
| "step": 484 | |
| }, | |
| { | |
| "epoch": 0.980788675429727, | |
| "grad_norm": 0.1679566651582718, | |
| "learning_rate": 0.00032356519284173666, | |
| "loss": 0.2988, | |
| "mean_token_accuracy": 0.8954810760915279, | |
| "num_tokens": 33123281.0, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.9828109201213346, | |
| "grad_norm": 0.17945775389671326, | |
| "learning_rate": 0.0003232437585431883, | |
| "loss": 0.3127, | |
| "mean_token_accuracy": 0.8931021988391876, | |
| "num_tokens": 33188358.0, | |
| "step": 486 | |
| }, | |
| { | |
| "epoch": 0.9848331648129424, | |
| "grad_norm": 0.18727077543735504, | |
| "learning_rate": 0.00032292183284097023, | |
| "loss": 0.3259, | |
| "mean_token_accuracy": 0.8901765421032906, | |
| "num_tokens": 33251289.0, | |
| "step": 487 | |
| }, | |
| { | |
| "epoch": 0.9868554095045501, | |
| "grad_norm": 0.1629391312599182, | |
| "learning_rate": 0.0003225994172673346, | |
| "loss": 0.3004, | |
| "mean_token_accuracy": 0.8926238007843494, | |
| "num_tokens": 33322968.0, | |
| "step": 488 | |
| }, | |
| { | |
| "epoch": 0.9888776541961577, | |
| "grad_norm": 0.1630707085132599, | |
| "learning_rate": 0.00032227651335686513, | |
| "loss": 0.2809, | |
| "mean_token_accuracy": 0.9002612978219986, | |
| "num_tokens": 33393350.0, | |
| "step": 489 | |
| }, | |
| { | |
| "epoch": 0.9908998988877654, | |
| "grad_norm": 0.17929117381572723, | |
| "learning_rate": 0.0003219531226464699, | |
| "loss": 0.3214, | |
| "mean_token_accuracy": 0.8894147910177708, | |
| "num_tokens": 33458431.0, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.9929221435793731, | |
| "grad_norm": 0.1639278680086136, | |
| "learning_rate": 0.00032162924667537406, | |
| "loss": 0.2891, | |
| "mean_token_accuracy": 0.8945626839995384, | |
| "num_tokens": 33526451.0, | |
| "step": 491 | |
| }, | |
| { | |
| "epoch": 0.9949443882709808, | |
| "grad_norm": 0.1808111071586609, | |
| "learning_rate": 0.0003213048869851124, | |
| "loss": 0.2965, | |
| "mean_token_accuracy": 0.8966854028403759, | |
| "num_tokens": 33589564.0, | |
| "step": 492 | |
| }, | |
| { | |
| "epoch": 0.9969666329625885, | |
| "grad_norm": 0.1905975043773651, | |
| "learning_rate": 0.00032098004511952184, | |
| "loss": 0.3017, | |
| "mean_token_accuracy": 0.8935710862278938, | |
| "num_tokens": 33649359.0, | |
| "step": 493 | |
| }, | |
| { | |
| "epoch": 0.9989888776541962, | |
| "grad_norm": 0.17898094654083252, | |
| "learning_rate": 0.00032065472262473443, | |
| "loss": 0.3193, | |
| "mean_token_accuracy": 0.8906168565154076, | |
| "num_tokens": 33721593.0, | |
| "step": 494 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "grad_norm": 0.22628654539585114, | |
| "learning_rate": 0.00032032892104917, | |
| "loss": 0.3083, | |
| "mean_token_accuracy": 0.8914947211742401, | |
| "num_tokens": 33755641.0, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 1.0020222446916076, | |
| "grad_norm": 0.13782188296318054, | |
| "learning_rate": 0.00032000264194352845, | |
| "loss": 0.2663, | |
| "mean_token_accuracy": 0.8996973298490047, | |
| "num_tokens": 33834819.0, | |
| "step": 496 | |
| }, | |
| { | |
| "epoch": 1.0040444893832154, | |
| "grad_norm": 0.17569021880626678, | |
| "learning_rate": 0.0003196758868607825, | |
| "loss": 0.2952, | |
| "mean_token_accuracy": 0.8985786736011505, | |
| "num_tokens": 33902435.0, | |
| "step": 497 | |
| }, | |
| { | |
| "epoch": 1.006066734074823, | |
| "grad_norm": 0.2067909836769104, | |
| "learning_rate": 0.0003193486573561705, | |
| "loss": 0.3225, | |
| "mean_token_accuracy": 0.8876040019094944, | |
| "num_tokens": 33965666.0, | |
| "step": 498 | |
| }, | |
| { | |
| "epoch": 1.0080889787664307, | |
| "grad_norm": 0.16878552734851837, | |
| "learning_rate": 0.0003190209549871888, | |
| "loss": 0.2942, | |
| "mean_token_accuracy": 0.8955768346786499, | |
| "num_tokens": 34032445.0, | |
| "step": 499 | |
| }, | |
| { | |
| "epoch": 1.0101112234580385, | |
| "grad_norm": 0.15274177491664886, | |
| "learning_rate": 0.00031869278131358455, | |
| "loss": 0.2427, | |
| "mean_token_accuracy": 0.9117574766278267, | |
| "num_tokens": 34114342.0, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 1.012133468149646, | |
| "grad_norm": 0.22229406237602234, | |
| "learning_rate": 0.0003183641378973478, | |
| "loss": 0.2961, | |
| "mean_token_accuracy": 0.8931870721280575, | |
| "num_tokens": 34170031.0, | |
| "step": 501 | |
| }, | |
| { | |
| "epoch": 1.0141557128412537, | |
| "grad_norm": 0.17795279622077942, | |
| "learning_rate": 0.0003180350263027049, | |
| "loss": 0.2921, | |
| "mean_token_accuracy": 0.8974833749234676, | |
| "num_tokens": 34232994.0, | |
| "step": 502 | |
| }, | |
| { | |
| "epoch": 1.0161779575328616, | |
| "grad_norm": 0.1530430167913437, | |
| "learning_rate": 0.0003177054480961101, | |
| "loss": 0.2587, | |
| "mean_token_accuracy": 0.8979953937232494, | |
| "num_tokens": 34306018.0, | |
| "step": 503 | |
| }, | |
| { | |
| "epoch": 1.0182002022244692, | |
| "grad_norm": 0.17740803956985474, | |
| "learning_rate": 0.00031737540484623895, | |
| "loss": 0.3102, | |
| "mean_token_accuracy": 0.8884628489613533, | |
| "num_tokens": 34374661.0, | |
| "step": 504 | |
| }, | |
| { | |
| "epoch": 1.0202224469160768, | |
| "grad_norm": 0.177719384431839, | |
| "learning_rate": 0.00031704489812398013, | |
| "loss": 0.2953, | |
| "mean_token_accuracy": 0.8939866498112679, | |
| "num_tokens": 34438514.0, | |
| "step": 505 | |
| }, | |
| { | |
| "epoch": 1.0222446916076846, | |
| "grad_norm": 0.168897345662117, | |
| "learning_rate": 0.00031671392950242836, | |
| "loss": 0.269, | |
| "mean_token_accuracy": 0.9047276936471462, | |
| "num_tokens": 34505982.0, | |
| "step": 506 | |
| }, | |
| { | |
| "epoch": 1.0242669362992922, | |
| "grad_norm": 0.15597204864025116, | |
| "learning_rate": 0.0003163825005568769, | |
| "loss": 0.2585, | |
| "mean_token_accuracy": 0.9080711491405964, | |
| "num_tokens": 34578668.0, | |
| "step": 507 | |
| }, | |
| { | |
| "epoch": 1.0262891809908998, | |
| "grad_norm": 0.17869000136852264, | |
| "learning_rate": 0.00031605061286481013, | |
| "loss": 0.3069, | |
| "mean_token_accuracy": 0.8951312974095345, | |
| "num_tokens": 34649274.0, | |
| "step": 508 | |
| }, | |
| { | |
| "epoch": 1.0283114256825077, | |
| "grad_norm": 0.15539689362049103, | |
| "learning_rate": 0.0003157182680058955, | |
| "loss": 0.2495, | |
| "mean_token_accuracy": 0.9083127416670322, | |
| "num_tokens": 34727319.0, | |
| "step": 509 | |
| }, | |
| { | |
| "epoch": 1.0303336703741153, | |
| "grad_norm": 0.18144549429416656, | |
| "learning_rate": 0.00031538546756197693, | |
| "loss": 0.2856, | |
| "mean_token_accuracy": 0.9019791670143604, | |
| "num_tokens": 34797454.0, | |
| "step": 510 | |
| }, | |
| { | |
| "epoch": 1.0323559150657229, | |
| "grad_norm": 0.18584753572940826, | |
| "learning_rate": 0.0003150522131170663, | |
| "loss": 0.2954, | |
| "mean_token_accuracy": 0.8972033709287643, | |
| "num_tokens": 34864905.0, | |
| "step": 511 | |
| }, | |
| { | |
| "epoch": 1.0343781597573307, | |
| "grad_norm": 0.19840823113918304, | |
| "learning_rate": 0.0003147185062573365, | |
| "loss": 0.28, | |
| "mean_token_accuracy": 0.901741374284029, | |
| "num_tokens": 34928661.0, | |
| "step": 512 | |
| }, | |
| { | |
| "epoch": 1.0364004044489383, | |
| "grad_norm": 0.14095668494701385, | |
| "learning_rate": 0.00031438434857111405, | |
| "loss": 0.2666, | |
| "mean_token_accuracy": 0.9036082923412323, | |
| "num_tokens": 35002573.0, | |
| "step": 513 | |
| }, | |
| { | |
| "epoch": 1.038422649140546, | |
| "grad_norm": 0.13482429087162018, | |
| "learning_rate": 0.0003140497416488708, | |
| "loss": 0.2603, | |
| "mean_token_accuracy": 0.9059791043400764, | |
| "num_tokens": 35083602.0, | |
| "step": 514 | |
| }, | |
| { | |
| "epoch": 1.0404448938321538, | |
| "grad_norm": 0.20816905796527863, | |
| "learning_rate": 0.00031371468708321713, | |
| "loss": 0.3049, | |
| "mean_token_accuracy": 0.8949435539543629, | |
| "num_tokens": 35150470.0, | |
| "step": 515 | |
| }, | |
| { | |
| "epoch": 1.0424671385237614, | |
| "grad_norm": 0.17933416366577148, | |
| "learning_rate": 0.0003133791864688939, | |
| "loss": 0.2972, | |
| "mean_token_accuracy": 0.8948968909680843, | |
| "num_tokens": 35216813.0, | |
| "step": 516 | |
| }, | |
| { | |
| "epoch": 1.044489383215369, | |
| "grad_norm": 0.17087870836257935, | |
| "learning_rate": 0.00031304324140276496, | |
| "loss": 0.2891, | |
| "mean_token_accuracy": 0.8967925682663918, | |
| "num_tokens": 35287089.0, | |
| "step": 517 | |
| }, | |
| { | |
| "epoch": 1.0465116279069768, | |
| "grad_norm": 0.19874465465545654, | |
| "learning_rate": 0.0003127068534838098, | |
| "loss": 0.2864, | |
| "mean_token_accuracy": 0.8976041786372662, | |
| "num_tokens": 35348784.0, | |
| "step": 518 | |
| }, | |
| { | |
| "epoch": 1.0485338725985844, | |
| "grad_norm": 0.17467646300792694, | |
| "learning_rate": 0.0003123700243131155, | |
| "loss": 0.2742, | |
| "mean_token_accuracy": 0.9038321636617184, | |
| "num_tokens": 35430257.0, | |
| "step": 519 | |
| }, | |
| { | |
| "epoch": 1.050556117290192, | |
| "grad_norm": 0.20859748125076294, | |
| "learning_rate": 0.00031203275549386935, | |
| "loss": 0.29, | |
| "mean_token_accuracy": 0.8973617292940617, | |
| "num_tokens": 35492098.0, | |
| "step": 520 | |
| }, | |
| { | |
| "epoch": 1.0525783619817999, | |
| "grad_norm": 0.1560591757297516, | |
| "learning_rate": 0.00031169504863135157, | |
| "loss": 0.2593, | |
| "mean_token_accuracy": 0.9061496220529079, | |
| "num_tokens": 35578894.0, | |
| "step": 521 | |
| }, | |
| { | |
| "epoch": 1.0546006066734075, | |
| "grad_norm": 0.17322826385498047, | |
| "learning_rate": 0.0003113569053329268, | |
| "loss": 0.2656, | |
| "mean_token_accuracy": 0.9077408090233803, | |
| "num_tokens": 35658590.0, | |
| "step": 522 | |
| }, | |
| { | |
| "epoch": 1.056622851365015, | |
| "grad_norm": 0.16736696660518646, | |
| "learning_rate": 0.0003110183272080373, | |
| "loss": 0.2647, | |
| "mean_token_accuracy": 0.9043499119579792, | |
| "num_tokens": 35722339.0, | |
| "step": 523 | |
| }, | |
| { | |
| "epoch": 1.058645096056623, | |
| "grad_norm": 0.20183323323726654, | |
| "learning_rate": 0.00031067931586819473, | |
| "loss": 0.2937, | |
| "mean_token_accuracy": 0.8954190462827682, | |
| "num_tokens": 35782293.0, | |
| "step": 524 | |
| }, | |
| { | |
| "epoch": 1.0606673407482305, | |
| "grad_norm": 0.16886426508426666, | |
| "learning_rate": 0.000310339872926973, | |
| "loss": 0.2841, | |
| "mean_token_accuracy": 0.9006736651062965, | |
| "num_tokens": 35849795.0, | |
| "step": 525 | |
| }, | |
| { | |
| "epoch": 1.0626895854398382, | |
| "grad_norm": 0.16396957635879517, | |
| "learning_rate": 0.00031, | |
| "loss": 0.2747, | |
| "mean_token_accuracy": 0.9040698818862438, | |
| "num_tokens": 35926179.0, | |
| "step": 526 | |
| }, | |
| { | |
| "epoch": 1.064711830131446, | |
| "grad_norm": 0.17668411135673523, | |
| "learning_rate": 0.00030965969870495034, | |
| "loss": 0.293, | |
| "mean_token_accuracy": 0.8949432447552681, | |
| "num_tokens": 35992037.0, | |
| "step": 527 | |
| }, | |
| { | |
| "epoch": 1.0667340748230536, | |
| "grad_norm": 0.16346760094165802, | |
| "learning_rate": 0.0003093189706615375, | |
| "loss": 0.2524, | |
| "mean_token_accuracy": 0.9064350612461567, | |
| "num_tokens": 36060378.0, | |
| "step": 528 | |
| }, | |
| { | |
| "epoch": 1.0687563195146612, | |
| "grad_norm": 0.17525459825992584, | |
| "learning_rate": 0.000308977817491506, | |
| "loss": 0.2943, | |
| "mean_token_accuracy": 0.8935273364186287, | |
| "num_tokens": 36126013.0, | |
| "step": 529 | |
| }, | |
| { | |
| "epoch": 1.070778564206269, | |
| "grad_norm": 0.16501343250274658, | |
| "learning_rate": 0.00030863624081862415, | |
| "loss": 0.2789, | |
| "mean_token_accuracy": 0.8968185931444168, | |
| "num_tokens": 36196795.0, | |
| "step": 530 | |
| }, | |
| { | |
| "epoch": 1.0728008088978767, | |
| "grad_norm": 0.16026921570301056, | |
| "learning_rate": 0.0003082942422686754, | |
| "loss": 0.2671, | |
| "mean_token_accuracy": 0.9082406982779503, | |
| "num_tokens": 36275178.0, | |
| "step": 531 | |
| }, | |
| { | |
| "epoch": 1.0748230535894843, | |
| "grad_norm": 0.19023281335830688, | |
| "learning_rate": 0.0003079518234694519, | |
| "loss": 0.3116, | |
| "mean_token_accuracy": 0.8914121352136135, | |
| "num_tokens": 36338049.0, | |
| "step": 532 | |
| }, | |
| { | |
| "epoch": 1.076845298281092, | |
| "grad_norm": 0.18959233164787292, | |
| "learning_rate": 0.00030760898605074546, | |
| "loss": 0.2626, | |
| "mean_token_accuracy": 0.9018443673849106, | |
| "num_tokens": 36420122.0, | |
| "step": 533 | |
| }, | |
| { | |
| "epoch": 1.0788675429726997, | |
| "grad_norm": 0.18601641058921814, | |
| "learning_rate": 0.00030726573164434074, | |
| "loss": 0.2946, | |
| "mean_token_accuracy": 0.8955305181443691, | |
| "num_tokens": 36486673.0, | |
| "step": 534 | |
| }, | |
| { | |
| "epoch": 1.0808897876643073, | |
| "grad_norm": 0.17861206829547882, | |
| "learning_rate": 0.0003069220618840067, | |
| "loss": 0.2638, | |
| "mean_token_accuracy": 0.9000630341470242, | |
| "num_tokens": 36548189.0, | |
| "step": 535 | |
| }, | |
| { | |
| "epoch": 1.0829120323559152, | |
| "grad_norm": 0.16839022934436798, | |
| "learning_rate": 0.0003065779784054898, | |
| "loss": 0.2821, | |
| "mean_token_accuracy": 0.901892576366663, | |
| "num_tokens": 36619289.0, | |
| "step": 536 | |
| }, | |
| { | |
| "epoch": 1.0849342770475228, | |
| "grad_norm": 0.16797274351119995, | |
| "learning_rate": 0.0003062334828465052, | |
| "loss": 0.2722, | |
| "mean_token_accuracy": 0.901667632162571, | |
| "num_tokens": 36690144.0, | |
| "step": 537 | |
| }, | |
| { | |
| "epoch": 1.0869565217391304, | |
| "grad_norm": 0.1743130087852478, | |
| "learning_rate": 0.00030588857684672955, | |
| "loss": 0.2567, | |
| "mean_token_accuracy": 0.9072123803198338, | |
| "num_tokens": 36761617.0, | |
| "step": 538 | |
| }, | |
| { | |
| "epoch": 1.0889787664307382, | |
| "grad_norm": 0.1802840232849121, | |
| "learning_rate": 0.0003055432620477931, | |
| "loss": 0.2822, | |
| "mean_token_accuracy": 0.8998791016638279, | |
| "num_tokens": 36828873.0, | |
| "step": 539 | |
| }, | |
| { | |
| "epoch": 1.0910010111223458, | |
| "grad_norm": 0.19156496226787567, | |
| "learning_rate": 0.00030519754009327186, | |
| "loss": 0.3002, | |
| "mean_token_accuracy": 0.8940830379724503, | |
| "num_tokens": 36893847.0, | |
| "step": 540 | |
| }, | |
| { | |
| "epoch": 1.0930232558139534, | |
| "grad_norm": 0.18583235144615173, | |
| "learning_rate": 0.0003048514126286796, | |
| "loss": 0.2692, | |
| "mean_token_accuracy": 0.9024544768035412, | |
| "num_tokens": 36963240.0, | |
| "step": 541 | |
| }, | |
| { | |
| "epoch": 1.0950455005055613, | |
| "grad_norm": 0.17397500574588776, | |
| "learning_rate": 0.00030450488130146034, | |
| "loss": 0.2691, | |
| "mean_token_accuracy": 0.9022202827036381, | |
| "num_tokens": 37026381.0, | |
| "step": 542 | |
| }, | |
| { | |
| "epoch": 1.0970677451971689, | |
| "grad_norm": 0.24742691218852997, | |
| "learning_rate": 0.0003041579477609803, | |
| "loss": 0.3287, | |
| "mean_token_accuracy": 0.8853081800043583, | |
| "num_tokens": 37085095.0, | |
| "step": 543 | |
| }, | |
| { | |
| "epoch": 1.0990899898887765, | |
| "grad_norm": 0.16266337037086487, | |
| "learning_rate": 0.00030381061365852006, | |
| "loss": 0.2669, | |
| "mean_token_accuracy": 0.908314511179924, | |
| "num_tokens": 37156057.0, | |
| "step": 544 | |
| }, | |
| { | |
| "epoch": 1.1011122345803843, | |
| "grad_norm": 0.1805969476699829, | |
| "learning_rate": 0.00030346288064726676, | |
| "loss": 0.2762, | |
| "mean_token_accuracy": 0.9019368290901184, | |
| "num_tokens": 37218048.0, | |
| "step": 545 | |
| }, | |
| { | |
| "epoch": 1.103134479271992, | |
| "grad_norm": 0.2024918794631958, | |
| "learning_rate": 0.00030311475038230615, | |
| "loss": 0.2948, | |
| "mean_token_accuracy": 0.8978271037340164, | |
| "num_tokens": 37283475.0, | |
| "step": 546 | |
| }, | |
| { | |
| "epoch": 1.1051567239635995, | |
| "grad_norm": 0.16442124545574188, | |
| "learning_rate": 0.00030276622452061477, | |
| "loss": 0.2746, | |
| "mean_token_accuracy": 0.9010177366435528, | |
| "num_tokens": 37358871.0, | |
| "step": 547 | |
| }, | |
| { | |
| "epoch": 1.1071789686552074, | |
| "grad_norm": 0.17242524027824402, | |
| "learning_rate": 0.0003024173047210522, | |
| "loss": 0.2975, | |
| "mean_token_accuracy": 0.8940832912921906, | |
| "num_tokens": 37421863.0, | |
| "step": 548 | |
| }, | |
| { | |
| "epoch": 1.109201213346815, | |
| "grad_norm": 0.2123114913702011, | |
| "learning_rate": 0.00030206799264435294, | |
| "loss": 0.3084, | |
| "mean_token_accuracy": 0.8925547078251839, | |
| "num_tokens": 37486615.0, | |
| "step": 549 | |
| }, | |
| { | |
| "epoch": 1.1112234580384226, | |
| "grad_norm": 0.16941364109516144, | |
| "learning_rate": 0.00030171828995311845, | |
| "loss": 0.2997, | |
| "mean_token_accuracy": 0.8960695490241051, | |
| "num_tokens": 37556657.0, | |
| "step": 550 | |
| }, | |
| { | |
| "epoch": 1.1132457027300304, | |
| "grad_norm": 0.18581314384937286, | |
| "learning_rate": 0.0003013681983118096, | |
| "loss": 0.3056, | |
| "mean_token_accuracy": 0.8949491046369076, | |
| "num_tokens": 37623124.0, | |
| "step": 551 | |
| }, | |
| { | |
| "epoch": 1.115267947421638, | |
| "grad_norm": 0.17790380120277405, | |
| "learning_rate": 0.0003010177193867383, | |
| "loss": 0.2849, | |
| "mean_token_accuracy": 0.8990210555493832, | |
| "num_tokens": 37688876.0, | |
| "step": 552 | |
| }, | |
| { | |
| "epoch": 1.1172901921132457, | |
| "grad_norm": 0.17190231382846832, | |
| "learning_rate": 0.00030066685484606004, | |
| "loss": 0.2805, | |
| "mean_token_accuracy": 0.8991851061582565, | |
| "num_tokens": 37757188.0, | |
| "step": 553 | |
| }, | |
| { | |
| "epoch": 1.1193124368048535, | |
| "grad_norm": 0.17098551988601685, | |
| "learning_rate": 0.00030031560635976557, | |
| "loss": 0.2809, | |
| "mean_token_accuracy": 0.8985818810760975, | |
| "num_tokens": 37822088.0, | |
| "step": 554 | |
| }, | |
| { | |
| "epoch": 1.121334681496461, | |
| "grad_norm": 0.16426457464694977, | |
| "learning_rate": 0.0002999639755996731, | |
| "loss": 0.271, | |
| "mean_token_accuracy": 0.9015116766095161, | |
| "num_tokens": 37885778.0, | |
| "step": 555 | |
| }, | |
| { | |
| "epoch": 1.1233569261880687, | |
| "grad_norm": 0.16016022861003876, | |
| "learning_rate": 0.00029961196423942027, | |
| "loss": 0.2436, | |
| "mean_token_accuracy": 0.9075723215937614, | |
| "num_tokens": 37956105.0, | |
| "step": 556 | |
| }, | |
| { | |
| "epoch": 1.1253791708796763, | |
| "grad_norm": 0.17624878883361816, | |
| "learning_rate": 0.0002992595739544563, | |
| "loss": 0.2851, | |
| "mean_token_accuracy": 0.8980127796530724, | |
| "num_tokens": 38022057.0, | |
| "step": 557 | |
| }, | |
| { | |
| "epoch": 1.1274014155712841, | |
| "grad_norm": 0.2018936723470688, | |
| "learning_rate": 0.00029890680642203395, | |
| "loss": 0.2971, | |
| "mean_token_accuracy": 0.8927877955138683, | |
| "num_tokens": 38088320.0, | |
| "step": 558 | |
| }, | |
| { | |
| "epoch": 1.1294236602628918, | |
| "grad_norm": 0.19130869209766388, | |
| "learning_rate": 0.0002985536633212016, | |
| "loss": 0.2797, | |
| "mean_token_accuracy": 0.8997831009328365, | |
| "num_tokens": 38149395.0, | |
| "step": 559 | |
| }, | |
| { | |
| "epoch": 1.1314459049544996, | |
| "grad_norm": 0.19779284298419952, | |
| "learning_rate": 0.0002982001463327951, | |
| "loss": 0.3127, | |
| "mean_token_accuracy": 0.8897297792136669, | |
| "num_tokens": 38211779.0, | |
| "step": 560 | |
| }, | |
| { | |
| "epoch": 1.1334681496461072, | |
| "grad_norm": 0.1628047674894333, | |
| "learning_rate": 0.0002978462571394299, | |
| "loss": 0.2637, | |
| "mean_token_accuracy": 0.9051007218658924, | |
| "num_tokens": 38279919.0, | |
| "step": 561 | |
| }, | |
| { | |
| "epoch": 1.1354903943377148, | |
| "grad_norm": 0.1489226073026657, | |
| "learning_rate": 0.00029749199742549315, | |
| "loss": 0.2525, | |
| "mean_token_accuracy": 0.9131556376814842, | |
| "num_tokens": 38348885.0, | |
| "step": 562 | |
| }, | |
| { | |
| "epoch": 1.1375126390293224, | |
| "grad_norm": 0.16562367975711823, | |
| "learning_rate": 0.0002971373688771353, | |
| "loss": 0.2804, | |
| "mean_token_accuracy": 0.9060126468539238, | |
| "num_tokens": 38414361.0, | |
| "step": 563 | |
| }, | |
| { | |
| "epoch": 1.1395348837209303, | |
| "grad_norm": 0.18426918983459473, | |
| "learning_rate": 0.00029678237318226254, | |
| "loss": 0.3034, | |
| "mean_token_accuracy": 0.8923818841576576, | |
| "num_tokens": 38478031.0, | |
| "step": 564 | |
| }, | |
| { | |
| "epoch": 1.1415571284125379, | |
| "grad_norm": 0.18996812403202057, | |
| "learning_rate": 0.0002964270120305284, | |
| "loss": 0.3118, | |
| "mean_token_accuracy": 0.8920970819890499, | |
| "num_tokens": 38537650.0, | |
| "step": 565 | |
| }, | |
| { | |
| "epoch": 1.1435793731041457, | |
| "grad_norm": 0.1744386851787567, | |
| "learning_rate": 0.0002960712871133259, | |
| "loss": 0.3105, | |
| "mean_token_accuracy": 0.8955930359661579, | |
| "num_tokens": 38599799.0, | |
| "step": 566 | |
| }, | |
| { | |
| "epoch": 1.1456016177957533, | |
| "grad_norm": 0.1756746470928192, | |
| "learning_rate": 0.0002957152001237796, | |
| "loss": 0.2879, | |
| "mean_token_accuracy": 0.8998842090368271, | |
| "num_tokens": 38665696.0, | |
| "step": 567 | |
| }, | |
| { | |
| "epoch": 1.147623862487361, | |
| "grad_norm": 0.17731311917304993, | |
| "learning_rate": 0.00029535875275673706, | |
| "loss": 0.3028, | |
| "mean_token_accuracy": 0.896138958632946, | |
| "num_tokens": 38736012.0, | |
| "step": 568 | |
| }, | |
| { | |
| "epoch": 1.1496461071789685, | |
| "grad_norm": 0.16211020946502686, | |
| "learning_rate": 0.00029500194670876155, | |
| "loss": 0.2661, | |
| "mean_token_accuracy": 0.9007462747395039, | |
| "num_tokens": 38813042.0, | |
| "step": 569 | |
| }, | |
| { | |
| "epoch": 1.1516683518705764, | |
| "grad_norm": 0.16605907678604126, | |
| "learning_rate": 0.00029464478367812304, | |
| "loss": 0.2708, | |
| "mean_token_accuracy": 0.9033683091402054, | |
| "num_tokens": 38884323.0, | |
| "step": 570 | |
| }, | |
| { | |
| "epoch": 1.153690596562184, | |
| "grad_norm": 0.16346529126167297, | |
| "learning_rate": 0.0002942872653647911, | |
| "loss": 0.2787, | |
| "mean_token_accuracy": 0.8993464335799217, | |
| "num_tokens": 38954581.0, | |
| "step": 571 | |
| }, | |
| { | |
| "epoch": 1.1557128412537918, | |
| "grad_norm": 0.1715569943189621, | |
| "learning_rate": 0.0002939293934704259, | |
| "loss": 0.2876, | |
| "mean_token_accuracy": 0.899021927267313, | |
| "num_tokens": 39024859.0, | |
| "step": 572 | |
| }, | |
| { | |
| "epoch": 1.1577350859453994, | |
| "grad_norm": 0.1708040088415146, | |
| "learning_rate": 0.00029357116969837093, | |
| "loss": 0.2716, | |
| "mean_token_accuracy": 0.9040286540985107, | |
| "num_tokens": 39084032.0, | |
| "step": 573 | |
| }, | |
| { | |
| "epoch": 1.159757330637007, | |
| "grad_norm": 0.15547077357769012, | |
| "learning_rate": 0.00029321259575364406, | |
| "loss": 0.2876, | |
| "mean_token_accuracy": 0.9014556109905243, | |
| "num_tokens": 39158216.0, | |
| "step": 574 | |
| }, | |
| { | |
| "epoch": 1.1617795753286146, | |
| "grad_norm": 0.1835734099149704, | |
| "learning_rate": 0.0002928536733429302, | |
| "loss": 0.2904, | |
| "mean_token_accuracy": 0.8962517976760864, | |
| "num_tokens": 39219228.0, | |
| "step": 575 | |
| }, | |
| { | |
| "epoch": 1.1638018200202225, | |
| "grad_norm": 0.21164695918560028, | |
| "learning_rate": 0.00029249440417457274, | |
| "loss": 0.3095, | |
| "mean_token_accuracy": 0.8903193324804306, | |
| "num_tokens": 39279145.0, | |
| "step": 576 | |
| }, | |
| { | |
| "epoch": 1.16582406471183, | |
| "grad_norm": 0.16395002603530884, | |
| "learning_rate": 0.00029213478995856535, | |
| "loss": 0.2658, | |
| "mean_token_accuracy": 0.9063084498047829, | |
| "num_tokens": 39346035.0, | |
| "step": 577 | |
| }, | |
| { | |
| "epoch": 1.167846309403438, | |
| "grad_norm": 0.15447662770748138, | |
| "learning_rate": 0.0002917748324065443, | |
| "loss": 0.2609, | |
| "mean_token_accuracy": 0.9043813906610012, | |
| "num_tokens": 39419464.0, | |
| "step": 578 | |
| }, | |
| { | |
| "epoch": 1.1698685540950455, | |
| "grad_norm": 0.18628905713558197, | |
| "learning_rate": 0.0002914145332317798, | |
| "loss": 0.3079, | |
| "mean_token_accuracy": 0.892396155744791, | |
| "num_tokens": 39476986.0, | |
| "step": 579 | |
| }, | |
| { | |
| "epoch": 1.1718907987866531, | |
| "grad_norm": 0.15657448768615723, | |
| "learning_rate": 0.0002910538941491681, | |
| "loss": 0.2596, | |
| "mean_token_accuracy": 0.9103246405720711, | |
| "num_tokens": 39547007.0, | |
| "step": 580 | |
| }, | |
| { | |
| "epoch": 1.1739130434782608, | |
| "grad_norm": 0.16723878681659698, | |
| "learning_rate": 0.00029069291687522337, | |
| "loss": 0.2578, | |
| "mean_token_accuracy": 0.9113052189350128, | |
| "num_tokens": 39615140.0, | |
| "step": 581 | |
| }, | |
| { | |
| "epoch": 1.1759352881698686, | |
| "grad_norm": 0.21382521092891693, | |
| "learning_rate": 0.00029033160312806925, | |
| "loss": 0.2843, | |
| "mean_token_accuracy": 0.9006746262311935, | |
| "num_tokens": 39676629.0, | |
| "step": 582 | |
| }, | |
| { | |
| "epoch": 1.1779575328614762, | |
| "grad_norm": 0.17140787839889526, | |
| "learning_rate": 0.0002899699546274312, | |
| "loss": 0.2973, | |
| "mean_token_accuracy": 0.8942140191793442, | |
| "num_tokens": 39744182.0, | |
| "step": 583 | |
| }, | |
| { | |
| "epoch": 1.179979777553084, | |
| "grad_norm": 0.16415606439113617, | |
| "learning_rate": 0.0002896079730946277, | |
| "loss": 0.248, | |
| "mean_token_accuracy": 0.9046668969094753, | |
| "num_tokens": 39809087.0, | |
| "step": 584 | |
| }, | |
| { | |
| "epoch": 1.1820020222446916, | |
| "grad_norm": 0.15275758504867554, | |
| "learning_rate": 0.0002892456602525625, | |
| "loss": 0.2528, | |
| "mean_token_accuracy": 0.9055165685713291, | |
| "num_tokens": 39883376.0, | |
| "step": 585 | |
| }, | |
| { | |
| "epoch": 1.1840242669362993, | |
| "grad_norm": 0.1598130762577057, | |
| "learning_rate": 0.00028888301782571614, | |
| "loss": 0.2571, | |
| "mean_token_accuracy": 0.9055753275752068, | |
| "num_tokens": 39950688.0, | |
| "step": 586 | |
| }, | |
| { | |
| "epoch": 1.1860465116279069, | |
| "grad_norm": 0.16630232334136963, | |
| "learning_rate": 0.000288520047540138, | |
| "loss": 0.2857, | |
| "mean_token_accuracy": 0.9000633843243122, | |
| "num_tokens": 40015260.0, | |
| "step": 587 | |
| }, | |
| { | |
| "epoch": 1.1880687563195147, | |
| "grad_norm": 0.19941283762454987, | |
| "learning_rate": 0.00028815675112343794, | |
| "loss": 0.2954, | |
| "mean_token_accuracy": 0.8945838250219822, | |
| "num_tokens": 40079394.0, | |
| "step": 588 | |
| }, | |
| { | |
| "epoch": 1.1900910010111223, | |
| "grad_norm": 0.19106529653072357, | |
| "learning_rate": 0.00028779313030477793, | |
| "loss": 0.3112, | |
| "mean_token_accuracy": 0.8897448740899563, | |
| "num_tokens": 40144909.0, | |
| "step": 589 | |
| }, | |
| { | |
| "epoch": 1.1921132457027301, | |
| "grad_norm": 0.17041806876659393, | |
| "learning_rate": 0.0002874291868148642, | |
| "loss": 0.2819, | |
| "mean_token_accuracy": 0.8990175537765026, | |
| "num_tokens": 40217254.0, | |
| "step": 590 | |
| }, | |
| { | |
| "epoch": 1.1941354903943378, | |
| "grad_norm": 0.16470171511173248, | |
| "learning_rate": 0.0002870649223859386, | |
| "loss": 0.2773, | |
| "mean_token_accuracy": 0.9041831828653812, | |
| "num_tokens": 40280417.0, | |
| "step": 591 | |
| }, | |
| { | |
| "epoch": 1.1961577350859454, | |
| "grad_norm": 0.1665530502796173, | |
| "learning_rate": 0.00028670033875177053, | |
| "loss": 0.2663, | |
| "mean_token_accuracy": 0.9013455249369144, | |
| "num_tokens": 40350231.0, | |
| "step": 592 | |
| }, | |
| { | |
| "epoch": 1.198179979777553, | |
| "grad_norm": 0.19251202046871185, | |
| "learning_rate": 0.00028633543764764894, | |
| "loss": 0.3157, | |
| "mean_token_accuracy": 0.8875606693327427, | |
| "num_tokens": 40413686.0, | |
| "step": 593 | |
| }, | |
| { | |
| "epoch": 1.2002022244691608, | |
| "grad_norm": 0.17525707185268402, | |
| "learning_rate": 0.00028597022081037354, | |
| "loss": 0.2933, | |
| "mean_token_accuracy": 0.8971122018992901, | |
| "num_tokens": 40479649.0, | |
| "step": 594 | |
| }, | |
| { | |
| "epoch": 1.2022244691607684, | |
| "grad_norm": 0.19120153784751892, | |
| "learning_rate": 0.000285604689978247, | |
| "loss": 0.275, | |
| "mean_token_accuracy": 0.8998171053826809, | |
| "num_tokens": 40548513.0, | |
| "step": 595 | |
| }, | |
| { | |
| "epoch": 1.2042467138523762, | |
| "grad_norm": 0.15362586081027985, | |
| "learning_rate": 0.0002852388468910663, | |
| "loss": 0.2655, | |
| "mean_token_accuracy": 0.9043829254806042, | |
| "num_tokens": 40621501.0, | |
| "step": 596 | |
| }, | |
| { | |
| "epoch": 1.2062689585439839, | |
| "grad_norm": 0.1648460179567337, | |
| "learning_rate": 0.00028487269329011497, | |
| "loss": 0.2765, | |
| "mean_token_accuracy": 0.9020786061882973, | |
| "num_tokens": 40696483.0, | |
| "step": 597 | |
| }, | |
| { | |
| "epoch": 1.2082912032355915, | |
| "grad_norm": 0.1793263554573059, | |
| "learning_rate": 0.000284506230918154, | |
| "loss": 0.2914, | |
| "mean_token_accuracy": 0.8994336612522602, | |
| "num_tokens": 40765538.0, | |
| "step": 598 | |
| }, | |
| { | |
| "epoch": 1.210313447927199, | |
| "grad_norm": 0.17354300618171692, | |
| "learning_rate": 0.00028413946151941463, | |
| "loss": 0.2929, | |
| "mean_token_accuracy": 0.9005281217396259, | |
| "num_tokens": 40833551.0, | |
| "step": 599 | |
| }, | |
| { | |
| "epoch": 1.212335692618807, | |
| "grad_norm": 0.1781807243824005, | |
| "learning_rate": 0.00028377238683958885, | |
| "loss": 0.2849, | |
| "mean_token_accuracy": 0.8987740390002728, | |
| "num_tokens": 40895246.0, | |
| "step": 600 | |
| }, | |
| { | |
| "epoch": 1.2143579373104145, | |
| "grad_norm": 0.16701123118400574, | |
| "learning_rate": 0.0002834050086258221, | |
| "loss": 0.2607, | |
| "mean_token_accuracy": 0.9041876047849655, | |
| "num_tokens": 40964580.0, | |
| "step": 601 | |
| }, | |
| { | |
| "epoch": 1.2163801820020224, | |
| "grad_norm": 0.15654708445072174, | |
| "learning_rate": 0.00028303732862670417, | |
| "loss": 0.2702, | |
| "mean_token_accuracy": 0.9014758616685867, | |
| "num_tokens": 41039130.0, | |
| "step": 602 | |
| }, | |
| { | |
| "epoch": 1.21840242669363, | |
| "grad_norm": 0.18177339434623718, | |
| "learning_rate": 0.0002826693485922616, | |
| "loss": 0.2701, | |
| "mean_token_accuracy": 0.9032718986272812, | |
| "num_tokens": 41095473.0, | |
| "step": 603 | |
| }, | |
| { | |
| "epoch": 1.2204246713852376, | |
| "grad_norm": 0.16560594737529755, | |
| "learning_rate": 0.00028230107027394876, | |
| "loss": 0.2939, | |
| "mean_token_accuracy": 0.8934713453054428, | |
| "num_tokens": 41157491.0, | |
| "step": 604 | |
| }, | |
| { | |
| "epoch": 1.2224469160768452, | |
| "grad_norm": 0.18375754356384277, | |
| "learning_rate": 0.00028193249542463977, | |
| "loss": 0.2909, | |
| "mean_token_accuracy": 0.8953644298017025, | |
| "num_tokens": 41225218.0, | |
| "step": 605 | |
| }, | |
| { | |
| "epoch": 1.224469160768453, | |
| "grad_norm": 0.14936794340610504, | |
| "learning_rate": 0.0002815636257986204, | |
| "loss": 0.2539, | |
| "mean_token_accuracy": 0.9058601558208466, | |
| "num_tokens": 41307770.0, | |
| "step": 606 | |
| }, | |
| { | |
| "epoch": 1.2264914054600606, | |
| "grad_norm": 0.16326607763767242, | |
| "learning_rate": 0.00028119446315157896, | |
| "loss": 0.2507, | |
| "mean_token_accuracy": 0.9078186601400375, | |
| "num_tokens": 41371178.0, | |
| "step": 607 | |
| }, | |
| { | |
| "epoch": 1.2285136501516685, | |
| "grad_norm": 0.16785994172096252, | |
| "learning_rate": 0.0002808250092405989, | |
| "loss": 0.2589, | |
| "mean_token_accuracy": 0.9010850116610527, | |
| "num_tokens": 41444090.0, | |
| "step": 608 | |
| }, | |
| { | |
| "epoch": 1.230535894843276, | |
| "grad_norm": 0.17225563526153564, | |
| "learning_rate": 0.0002804552658241496, | |
| "loss": 0.2667, | |
| "mean_token_accuracy": 0.9027063623070717, | |
| "num_tokens": 41512243.0, | |
| "step": 609 | |
| }, | |
| { | |
| "epoch": 1.2325581395348837, | |
| "grad_norm": 0.16818945109844208, | |
| "learning_rate": 0.0002800852346620788, | |
| "loss": 0.2704, | |
| "mean_token_accuracy": 0.9012492336332798, | |
| "num_tokens": 41582048.0, | |
| "step": 610 | |
| }, | |
| { | |
| "epoch": 1.2345803842264913, | |
| "grad_norm": 0.1885753571987152, | |
| "learning_rate": 0.00027971491751560345, | |
| "loss": 0.2859, | |
| "mean_token_accuracy": 0.8967389948666096, | |
| "num_tokens": 41646351.0, | |
| "step": 611 | |
| }, | |
| { | |
| "epoch": 1.2366026289180991, | |
| "grad_norm": 0.15571804344654083, | |
| "learning_rate": 0.0002793443161473017, | |
| "loss": 0.2707, | |
| "mean_token_accuracy": 0.9040926285088062, | |
| "num_tokens": 41715042.0, | |
| "step": 612 | |
| }, | |
| { | |
| "epoch": 1.2386248736097067, | |
| "grad_norm": 0.1665385216474533, | |
| "learning_rate": 0.0002789734323211048, | |
| "loss": 0.2633, | |
| "mean_token_accuracy": 0.9024609327316284, | |
| "num_tokens": 41787021.0, | |
| "step": 613 | |
| }, | |
| { | |
| "epoch": 1.2406471183013146, | |
| "grad_norm": 0.17233288288116455, | |
| "learning_rate": 0.0002786022678022882, | |
| "loss": 0.3058, | |
| "mean_token_accuracy": 0.8898206166923046, | |
| "num_tokens": 41851767.0, | |
| "step": 614 | |
| }, | |
| { | |
| "epoch": 1.2426693629929222, | |
| "grad_norm": 0.1737981140613556, | |
| "learning_rate": 0.0002782308243574633, | |
| "loss": 0.2933, | |
| "mean_token_accuracy": 0.8971287794411182, | |
| "num_tokens": 41914797.0, | |
| "step": 615 | |
| }, | |
| { | |
| "epoch": 1.2446916076845298, | |
| "grad_norm": 0.16172519326210022, | |
| "learning_rate": 0.0002778591037545691, | |
| "loss": 0.2665, | |
| "mean_token_accuracy": 0.9057141467928886, | |
| "num_tokens": 41986868.0, | |
| "step": 616 | |
| }, | |
| { | |
| "epoch": 1.2467138523761374, | |
| "grad_norm": 0.15280866622924805, | |
| "learning_rate": 0.0002774871077628639, | |
| "loss": 0.2688, | |
| "mean_token_accuracy": 0.9038811773061752, | |
| "num_tokens": 42062995.0, | |
| "step": 617 | |
| }, | |
| { | |
| "epoch": 1.2487360970677452, | |
| "grad_norm": 0.17397160828113556, | |
| "learning_rate": 0.0002771148381529166, | |
| "loss": 0.2863, | |
| "mean_token_accuracy": 0.8941488154232502, | |
| "num_tokens": 42124939.0, | |
| "step": 618 | |
| }, | |
| { | |
| "epoch": 1.2507583417593529, | |
| "grad_norm": 0.1617380529642105, | |
| "learning_rate": 0.00027674229669659856, | |
| "loss": 0.2536, | |
| "mean_token_accuracy": 0.9045982100069523, | |
| "num_tokens": 42194011.0, | |
| "step": 619 | |
| }, | |
| { | |
| "epoch": 1.2527805864509607, | |
| "grad_norm": 0.15885986387729645, | |
| "learning_rate": 0.0002763694851670749, | |
| "loss": 0.2703, | |
| "mean_token_accuracy": 0.9061401709914207, | |
| "num_tokens": 42265919.0, | |
| "step": 620 | |
| }, | |
| { | |
| "epoch": 1.2548028311425683, | |
| "grad_norm": 0.16419966518878937, | |
| "learning_rate": 0.00027599640533879636, | |
| "loss": 0.2769, | |
| "mean_token_accuracy": 0.9034353755414486, | |
| "num_tokens": 42334638.0, | |
| "step": 621 | |
| }, | |
| { | |
| "epoch": 1.256825075834176, | |
| "grad_norm": 0.16629813611507416, | |
| "learning_rate": 0.0002756230589874905, | |
| "loss": 0.2687, | |
| "mean_token_accuracy": 0.9030461423099041, | |
| "num_tokens": 42404575.0, | |
| "step": 622 | |
| }, | |
| { | |
| "epoch": 1.2588473205257835, | |
| "grad_norm": 0.17728988826274872, | |
| "learning_rate": 0.00027524944789015366, | |
| "loss": 0.2751, | |
| "mean_token_accuracy": 0.9014569260179996, | |
| "num_tokens": 42475814.0, | |
| "step": 623 | |
| }, | |
| { | |
| "epoch": 1.2608695652173914, | |
| "grad_norm": 0.17427091300487518, | |
| "learning_rate": 0.00027487557382504195, | |
| "loss": 0.2657, | |
| "mean_token_accuracy": 0.9044037610292435, | |
| "num_tokens": 42543660.0, | |
| "step": 624 | |
| }, | |
| { | |
| "epoch": 1.262891809908999, | |
| "grad_norm": 0.1894424855709076, | |
| "learning_rate": 0.00027450143857166344, | |
| "loss": 0.2969, | |
| "mean_token_accuracy": 0.8965917490422726, | |
| "num_tokens": 42607124.0, | |
| "step": 625 | |
| }, | |
| { | |
| "epoch": 1.2649140546006068, | |
| "grad_norm": 0.15993963181972504, | |
| "learning_rate": 0.00027412704391076916, | |
| "loss": 0.2782, | |
| "mean_token_accuracy": 0.9031428508460522, | |
| "num_tokens": 42676066.0, | |
| "step": 626 | |
| }, | |
| { | |
| "epoch": 1.2669362992922144, | |
| "grad_norm": 0.17840322852134705, | |
| "learning_rate": 0.00027375239162434503, | |
| "loss": 0.2688, | |
| "mean_token_accuracy": 0.9015723317861557, | |
| "num_tokens": 42746212.0, | |
| "step": 627 | |
| }, | |
| { | |
| "epoch": 1.268958543983822, | |
| "grad_norm": 0.20184557139873505, | |
| "learning_rate": 0.00027337748349560276, | |
| "loss": 0.2963, | |
| "mean_token_accuracy": 0.8969193771481514, | |
| "num_tokens": 42803557.0, | |
| "step": 628 | |
| }, | |
| { | |
| "epoch": 1.2709807886754296, | |
| "grad_norm": 0.16635443270206451, | |
| "learning_rate": 0.0002730023213089724, | |
| "loss": 0.2884, | |
| "mean_token_accuracy": 0.8960177823901176, | |
| "num_tokens": 42866158.0, | |
| "step": 629 | |
| }, | |
| { | |
| "epoch": 1.2730030333670375, | |
| "grad_norm": 0.19960255920886993, | |
| "learning_rate": 0.0002726269068500926, | |
| "loss": 0.2841, | |
| "mean_token_accuracy": 0.8968143723905087, | |
| "num_tokens": 42927025.0, | |
| "step": 630 | |
| }, | |
| { | |
| "epoch": 1.275025278058645, | |
| "grad_norm": 0.1719711273908615, | |
| "learning_rate": 0.0002722512419058032, | |
| "loss": 0.2728, | |
| "mean_token_accuracy": 0.9018568396568298, | |
| "num_tokens": 43007744.0, | |
| "step": 631 | |
| }, | |
| { | |
| "epoch": 1.277047522750253, | |
| "grad_norm": 0.17668215930461884, | |
| "learning_rate": 0.00027187532826413607, | |
| "loss": 0.2683, | |
| "mean_token_accuracy": 0.9023380614817142, | |
| "num_tokens": 43071417.0, | |
| "step": 632 | |
| }, | |
| { | |
| "epoch": 1.2790697674418605, | |
| "grad_norm": 0.17645464837551117, | |
| "learning_rate": 0.00027149916771430677, | |
| "loss": 0.2787, | |
| "mean_token_accuracy": 0.9030827060341835, | |
| "num_tokens": 43143504.0, | |
| "step": 633 | |
| }, | |
| { | |
| "epoch": 1.2810920121334681, | |
| "grad_norm": 0.18298184871673584, | |
| "learning_rate": 0.00027112276204670617, | |
| "loss": 0.2886, | |
| "mean_token_accuracy": 0.8980408012866974, | |
| "num_tokens": 43219433.0, | |
| "step": 634 | |
| }, | |
| { | |
| "epoch": 1.2831142568250757, | |
| "grad_norm": 0.15996871888637543, | |
| "learning_rate": 0.00027074611305289147, | |
| "loss": 0.2622, | |
| "mean_token_accuracy": 0.902827687561512, | |
| "num_tokens": 43286472.0, | |
| "step": 635 | |
| }, | |
| { | |
| "epoch": 1.2851365015166836, | |
| "grad_norm": 0.1937190294265747, | |
| "learning_rate": 0.00027036922252557865, | |
| "loss": 0.2937, | |
| "mean_token_accuracy": 0.897728331387043, | |
| "num_tokens": 43346390.0, | |
| "step": 636 | |
| }, | |
| { | |
| "epoch": 1.2871587462082912, | |
| "grad_norm": 0.17584164440631866, | |
| "learning_rate": 0.00026999209225863263, | |
| "loss": 0.2896, | |
| "mean_token_accuracy": 0.897246178239584, | |
| "num_tokens": 43413853.0, | |
| "step": 637 | |
| }, | |
| { | |
| "epoch": 1.289180990899899, | |
| "grad_norm": 0.17733249068260193, | |
| "learning_rate": 0.0002696147240470598, | |
| "loss": 0.2882, | |
| "mean_token_accuracy": 0.8957457803189754, | |
| "num_tokens": 43478722.0, | |
| "step": 638 | |
| }, | |
| { | |
| "epoch": 1.2912032355915066, | |
| "grad_norm": 0.17890246212482452, | |
| "learning_rate": 0.0002692371196869992, | |
| "loss": 0.288, | |
| "mean_token_accuracy": 0.8960468098521233, | |
| "num_tokens": 43540378.0, | |
| "step": 639 | |
| }, | |
| { | |
| "epoch": 1.2932254802831142, | |
| "grad_norm": 0.15859632194042206, | |
| "learning_rate": 0.0002688592809757134, | |
| "loss": 0.2792, | |
| "mean_token_accuracy": 0.9036918766796589, | |
| "num_tokens": 43612284.0, | |
| "step": 640 | |
| }, | |
| { | |
| "epoch": 1.2952477249747218, | |
| "grad_norm": 0.16566091775894165, | |
| "learning_rate": 0.0002684812097115808, | |
| "loss": 0.2785, | |
| "mean_token_accuracy": 0.9012075029313564, | |
| "num_tokens": 43677352.0, | |
| "step": 641 | |
| }, | |
| { | |
| "epoch": 1.2972699696663297, | |
| "grad_norm": 0.17786841094493866, | |
| "learning_rate": 0.0002681029076940862, | |
| "loss": 0.2911, | |
| "mean_token_accuracy": 0.9009424708783627, | |
| "num_tokens": 43739163.0, | |
| "step": 642 | |
| }, | |
| { | |
| "epoch": 1.2992922143579373, | |
| "grad_norm": 0.15567278861999512, | |
| "learning_rate": 0.0002677243767238135, | |
| "loss": 0.2591, | |
| "mean_token_accuracy": 0.9091448336839676, | |
| "num_tokens": 43819970.0, | |
| "step": 643 | |
| }, | |
| { | |
| "epoch": 1.3013144590495451, | |
| "grad_norm": 0.20501317083835602, | |
| "learning_rate": 0.00026734561860243544, | |
| "loss": 0.3186, | |
| "mean_token_accuracy": 0.8898426368832588, | |
| "num_tokens": 43879943.0, | |
| "step": 644 | |
| }, | |
| { | |
| "epoch": 1.3033367037411527, | |
| "grad_norm": 0.18259315192699432, | |
| "learning_rate": 0.0002669666351327066, | |
| "loss": 0.2772, | |
| "mean_token_accuracy": 0.8982793055474758, | |
| "num_tokens": 43941000.0, | |
| "step": 645 | |
| }, | |
| { | |
| "epoch": 1.3053589484327603, | |
| "grad_norm": 0.18504492938518524, | |
| "learning_rate": 0.00026658742811845376, | |
| "loss": 0.2905, | |
| "mean_token_accuracy": 0.896319292485714, | |
| "num_tokens": 44000567.0, | |
| "step": 646 | |
| }, | |
| { | |
| "epoch": 1.307381193124368, | |
| "grad_norm": 0.17783911526203156, | |
| "learning_rate": 0.00026620799936456774, | |
| "loss": 0.2813, | |
| "mean_token_accuracy": 0.9009971134364605, | |
| "num_tokens": 44071352.0, | |
| "step": 647 | |
| }, | |
| { | |
| "epoch": 1.3094034378159758, | |
| "grad_norm": 0.21716438233852386, | |
| "learning_rate": 0.00026582835067699495, | |
| "loss": 0.2906, | |
| "mean_token_accuracy": 0.8958504274487495, | |
| "num_tokens": 44129790.0, | |
| "step": 648 | |
| }, | |
| { | |
| "epoch": 1.3114256825075834, | |
| "grad_norm": 0.1822315752506256, | |
| "learning_rate": 0.0002654484838627284, | |
| "loss": 0.2867, | |
| "mean_token_accuracy": 0.9037492237985134, | |
| "num_tokens": 44195417.0, | |
| "step": 649 | |
| }, | |
| { | |
| "epoch": 1.3134479271991912, | |
| "grad_norm": 0.15820986032485962, | |
| "learning_rate": 0.00026506840072979947, | |
| "loss": 0.2546, | |
| "mean_token_accuracy": 0.9098224155604839, | |
| "num_tokens": 44273153.0, | |
| "step": 650 | |
| }, | |
| { | |
| "epoch": 1.3154701718907988, | |
| "grad_norm": 0.1899651139974594, | |
| "learning_rate": 0.00026468810308726893, | |
| "loss": 0.28, | |
| "mean_token_accuracy": 0.8995106518268585, | |
| "num_tokens": 44349738.0, | |
| "step": 651 | |
| }, | |
| { | |
| "epoch": 1.3174924165824065, | |
| "grad_norm": 0.18798086047172546, | |
| "learning_rate": 0.00026430759274521877, | |
| "loss": 0.2964, | |
| "mean_token_accuracy": 0.8899718299508095, | |
| "num_tokens": 44415133.0, | |
| "step": 652 | |
| }, | |
| { | |
| "epoch": 1.319514661274014, | |
| "grad_norm": 0.13753436505794525, | |
| "learning_rate": 0.0002639268715147432, | |
| "loss": 0.2307, | |
| "mean_token_accuracy": 0.9101770743727684, | |
| "num_tokens": 44484697.0, | |
| "step": 653 | |
| }, | |
| { | |
| "epoch": 1.321536905965622, | |
| "grad_norm": 0.20119944214820862, | |
| "learning_rate": 0.00026354594120794016, | |
| "loss": 0.2926, | |
| "mean_token_accuracy": 0.897066742181778, | |
| "num_tokens": 44551987.0, | |
| "step": 654 | |
| }, | |
| { | |
| "epoch": 1.3235591506572295, | |
| "grad_norm": 0.18725383281707764, | |
| "learning_rate": 0.000263164803637903, | |
| "loss": 0.2742, | |
| "mean_token_accuracy": 0.9033515900373459, | |
| "num_tokens": 44617511.0, | |
| "step": 655 | |
| }, | |
| { | |
| "epoch": 1.3255813953488373, | |
| "grad_norm": 0.15222612023353577, | |
| "learning_rate": 0.0002627834606187112, | |
| "loss": 0.2518, | |
| "mean_token_accuracy": 0.9108999036252499, | |
| "num_tokens": 44698150.0, | |
| "step": 656 | |
| }, | |
| { | |
| "epoch": 1.327603640040445, | |
| "grad_norm": 0.16968220472335815, | |
| "learning_rate": 0.0002624019139654223, | |
| "loss": 0.2834, | |
| "mean_token_accuracy": 0.9003202244639397, | |
| "num_tokens": 44769993.0, | |
| "step": 657 | |
| }, | |
| { | |
| "epoch": 1.3296258847320526, | |
| "grad_norm": 0.1526424139738083, | |
| "learning_rate": 0.000262020165494063, | |
| "loss": 0.2493, | |
| "mean_token_accuracy": 0.9069892205297947, | |
| "num_tokens": 44848710.0, | |
| "step": 658 | |
| }, | |
| { | |
| "epoch": 1.3316481294236602, | |
| "grad_norm": 0.16174714267253876, | |
| "learning_rate": 0.00026163821702162074, | |
| "loss": 0.2581, | |
| "mean_token_accuracy": 0.9058538265526295, | |
| "num_tokens": 44932916.0, | |
| "step": 659 | |
| }, | |
| { | |
| "epoch": 1.333670374115268, | |
| "grad_norm": 0.18540237843990326, | |
| "learning_rate": 0.0002612560703660346, | |
| "loss": 0.2823, | |
| "mean_token_accuracy": 0.9005630798637867, | |
| "num_tokens": 44997865.0, | |
| "step": 660 | |
| }, | |
| { | |
| "epoch": 1.3356926188068756, | |
| "grad_norm": 0.145268976688385, | |
| "learning_rate": 0.0002608737273461872, | |
| "loss": 0.2402, | |
| "mean_token_accuracy": 0.9093809016048908, | |
| "num_tokens": 45074165.0, | |
| "step": 661 | |
| }, | |
| { | |
| "epoch": 1.3377148634984835, | |
| "grad_norm": 0.16983529925346375, | |
| "learning_rate": 0.0002604911897818957, | |
| "loss": 0.2763, | |
| "mean_token_accuracy": 0.9002145752310753, | |
| "num_tokens": 45140578.0, | |
| "step": 662 | |
| }, | |
| { | |
| "epoch": 1.339737108190091, | |
| "grad_norm": 0.18206650018692017, | |
| "learning_rate": 0.00026010845949390326, | |
| "loss": 0.271, | |
| "mean_token_accuracy": 0.9040128998458385, | |
| "num_tokens": 45206573.0, | |
| "step": 663 | |
| }, | |
| { | |
| "epoch": 1.3417593528816987, | |
| "grad_norm": 0.17423690855503082, | |
| "learning_rate": 0.00025972553830387027, | |
| "loss": 0.276, | |
| "mean_token_accuracy": 0.9035660028457642, | |
| "num_tokens": 45273772.0, | |
| "step": 664 | |
| }, | |
| { | |
| "epoch": 1.3437815975733063, | |
| "grad_norm": 0.17948757112026215, | |
| "learning_rate": 0.0002593424280343656, | |
| "loss": 0.3073, | |
| "mean_token_accuracy": 0.8898307755589485, | |
| "num_tokens": 45333260.0, | |
| "step": 665 | |
| }, | |
| { | |
| "epoch": 1.3458038422649141, | |
| "grad_norm": 0.1973046064376831, | |
| "learning_rate": 0.0002589591305088585, | |
| "loss": 0.298, | |
| "mean_token_accuracy": 0.8946604765951633, | |
| "num_tokens": 45397184.0, | |
| "step": 666 | |
| }, | |
| { | |
| "epoch": 1.3478260869565217, | |
| "grad_norm": 0.16013695299625397, | |
| "learning_rate": 0.0002585756475517092, | |
| "loss": 0.2698, | |
| "mean_token_accuracy": 0.905727930366993, | |
| "num_tokens": 45478638.0, | |
| "step": 667 | |
| }, | |
| { | |
| "epoch": 1.3498483316481296, | |
| "grad_norm": 0.1567625254392624, | |
| "learning_rate": 0.00025819198098816034, | |
| "loss": 0.2765, | |
| "mean_token_accuracy": 0.9000396579504013, | |
| "num_tokens": 45548715.0, | |
| "step": 668 | |
| }, | |
| { | |
| "epoch": 1.3518705763397372, | |
| "grad_norm": 0.16354252398014069, | |
| "learning_rate": 0.00025780813264432884, | |
| "loss": 0.2659, | |
| "mean_token_accuracy": 0.9028089232742786, | |
| "num_tokens": 45624018.0, | |
| "step": 669 | |
| }, | |
| { | |
| "epoch": 1.3538928210313448, | |
| "grad_norm": 0.19890683889389038, | |
| "learning_rate": 0.0002574241043471967, | |
| "loss": 0.3082, | |
| "mean_token_accuracy": 0.89163389056921, | |
| "num_tokens": 45692190.0, | |
| "step": 670 | |
| }, | |
| { | |
| "epoch": 1.3559150657229524, | |
| "grad_norm": 0.1480788290500641, | |
| "learning_rate": 0.0002570398979246023, | |
| "loss": 0.2605, | |
| "mean_token_accuracy": 0.905091181397438, | |
| "num_tokens": 45771127.0, | |
| "step": 671 | |
| }, | |
| { | |
| "epoch": 1.3579373104145602, | |
| "grad_norm": 0.17679338157176971, | |
| "learning_rate": 0.00025665551520523194, | |
| "loss": 0.2831, | |
| "mean_token_accuracy": 0.8965117931365967, | |
| "num_tokens": 45835910.0, | |
| "step": 672 | |
| }, | |
| { | |
| "epoch": 1.3599595551061678, | |
| "grad_norm": 0.17713719606399536, | |
| "learning_rate": 0.00025627095801861107, | |
| "loss": 0.2905, | |
| "mean_token_accuracy": 0.8971158005297184, | |
| "num_tokens": 45901225.0, | |
| "step": 673 | |
| }, | |
| { | |
| "epoch": 1.3619817997977754, | |
| "grad_norm": 0.17695656418800354, | |
| "learning_rate": 0.0002558862281950955, | |
| "loss": 0.3268, | |
| "mean_token_accuracy": 0.8890945613384247, | |
| "num_tokens": 45972893.0, | |
| "step": 674 | |
| }, | |
| { | |
| "epoch": 1.3640040444893833, | |
| "grad_norm": 0.15022985637187958, | |
| "learning_rate": 0.0002555013275658627, | |
| "loss": 0.28, | |
| "mean_token_accuracy": 0.9022598974406719, | |
| "num_tokens": 46053862.0, | |
| "step": 675 | |
| }, | |
| { | |
| "epoch": 1.366026289180991, | |
| "grad_norm": 0.16728746891021729, | |
| "learning_rate": 0.0002551162579629031, | |
| "loss": 0.2735, | |
| "mean_token_accuracy": 0.9004092961549759, | |
| "num_tokens": 46123535.0, | |
| "step": 676 | |
| }, | |
| { | |
| "epoch": 1.3680485338725985, | |
| "grad_norm": 0.17287185788154602, | |
| "learning_rate": 0.0002547310212190115, | |
| "loss": 0.2803, | |
| "mean_token_accuracy": 0.8980144336819649, | |
| "num_tokens": 46193498.0, | |
| "step": 677 | |
| }, | |
| { | |
| "epoch": 1.3700707785642063, | |
| "grad_norm": 0.184726744890213, | |
| "learning_rate": 0.0002543456191677781, | |
| "loss": 0.2927, | |
| "mean_token_accuracy": 0.8962498530745506, | |
| "num_tokens": 46261698.0, | |
| "step": 678 | |
| }, | |
| { | |
| "epoch": 1.372093023255814, | |
| "grad_norm": 0.15757699310779572, | |
| "learning_rate": 0.00025396005364357994, | |
| "loss": 0.2809, | |
| "mean_token_accuracy": 0.8978969343006611, | |
| "num_tokens": 46329372.0, | |
| "step": 679 | |
| }, | |
| { | |
| "epoch": 1.3741152679474216, | |
| "grad_norm": 0.18496832251548767, | |
| "learning_rate": 0.0002535743264815723, | |
| "loss": 0.2948, | |
| "mean_token_accuracy": 0.8964893855154514, | |
| "num_tokens": 46389989.0, | |
| "step": 680 | |
| }, | |
| { | |
| "epoch": 1.3761375126390294, | |
| "grad_norm": 0.19771555066108704, | |
| "learning_rate": 0.0002531884395176794, | |
| "loss": 0.3045, | |
| "mean_token_accuracy": 0.8947297558188438, | |
| "num_tokens": 46451529.0, | |
| "step": 681 | |
| }, | |
| { | |
| "epoch": 1.378159757330637, | |
| "grad_norm": 0.1643752008676529, | |
| "learning_rate": 0.0002528023945885866, | |
| "loss": 0.2691, | |
| "mean_token_accuracy": 0.9002487845718861, | |
| "num_tokens": 46518234.0, | |
| "step": 682 | |
| }, | |
| { | |
| "epoch": 1.3801820020222446, | |
| "grad_norm": 0.15709805488586426, | |
| "learning_rate": 0.00025241619353173056, | |
| "loss": 0.2517, | |
| "mean_token_accuracy": 0.9091945327818394, | |
| "num_tokens": 46590312.0, | |
| "step": 683 | |
| }, | |
| { | |
| "epoch": 1.3822042467138524, | |
| "grad_norm": 0.17834722995758057, | |
| "learning_rate": 0.00025202983818529154, | |
| "loss": 0.294, | |
| "mean_token_accuracy": 0.8986290767788887, | |
| "num_tokens": 46658404.0, | |
| "step": 684 | |
| }, | |
| { | |
| "epoch": 1.38422649140546, | |
| "grad_norm": 0.15814678370952606, | |
| "learning_rate": 0.00025164333038818384, | |
| "loss": 0.2708, | |
| "mean_token_accuracy": 0.9031675830483437, | |
| "num_tokens": 46724887.0, | |
| "step": 685 | |
| }, | |
| { | |
| "epoch": 1.3862487360970677, | |
| "grad_norm": 0.17998504638671875, | |
| "learning_rate": 0.0002512566719800475, | |
| "loss": 0.2856, | |
| "mean_token_accuracy": 0.89876314625144, | |
| "num_tokens": 46795038.0, | |
| "step": 686 | |
| }, | |
| { | |
| "epoch": 1.3882709807886755, | |
| "grad_norm": 0.17202328145503998, | |
| "learning_rate": 0.0002508698648012394, | |
| "loss": 0.2965, | |
| "mean_token_accuracy": 0.8947253711521626, | |
| "num_tokens": 46856174.0, | |
| "step": 687 | |
| }, | |
| { | |
| "epoch": 1.3902932254802831, | |
| "grad_norm": 0.16402584314346313, | |
| "learning_rate": 0.00025048291069282443, | |
| "loss": 0.2633, | |
| "mean_token_accuracy": 0.9063729159533978, | |
| "num_tokens": 46925752.0, | |
| "step": 688 | |
| }, | |
| { | |
| "epoch": 1.3923154701718907, | |
| "grad_norm": 0.19435186684131622, | |
| "learning_rate": 0.00025009581149656703, | |
| "loss": 0.2756, | |
| "mean_token_accuracy": 0.9030190780758858, | |
| "num_tokens": 46993260.0, | |
| "step": 689 | |
| }, | |
| { | |
| "epoch": 1.3943377148634986, | |
| "grad_norm": 0.18806155025959015, | |
| "learning_rate": 0.000249708569054922, | |
| "loss": 0.3033, | |
| "mean_token_accuracy": 0.896921843290329, | |
| "num_tokens": 47060294.0, | |
| "step": 690 | |
| }, | |
| { | |
| "epoch": 1.3963599595551062, | |
| "grad_norm": 0.19206839799880981, | |
| "learning_rate": 0.000249321185211026, | |
| "loss": 0.282, | |
| "mean_token_accuracy": 0.8990140780806541, | |
| "num_tokens": 47123248.0, | |
| "step": 691 | |
| }, | |
| { | |
| "epoch": 1.3983822042467138, | |
| "grad_norm": 0.16943977773189545, | |
| "learning_rate": 0.00024893366180868875, | |
| "loss": 0.2728, | |
| "mean_token_accuracy": 0.9020564220845699, | |
| "num_tokens": 47185179.0, | |
| "step": 692 | |
| }, | |
| { | |
| "epoch": 1.4004044489383216, | |
| "grad_norm": 0.1619652956724167, | |
| "learning_rate": 0.00024854600069238407, | |
| "loss": 0.2728, | |
| "mean_token_accuracy": 0.9024368785321712, | |
| "num_tokens": 47259239.0, | |
| "step": 693 | |
| }, | |
| { | |
| "epoch": 1.4024266936299292, | |
| "grad_norm": 0.17677046358585358, | |
| "learning_rate": 0.00024815820370724156, | |
| "loss": 0.2697, | |
| "mean_token_accuracy": 0.90378213301301, | |
| "num_tokens": 47322333.0, | |
| "step": 694 | |
| }, | |
| { | |
| "epoch": 1.4044489383215368, | |
| "grad_norm": 0.15612858533859253, | |
| "learning_rate": 0.0002477702726990372, | |
| "loss": 0.2826, | |
| "mean_token_accuracy": 0.9020431824028492, | |
| "num_tokens": 47391001.0, | |
| "step": 695 | |
| }, | |
| { | |
| "epoch": 1.4064711830131447, | |
| "grad_norm": 0.16640524566173553, | |
| "learning_rate": 0.000247382209514185, | |
| "loss": 0.2948, | |
| "mean_token_accuracy": 0.8942111246287823, | |
| "num_tokens": 47455737.0, | |
| "step": 696 | |
| }, | |
| { | |
| "epoch": 1.4084934277047523, | |
| "grad_norm": 0.16898459196090698, | |
| "learning_rate": 0.0002469940159997281, | |
| "loss": 0.2687, | |
| "mean_token_accuracy": 0.9056588634848595, | |
| "num_tokens": 47525615.0, | |
| "step": 697 | |
| }, | |
| { | |
| "epoch": 1.4105156723963599, | |
| "grad_norm": 0.18844769895076752, | |
| "learning_rate": 0.00024660569400332996, | |
| "loss": 0.2946, | |
| "mean_token_accuracy": 0.895747821778059, | |
| "num_tokens": 47592079.0, | |
| "step": 698 | |
| }, | |
| { | |
| "epoch": 1.4125379170879677, | |
| "grad_norm": 0.16074754297733307, | |
| "learning_rate": 0.00024621724537326545, | |
| "loss": 0.2831, | |
| "mean_token_accuracy": 0.9034741893410683, | |
| "num_tokens": 47667233.0, | |
| "step": 699 | |
| }, | |
| { | |
| "epoch": 1.4145601617795753, | |
| "grad_norm": 0.16710326075553894, | |
| "learning_rate": 0.00024582867195841227, | |
| "loss": 0.2863, | |
| "mean_token_accuracy": 0.9007730670273304, | |
| "num_tokens": 47743310.0, | |
| "step": 700 | |
| }, | |
| { | |
| "epoch": 1.416582406471183, | |
| "grad_norm": 0.18456129729747772, | |
| "learning_rate": 0.0002454399756082422, | |
| "loss": 0.2765, | |
| "mean_token_accuracy": 0.8989297412335873, | |
| "num_tokens": 47804656.0, | |
| "step": 701 | |
| }, | |
| { | |
| "epoch": 1.4186046511627908, | |
| "grad_norm": 0.14485791325569153, | |
| "learning_rate": 0.0002450511581728118, | |
| "loss": 0.2378, | |
| "mean_token_accuracy": 0.9135924205183983, | |
| "num_tokens": 47877505.0, | |
| "step": 702 | |
| }, | |
| { | |
| "epoch": 1.4206268958543984, | |
| "grad_norm": 0.16109082102775574, | |
| "learning_rate": 0.00024466222150275427, | |
| "loss": 0.2701, | |
| "mean_token_accuracy": 0.9057381004095078, | |
| "num_tokens": 47947797.0, | |
| "step": 703 | |
| }, | |
| { | |
| "epoch": 1.422649140546006, | |
| "grad_norm": 0.17397062480449677, | |
| "learning_rate": 0.00024427316744927015, | |
| "loss": 0.2748, | |
| "mean_token_accuracy": 0.9010849967598915, | |
| "num_tokens": 48013032.0, | |
| "step": 704 | |
| }, | |
| { | |
| "epoch": 1.4246713852376138, | |
| "grad_norm": 0.17228464782238007, | |
| "learning_rate": 0.0002438839978641188, | |
| "loss": 0.2902, | |
| "mean_token_accuracy": 0.8968134559690952, | |
| "num_tokens": 48077137.0, | |
| "step": 705 | |
| }, | |
| { | |
| "epoch": 1.4266936299292214, | |
| "grad_norm": 0.15708769857883453, | |
| "learning_rate": 0.00024349471459960933, | |
| "loss": 0.2639, | |
| "mean_token_accuracy": 0.9076020307838917, | |
| "num_tokens": 48148193.0, | |
| "step": 706 | |
| }, | |
| { | |
| "epoch": 1.428715874620829, | |
| "grad_norm": 0.16323234140872955, | |
| "learning_rate": 0.000243105319508592, | |
| "loss": 0.2767, | |
| "mean_token_accuracy": 0.9031167514622211, | |
| "num_tokens": 48216944.0, | |
| "step": 707 | |
| }, | |
| { | |
| "epoch": 1.4307381193124369, | |
| "grad_norm": 0.19718225300312042, | |
| "learning_rate": 0.00024271581444444936, | |
| "loss": 0.2857, | |
| "mean_token_accuracy": 0.8991989493370056, | |
| "num_tokens": 48289278.0, | |
| "step": 708 | |
| }, | |
| { | |
| "epoch": 1.4327603640040445, | |
| "grad_norm": 0.18652518093585968, | |
| "learning_rate": 0.0002423262012610874, | |
| "loss": 0.2761, | |
| "mean_token_accuracy": 0.8964316956698895, | |
| "num_tokens": 48356711.0, | |
| "step": 709 | |
| }, | |
| { | |
| "epoch": 1.434782608695652, | |
| "grad_norm": 0.15871575474739075, | |
| "learning_rate": 0.00024193648181292657, | |
| "loss": 0.2667, | |
| "mean_token_accuracy": 0.9004132300615311, | |
| "num_tokens": 48431698.0, | |
| "step": 710 | |
| }, | |
| { | |
| "epoch": 1.43680485338726, | |
| "grad_norm": 0.1658415049314499, | |
| "learning_rate": 0.00024154665795489324, | |
| "loss": 0.2923, | |
| "mean_token_accuracy": 0.8983742482960224, | |
| "num_tokens": 48499782.0, | |
| "step": 711 | |
| }, | |
| { | |
| "epoch": 1.4388270980788676, | |
| "grad_norm": 0.14790105819702148, | |
| "learning_rate": 0.00024115673154241082, | |
| "loss": 0.2752, | |
| "mean_token_accuracy": 0.9012794457376003, | |
| "num_tokens": 48575015.0, | |
| "step": 712 | |
| }, | |
| { | |
| "epoch": 1.4408493427704752, | |
| "grad_norm": 0.1578913778066635, | |
| "learning_rate": 0.00024076670443139056, | |
| "loss": 0.2717, | |
| "mean_token_accuracy": 0.9049608968198299, | |
| "num_tokens": 48645644.0, | |
| "step": 713 | |
| }, | |
| { | |
| "epoch": 1.442871587462083, | |
| "grad_norm": 0.14726778864860535, | |
| "learning_rate": 0.00024037657847822327, | |
| "loss": 0.2472, | |
| "mean_token_accuracy": 0.9099989496171474, | |
| "num_tokens": 48721939.0, | |
| "step": 714 | |
| }, | |
| { | |
| "epoch": 1.4448938321536906, | |
| "grad_norm": 0.1682555377483368, | |
| "learning_rate": 0.00023998635553977, | |
| "loss": 0.255, | |
| "mean_token_accuracy": 0.9088139645755291, | |
| "num_tokens": 48781700.0, | |
| "step": 715 | |
| }, | |
| { | |
| "epoch": 1.4469160768452982, | |
| "grad_norm": 0.1937257945537567, | |
| "learning_rate": 0.00023959603747335364, | |
| "loss": 0.2787, | |
| "mean_token_accuracy": 0.9022819362580776, | |
| "num_tokens": 48848209.0, | |
| "step": 716 | |
| }, | |
| { | |
| "epoch": 1.448938321536906, | |
| "grad_norm": 0.18163816630840302, | |
| "learning_rate": 0.0002392056261367497, | |
| "loss": 0.2603, | |
| "mean_token_accuracy": 0.9066541865468025, | |
| "num_tokens": 48908683.0, | |
| "step": 717 | |
| }, | |
| { | |
| "epoch": 1.4509605662285137, | |
| "grad_norm": 0.17626726627349854, | |
| "learning_rate": 0.00023881512338817763, | |
| "loss": 0.2719, | |
| "mean_token_accuracy": 0.9030824415385723, | |
| "num_tokens": 48971539.0, | |
| "step": 718 | |
| }, | |
| { | |
| "epoch": 1.4529828109201213, | |
| "grad_norm": 0.19325651228427887, | |
| "learning_rate": 0.00023842453108629207, | |
| "loss": 0.2825, | |
| "mean_token_accuracy": 0.9008334875106812, | |
| "num_tokens": 49036641.0, | |
| "step": 719 | |
| }, | |
| { | |
| "epoch": 1.4550050556117289, | |
| "grad_norm": 0.15112407505512238, | |
| "learning_rate": 0.00023803385109017375, | |
| "loss": 0.2491, | |
| "mean_token_accuracy": 0.908204834908247, | |
| "num_tokens": 49116609.0, | |
| "step": 720 | |
| }, | |
| { | |
| "epoch": 1.4570273003033367, | |
| "grad_norm": 0.1619442254304886, | |
| "learning_rate": 0.000237643085259321, | |
| "loss": 0.2674, | |
| "mean_token_accuracy": 0.9027148932218552, | |
| "num_tokens": 49184904.0, | |
| "step": 721 | |
| }, | |
| { | |
| "epoch": 1.4590495449949443, | |
| "grad_norm": 0.18082739412784576, | |
| "learning_rate": 0.00023725223545364036, | |
| "loss": 0.2897, | |
| "mean_token_accuracy": 0.8995592929422855, | |
| "num_tokens": 49242882.0, | |
| "step": 722 | |
| }, | |
| { | |
| "epoch": 1.4610717896865522, | |
| "grad_norm": 0.16797882318496704, | |
| "learning_rate": 0.00023686130353343842, | |
| "loss": 0.2752, | |
| "mean_token_accuracy": 0.9008001163601875, | |
| "num_tokens": 49314113.0, | |
| "step": 723 | |
| }, | |
| { | |
| "epoch": 1.4630940343781598, | |
| "grad_norm": 0.16804397106170654, | |
| "learning_rate": 0.00023647029135941247, | |
| "loss": 0.28, | |
| "mean_token_accuracy": 0.9004204832017422, | |
| "num_tokens": 49380492.0, | |
| "step": 724 | |
| }, | |
| { | |
| "epoch": 1.4651162790697674, | |
| "grad_norm": 0.189345121383667, | |
| "learning_rate": 0.00023607920079264164, | |
| "loss": 0.3136, | |
| "mean_token_accuracy": 0.8898900300264359, | |
| "num_tokens": 49442489.0, | |
| "step": 725 | |
| }, | |
| { | |
| "epoch": 1.467138523761375, | |
| "grad_norm": 0.1601288765668869, | |
| "learning_rate": 0.0002356880336945785, | |
| "loss": 0.2766, | |
| "mean_token_accuracy": 0.8993977271020412, | |
| "num_tokens": 49515310.0, | |
| "step": 726 | |
| }, | |
| { | |
| "epoch": 1.4691607684529828, | |
| "grad_norm": 0.16616767644882202, | |
| "learning_rate": 0.00023529679192703956, | |
| "loss": 0.2233, | |
| "mean_token_accuracy": 0.9060333073139191, | |
| "num_tokens": 49579141.0, | |
| "step": 727 | |
| }, | |
| { | |
| "epoch": 1.4711830131445904, | |
| "grad_norm": 0.17813973128795624, | |
| "learning_rate": 0.00023490547735219682, | |
| "loss": 0.2772, | |
| "mean_token_accuracy": 0.902538850903511, | |
| "num_tokens": 49651616.0, | |
| "step": 728 | |
| }, | |
| { | |
| "epoch": 1.4732052578361983, | |
| "grad_norm": 0.16227717697620392, | |
| "learning_rate": 0.0002345140918325689, | |
| "loss": 0.2725, | |
| "mean_token_accuracy": 0.9031726457178593, | |
| "num_tokens": 49723462.0, | |
| "step": 729 | |
| }, | |
| { | |
| "epoch": 1.4752275025278059, | |
| "grad_norm": 0.17003865540027618, | |
| "learning_rate": 0.00023412263723101214, | |
| "loss": 0.2961, | |
| "mean_token_accuracy": 0.8977791368961334, | |
| "num_tokens": 49787491.0, | |
| "step": 730 | |
| }, | |
| { | |
| "epoch": 1.4772497472194135, | |
| "grad_norm": 0.16923342645168304, | |
| "learning_rate": 0.0002337311154107115, | |
| "loss": 0.2787, | |
| "mean_token_accuracy": 0.9015961550176144, | |
| "num_tokens": 49854833.0, | |
| "step": 731 | |
| }, | |
| { | |
| "epoch": 1.479271991911021, | |
| "grad_norm": 0.1851927489042282, | |
| "learning_rate": 0.00023333952823517194, | |
| "loss": 0.2898, | |
| "mean_token_accuracy": 0.8972079865634441, | |
| "num_tokens": 49922341.0, | |
| "step": 732 | |
| }, | |
| { | |
| "epoch": 1.481294236602629, | |
| "grad_norm": 0.1822906881570816, | |
| "learning_rate": 0.0002329478775682095, | |
| "loss": 0.2829, | |
| "mean_token_accuracy": 0.900902509689331, | |
| "num_tokens": 49979729.0, | |
| "step": 733 | |
| }, | |
| { | |
| "epoch": 1.4833164812942365, | |
| "grad_norm": 0.1649109125137329, | |
| "learning_rate": 0.00023255616527394256, | |
| "loss": 0.2727, | |
| "mean_token_accuracy": 0.9016978107392788, | |
| "num_tokens": 50047775.0, | |
| "step": 734 | |
| }, | |
| { | |
| "epoch": 1.4853387259858444, | |
| "grad_norm": 0.1738775372505188, | |
| "learning_rate": 0.00023216439321678266, | |
| "loss": 0.281, | |
| "mean_token_accuracy": 0.9027018882334232, | |
| "num_tokens": 50118326.0, | |
| "step": 735 | |
| }, | |
| { | |
| "epoch": 1.487360970677452, | |
| "grad_norm": 0.1651855707168579, | |
| "learning_rate": 0.00023177256326142577, | |
| "loss": 0.2885, | |
| "mean_token_accuracy": 0.9000568836927414, | |
| "num_tokens": 50188336.0, | |
| "step": 736 | |
| }, | |
| { | |
| "epoch": 1.4893832153690596, | |
| "grad_norm": 0.17814993858337402, | |
| "learning_rate": 0.00023138067727284352, | |
| "loss": 0.2649, | |
| "mean_token_accuracy": 0.9053604751825333, | |
| "num_tokens": 50253602.0, | |
| "step": 737 | |
| }, | |
| { | |
| "epoch": 1.4914054600606672, | |
| "grad_norm": 0.18156695365905762, | |
| "learning_rate": 0.00023098873711627427, | |
| "loss": 0.2789, | |
| "mean_token_accuracy": 0.9026945792138577, | |
| "num_tokens": 50320254.0, | |
| "step": 738 | |
| }, | |
| { | |
| "epoch": 1.493427704752275, | |
| "grad_norm": 0.1529979407787323, | |
| "learning_rate": 0.00023059674465721402, | |
| "loss": 0.2575, | |
| "mean_token_accuracy": 0.9098235592246056, | |
| "num_tokens": 50394210.0, | |
| "step": 739 | |
| }, | |
| { | |
| "epoch": 1.4954499494438827, | |
| "grad_norm": 0.18546129763126373, | |
| "learning_rate": 0.000230204701761408, | |
| "loss": 0.2723, | |
| "mean_token_accuracy": 0.9047368690371513, | |
| "num_tokens": 50462482.0, | |
| "step": 740 | |
| }, | |
| { | |
| "epoch": 1.4974721941354905, | |
| "grad_norm": 0.17348864674568176, | |
| "learning_rate": 0.00022981261029484117, | |
| "loss": 0.2877, | |
| "mean_token_accuracy": 0.9010139890015125, | |
| "num_tokens": 50533752.0, | |
| "step": 741 | |
| }, | |
| { | |
| "epoch": 1.499494438827098, | |
| "grad_norm": 0.18445433676242828, | |
| "learning_rate": 0.00022942047212372996, | |
| "loss": 0.2889, | |
| "mean_token_accuracy": 0.8973320014774799, | |
| "num_tokens": 50595611.0, | |
| "step": 742 | |
| }, | |
| { | |
| "epoch": 1.5015166835187057, | |
| "grad_norm": 0.1771615445613861, | |
| "learning_rate": 0.00022902828911451284, | |
| "loss": 0.2869, | |
| "mean_token_accuracy": 0.9018849320709705, | |
| "num_tokens": 50660163.0, | |
| "step": 743 | |
| }, | |
| { | |
| "epoch": 1.5035389282103133, | |
| "grad_norm": 0.17673981189727783, | |
| "learning_rate": 0.00022863606313384193, | |
| "loss": 0.2745, | |
| "mean_token_accuracy": 0.9061728455126286, | |
| "num_tokens": 50735476.0, | |
| "step": 744 | |
| }, | |
| { | |
| "epoch": 1.5055611729019212, | |
| "grad_norm": 0.16728192567825317, | |
| "learning_rate": 0.00022824379604857376, | |
| "loss": 0.27, | |
| "mean_token_accuracy": 0.8988127410411835, | |
| "num_tokens": 50802788.0, | |
| "step": 745 | |
| }, | |
| { | |
| "epoch": 1.5075834175935288, | |
| "grad_norm": 0.15720367431640625, | |
| "learning_rate": 0.0002278514897257605, | |
| "loss": 0.2768, | |
| "mean_token_accuracy": 0.903729647397995, | |
| "num_tokens": 50871752.0, | |
| "step": 746 | |
| }, | |
| { | |
| "epoch": 1.5096056622851366, | |
| "grad_norm": 0.1581096202135086, | |
| "learning_rate": 0.00022745914603264114, | |
| "loss": 0.2782, | |
| "mean_token_accuracy": 0.9031247049570084, | |
| "num_tokens": 50946163.0, | |
| "step": 747 | |
| }, | |
| { | |
| "epoch": 1.5116279069767442, | |
| "grad_norm": 0.16542676091194153, | |
| "learning_rate": 0.00022706676683663239, | |
| "loss": 0.2615, | |
| "mean_token_accuracy": 0.9070020318031311, | |
| "num_tokens": 51020476.0, | |
| "step": 748 | |
| }, | |
| { | |
| "epoch": 1.5136501516683518, | |
| "grad_norm": 0.15188099443912506, | |
| "learning_rate": 0.00022667435400532013, | |
| "loss": 0.2683, | |
| "mean_token_accuracy": 0.9043072015047073, | |
| "num_tokens": 51099534.0, | |
| "step": 749 | |
| }, | |
| { | |
| "epoch": 1.5156723963599594, | |
| "grad_norm": 0.16521647572517395, | |
| "learning_rate": 0.00022628190940645023, | |
| "loss": 0.2762, | |
| "mean_token_accuracy": 0.9001554064452648, | |
| "num_tokens": 51160512.0, | |
| "step": 750 | |
| }, | |
| { | |
| "epoch": 1.5176946410515673, | |
| "grad_norm": 0.14251260459423065, | |
| "learning_rate": 0.00022588943490791974, | |
| "loss": 0.2354, | |
| "mean_token_accuracy": 0.9080785401165485, | |
| "num_tokens": 51240154.0, | |
| "step": 751 | |
| }, | |
| { | |
| "epoch": 1.5197168857431749, | |
| "grad_norm": 0.18312643468379974, | |
| "learning_rate": 0.00022549693237776812, | |
| "loss": 0.2882, | |
| "mean_token_accuracy": 0.896622322499752, | |
| "num_tokens": 51306825.0, | |
| "step": 752 | |
| }, | |
| { | |
| "epoch": 1.5217391304347827, | |
| "grad_norm": 0.1863006204366684, | |
| "learning_rate": 0.00022510440368416813, | |
| "loss": 0.2827, | |
| "mean_token_accuracy": 0.9015981592237949, | |
| "num_tokens": 51374019.0, | |
| "step": 753 | |
| }, | |
| { | |
| "epoch": 1.5237613751263903, | |
| "grad_norm": 0.2003999501466751, | |
| "learning_rate": 0.0002247118506954172, | |
| "loss": 0.2999, | |
| "mean_token_accuracy": 0.8948666267096996, | |
| "num_tokens": 51437280.0, | |
| "step": 754 | |
| }, | |
| { | |
| "epoch": 1.525783619817998, | |
| "grad_norm": 0.15196073055267334, | |
| "learning_rate": 0.00022431927527992822, | |
| "loss": 0.2457, | |
| "mean_token_accuracy": 0.9064719304442406, | |
| "num_tokens": 51516774.0, | |
| "step": 755 | |
| }, | |
| { | |
| "epoch": 1.5278058645096055, | |
| "grad_norm": 0.16066138446331024, | |
| "learning_rate": 0.00022392667930622105, | |
| "loss": 0.2567, | |
| "mean_token_accuracy": 0.9101277217268944, | |
| "num_tokens": 51587203.0, | |
| "step": 756 | |
| }, | |
| { | |
| "epoch": 1.5298281092012134, | |
| "grad_norm": 0.2019067108631134, | |
| "learning_rate": 0.0002235340646429131, | |
| "loss": 0.288, | |
| "mean_token_accuracy": 0.8997247666120529, | |
| "num_tokens": 51647601.0, | |
| "step": 757 | |
| }, | |
| { | |
| "epoch": 1.531850353892821, | |
| "grad_norm": 0.1630539447069168, | |
| "learning_rate": 0.00022314143315871107, | |
| "loss": 0.2839, | |
| "mean_token_accuracy": 0.9003589190542698, | |
| "num_tokens": 51709791.0, | |
| "step": 758 | |
| }, | |
| { | |
| "epoch": 1.5338725985844288, | |
| "grad_norm": 0.17768684029579163, | |
| "learning_rate": 0.0002227487867224014, | |
| "loss": 0.2953, | |
| "mean_token_accuracy": 0.8950943425297737, | |
| "num_tokens": 51775485.0, | |
| "step": 759 | |
| }, | |
| { | |
| "epoch": 1.5358948432760364, | |
| "grad_norm": 0.16720645129680634, | |
| "learning_rate": 0.000222356127202842, | |
| "loss": 0.268, | |
| "mean_token_accuracy": 0.9044617936015129, | |
| "num_tokens": 51840213.0, | |
| "step": 760 | |
| }, | |
| { | |
| "epoch": 1.537917087967644, | |
| "grad_norm": 0.18721389770507812, | |
| "learning_rate": 0.00022196345646895282, | |
| "loss": 0.3132, | |
| "mean_token_accuracy": 0.8925869949162006, | |
| "num_tokens": 51902000.0, | |
| "step": 761 | |
| }, | |
| { | |
| "epoch": 1.5399393326592516, | |
| "grad_norm": 0.16676832735538483, | |
| "learning_rate": 0.00022157077638970733, | |
| "loss": 0.2685, | |
| "mean_token_accuracy": 0.9057548753917217, | |
| "num_tokens": 51971547.0, | |
| "step": 762 | |
| }, | |
| { | |
| "epoch": 1.5419615773508595, | |
| "grad_norm": 0.17367734014987946, | |
| "learning_rate": 0.00022117808883412337, | |
| "loss": 0.2919, | |
| "mean_token_accuracy": 0.8966298326849937, | |
| "num_tokens": 52041743.0, | |
| "step": 763 | |
| }, | |
| { | |
| "epoch": 1.543983822042467, | |
| "grad_norm": 0.15831947326660156, | |
| "learning_rate": 0.0002207853956712544, | |
| "loss": 0.2713, | |
| "mean_token_accuracy": 0.9037296660244465, | |
| "num_tokens": 52114445.0, | |
| "step": 764 | |
| }, | |
| { | |
| "epoch": 1.546006066734075, | |
| "grad_norm": 0.1643955409526825, | |
| "learning_rate": 0.00022039269877018066, | |
| "loss": 0.2555, | |
| "mean_token_accuracy": 0.9053449369966984, | |
| "num_tokens": 52184749.0, | |
| "step": 765 | |
| }, | |
| { | |
| "epoch": 1.5480283114256825, | |
| "grad_norm": 0.19596439599990845, | |
| "learning_rate": 0.00022000000000000003, | |
| "loss": 0.2991, | |
| "mean_token_accuracy": 0.8983559235930443, | |
| "num_tokens": 52246858.0, | |
| "step": 766 | |
| }, | |
| { | |
| "epoch": 1.5500505561172901, | |
| "grad_norm": 0.17947359383106232, | |
| "learning_rate": 0.00021960730122981938, | |
| "loss": 0.3053, | |
| "mean_token_accuracy": 0.894125934690237, | |
| "num_tokens": 52311538.0, | |
| "step": 767 | |
| }, | |
| { | |
| "epoch": 1.5520728008088978, | |
| "grad_norm": 0.1566184163093567, | |
| "learning_rate": 0.00021921460432874565, | |
| "loss": 0.2471, | |
| "mean_token_accuracy": 0.9079805836081505, | |
| "num_tokens": 52377316.0, | |
| "step": 768 | |
| }, | |
| { | |
| "epoch": 1.5540950455005056, | |
| "grad_norm": 0.1782991886138916, | |
| "learning_rate": 0.0002188219111658767, | |
| "loss": 0.293, | |
| "mean_token_accuracy": 0.8960098177194595, | |
| "num_tokens": 52439738.0, | |
| "step": 769 | |
| }, | |
| { | |
| "epoch": 1.5561172901921132, | |
| "grad_norm": 0.1581069380044937, | |
| "learning_rate": 0.0002184292236102927, | |
| "loss": 0.2728, | |
| "mean_token_accuracy": 0.901589822024107, | |
| "num_tokens": 52511123.0, | |
| "step": 770 | |
| }, | |
| { | |
| "epoch": 1.558139534883721, | |
| "grad_norm": 0.16994433104991913, | |
| "learning_rate": 0.0002180365435310472, | |
| "loss": 0.2735, | |
| "mean_token_accuracy": 0.9033331945538521, | |
| "num_tokens": 52576097.0, | |
| "step": 771 | |
| }, | |
| { | |
| "epoch": 1.5601617795753286, | |
| "grad_norm": 0.1678851991891861, | |
| "learning_rate": 0.00021764387279715806, | |
| "loss": 0.2903, | |
| "mean_token_accuracy": 0.8981217853724957, | |
| "num_tokens": 52651544.0, | |
| "step": 772 | |
| }, | |
| { | |
| "epoch": 1.5621840242669363, | |
| "grad_norm": 0.19909563660621643, | |
| "learning_rate": 0.00021725121327759866, | |
| "loss": 0.2981, | |
| "mean_token_accuracy": 0.8940173611044884, | |
| "num_tokens": 52710252.0, | |
| "step": 773 | |
| }, | |
| { | |
| "epoch": 1.5642062689585439, | |
| "grad_norm": 0.15204082429409027, | |
| "learning_rate": 0.00021685856684128897, | |
| "loss": 0.2523, | |
| "mean_token_accuracy": 0.9075472876429558, | |
| "num_tokens": 52781084.0, | |
| "step": 774 | |
| }, | |
| { | |
| "epoch": 1.5662285136501517, | |
| "grad_norm": 0.19516132771968842, | |
| "learning_rate": 0.00021646593535708695, | |
| "loss": 0.2984, | |
| "mean_token_accuracy": 0.8923540487885475, | |
| "num_tokens": 52844889.0, | |
| "step": 775 | |
| }, | |
| { | |
| "epoch": 1.5682507583417593, | |
| "grad_norm": 0.16001375019550323, | |
| "learning_rate": 0.00021607332069377902, | |
| "loss": 0.2668, | |
| "mean_token_accuracy": 0.898894976824522, | |
| "num_tokens": 52910879.0, | |
| "step": 776 | |
| }, | |
| { | |
| "epoch": 1.5702730030333671, | |
| "grad_norm": 0.1823982298374176, | |
| "learning_rate": 0.00021568072472007185, | |
| "loss": 0.301, | |
| "mean_token_accuracy": 0.8939221948385239, | |
| "num_tokens": 52970597.0, | |
| "step": 777 | |
| }, | |
| { | |
| "epoch": 1.5722952477249748, | |
| "grad_norm": 0.17761389911174774, | |
| "learning_rate": 0.0002152881493045829, | |
| "loss": 0.2601, | |
| "mean_token_accuracy": 0.9084571748971939, | |
| "num_tokens": 53042768.0, | |
| "step": 778 | |
| }, | |
| { | |
| "epoch": 1.5743174924165824, | |
| "grad_norm": 0.17410063743591309, | |
| "learning_rate": 0.00021489559631583194, | |
| "loss": 0.272, | |
| "mean_token_accuracy": 0.8999650180339813, | |
| "num_tokens": 53103091.0, | |
| "step": 779 | |
| }, | |
| { | |
| "epoch": 1.57633973710819, | |
| "grad_norm": 0.15084944665431976, | |
| "learning_rate": 0.00021450306762223198, | |
| "loss": 0.2387, | |
| "mean_token_accuracy": 0.9114542976021767, | |
| "num_tokens": 53180173.0, | |
| "step": 780 | |
| }, | |
| { | |
| "epoch": 1.5783619817997978, | |
| "grad_norm": 0.18222583830356598, | |
| "learning_rate": 0.00021411056509208033, | |
| "loss": 0.2994, | |
| "mean_token_accuracy": 0.8931626752018929, | |
| "num_tokens": 53243670.0, | |
| "step": 781 | |
| }, | |
| { | |
| "epoch": 1.5803842264914054, | |
| "grad_norm": 0.19381971657276154, | |
| "learning_rate": 0.0002137180905935499, | |
| "loss": 0.3116, | |
| "mean_token_accuracy": 0.8938373290002346, | |
| "num_tokens": 53301834.0, | |
| "step": 782 | |
| }, | |
| { | |
| "epoch": 1.5824064711830133, | |
| "grad_norm": 0.157192200422287, | |
| "learning_rate": 0.00021332564599467997, | |
| "loss": 0.2654, | |
| "mean_token_accuracy": 0.9061449654400349, | |
| "num_tokens": 53368342.0, | |
| "step": 783 | |
| }, | |
| { | |
| "epoch": 1.5844287158746209, | |
| "grad_norm": 0.1753574013710022, | |
| "learning_rate": 0.00021293323316336774, | |
| "loss": 0.2793, | |
| "mean_token_accuracy": 0.901081707328558, | |
| "num_tokens": 53430874.0, | |
| "step": 784 | |
| }, | |
| { | |
| "epoch": 1.5864509605662285, | |
| "grad_norm": 0.17201204597949982, | |
| "learning_rate": 0.00021254085396735895, | |
| "loss": 0.2965, | |
| "mean_token_accuracy": 0.8940661884844303, | |
| "num_tokens": 53503559.0, | |
| "step": 785 | |
| }, | |
| { | |
| "epoch": 1.588473205257836, | |
| "grad_norm": 0.16792644560337067, | |
| "learning_rate": 0.00021214851027423953, | |
| "loss": 0.2853, | |
| "mean_token_accuracy": 0.8981418162584305, | |
| "num_tokens": 53570685.0, | |
| "step": 786 | |
| }, | |
| { | |
| "epoch": 1.590495449949444, | |
| "grad_norm": 0.1627027541399002, | |
| "learning_rate": 0.00021175620395142631, | |
| "loss": 0.2726, | |
| "mean_token_accuracy": 0.9035519734025002, | |
| "num_tokens": 53641626.0, | |
| "step": 787 | |
| }, | |
| { | |
| "epoch": 1.5925176946410515, | |
| "grad_norm": 0.1961667835712433, | |
| "learning_rate": 0.00021136393686615814, | |
| "loss": 0.2932, | |
| "mean_token_accuracy": 0.8953234739601612, | |
| "num_tokens": 53703211.0, | |
| "step": 788 | |
| }, | |
| { | |
| "epoch": 1.5945399393326594, | |
| "grad_norm": 0.16637316346168518, | |
| "learning_rate": 0.00021097171088548718, | |
| "loss": 0.2643, | |
| "mean_token_accuracy": 0.8997809141874313, | |
| "num_tokens": 53774031.0, | |
| "step": 789 | |
| }, | |
| { | |
| "epoch": 1.596562184024267, | |
| "grad_norm": 0.16356298327445984, | |
| "learning_rate": 0.0002105795278762701, | |
| "loss": 0.2812, | |
| "mean_token_accuracy": 0.9001871235668659, | |
| "num_tokens": 53842430.0, | |
| "step": 790 | |
| }, | |
| { | |
| "epoch": 1.5985844287158746, | |
| "grad_norm": 0.15379726886749268, | |
| "learning_rate": 0.00021018738970515885, | |
| "loss": 0.2753, | |
| "mean_token_accuracy": 0.9013938829302788, | |
| "num_tokens": 53918815.0, | |
| "step": 791 | |
| }, | |
| { | |
| "epoch": 1.6006066734074822, | |
| "grad_norm": 0.17770731449127197, | |
| "learning_rate": 0.000209795298238592, | |
| "loss": 0.2775, | |
| "mean_token_accuracy": 0.9020545892417431, | |
| "num_tokens": 53984486.0, | |
| "step": 792 | |
| }, | |
| { | |
| "epoch": 1.60262891809909, | |
| "grad_norm": 0.18510940670967102, | |
| "learning_rate": 0.00020940325534278596, | |
| "loss": 0.3084, | |
| "mean_token_accuracy": 0.8904885537922382, | |
| "num_tokens": 54049329.0, | |
| "step": 793 | |
| }, | |
| { | |
| "epoch": 1.6046511627906976, | |
| "grad_norm": 0.15585996210575104, | |
| "learning_rate": 0.00020901126288372574, | |
| "loss": 0.243, | |
| "mean_token_accuracy": 0.9050154872238636, | |
| "num_tokens": 54121377.0, | |
| "step": 794 | |
| }, | |
| { | |
| "epoch": 1.6066734074823055, | |
| "grad_norm": 0.2045961320400238, | |
| "learning_rate": 0.0002086193227271565, | |
| "loss": 0.3026, | |
| "mean_token_accuracy": 0.8903013169765472, | |
| "num_tokens": 54181133.0, | |
| "step": 795 | |
| }, | |
| { | |
| "epoch": 1.608695652173913, | |
| "grad_norm": 0.15141364932060242, | |
| "learning_rate": 0.00020822743673857424, | |
| "loss": 0.2622, | |
| "mean_token_accuracy": 0.90499372407794, | |
| "num_tokens": 54263363.0, | |
| "step": 796 | |
| }, | |
| { | |
| "epoch": 1.6107178968655207, | |
| "grad_norm": 0.15345874428749084, | |
| "learning_rate": 0.0002078356067832174, | |
| "loss": 0.2547, | |
| "mean_token_accuracy": 0.904791995882988, | |
| "num_tokens": 54334487.0, | |
| "step": 797 | |
| }, | |
| { | |
| "epoch": 1.6127401415571283, | |
| "grad_norm": 0.16000673174858093, | |
| "learning_rate": 0.00020744383472605745, | |
| "loss": 0.2731, | |
| "mean_token_accuracy": 0.9041004255414009, | |
| "num_tokens": 54403142.0, | |
| "step": 798 | |
| }, | |
| { | |
| "epoch": 1.6147623862487361, | |
| "grad_norm": 0.16879165172576904, | |
| "learning_rate": 0.0002070521224317905, | |
| "loss": 0.2736, | |
| "mean_token_accuracy": 0.9043679311871529, | |
| "num_tokens": 54467728.0, | |
| "step": 799 | |
| }, | |
| { | |
| "epoch": 1.6167846309403437, | |
| "grad_norm": 0.16487041115760803, | |
| "learning_rate": 0.00020666047176482816, | |
| "loss": 0.2929, | |
| "mean_token_accuracy": 0.8993552401661873, | |
| "num_tokens": 54537407.0, | |
| "step": 800 | |
| }, | |
| { | |
| "epoch": 1.6188068756319516, | |
| "grad_norm": 0.17032210528850555, | |
| "learning_rate": 0.00020626888458928858, | |
| "loss": 0.2799, | |
| "mean_token_accuracy": 0.8998575955629349, | |
| "num_tokens": 54599784.0, | |
| "step": 801 | |
| }, | |
| { | |
| "epoch": 1.6208291203235592, | |
| "grad_norm": 0.17334811389446259, | |
| "learning_rate": 0.00020587736276898798, | |
| "loss": 0.2606, | |
| "mean_token_accuracy": 0.901070773601532, | |
| "num_tokens": 54663420.0, | |
| "step": 802 | |
| }, | |
| { | |
| "epoch": 1.6228513650151668, | |
| "grad_norm": 0.1741548776626587, | |
| "learning_rate": 0.00020548590816743108, | |
| "loss": 0.2823, | |
| "mean_token_accuracy": 0.8988193459808826, | |
| "num_tokens": 54727696.0, | |
| "step": 803 | |
| }, | |
| { | |
| "epoch": 1.6248736097067744, | |
| "grad_norm": 0.1664174348115921, | |
| "learning_rate": 0.00020509452264780325, | |
| "loss": 0.2643, | |
| "mean_token_accuracy": 0.9047059267759323, | |
| "num_tokens": 54791606.0, | |
| "step": 804 | |
| }, | |
| { | |
| "epoch": 1.6268958543983822, | |
| "grad_norm": 0.14816100895404816, | |
| "learning_rate": 0.0002047032080729605, | |
| "loss": 0.2449, | |
| "mean_token_accuracy": 0.903932623565197, | |
| "num_tokens": 54865039.0, | |
| "step": 805 | |
| }, | |
| { | |
| "epoch": 1.6289180990899899, | |
| "grad_norm": 0.12588512897491455, | |
| "learning_rate": 0.00020431196630542152, | |
| "loss": 0.227, | |
| "mean_token_accuracy": 0.9160388633608818, | |
| "num_tokens": 54958620.0, | |
| "step": 806 | |
| }, | |
| { | |
| "epoch": 1.6309403437815977, | |
| "grad_norm": 0.17510341107845306, | |
| "learning_rate": 0.00020392079920735835, | |
| "loss": 0.2963, | |
| "mean_token_accuracy": 0.8962272480130196, | |
| "num_tokens": 55024008.0, | |
| "step": 807 | |
| }, | |
| { | |
| "epoch": 1.6329625884732053, | |
| "grad_norm": 0.1542372852563858, | |
| "learning_rate": 0.00020352970864058757, | |
| "loss": 0.2614, | |
| "mean_token_accuracy": 0.9044002443552017, | |
| "num_tokens": 55087163.0, | |
| "step": 808 | |
| }, | |
| { | |
| "epoch": 1.634984833164813, | |
| "grad_norm": 0.16116388142108917, | |
| "learning_rate": 0.00020313869646656162, | |
| "loss": 0.2721, | |
| "mean_token_accuracy": 0.9043215177953243, | |
| "num_tokens": 55154699.0, | |
| "step": 809 | |
| }, | |
| { | |
| "epoch": 1.6370070778564205, | |
| "grad_norm": 0.1448214203119278, | |
| "learning_rate": 0.0002027477645463597, | |
| "loss": 0.2598, | |
| "mean_token_accuracy": 0.9074460677802563, | |
| "num_tokens": 55224995.0, | |
| "step": 810 | |
| }, | |
| { | |
| "epoch": 1.6390293225480284, | |
| "grad_norm": 0.17575567960739136, | |
| "learning_rate": 0.00020235691474067912, | |
| "loss": 0.2647, | |
| "mean_token_accuracy": 0.9028755128383636, | |
| "num_tokens": 55291465.0, | |
| "step": 811 | |
| }, | |
| { | |
| "epoch": 1.641051567239636, | |
| "grad_norm": 0.1718558669090271, | |
| "learning_rate": 0.0002019661489098263, | |
| "loss": 0.2658, | |
| "mean_token_accuracy": 0.9058180525898933, | |
| "num_tokens": 55356793.0, | |
| "step": 812 | |
| }, | |
| { | |
| "epoch": 1.6430738119312438, | |
| "grad_norm": 0.16898474097251892, | |
| "learning_rate": 0.00020157546891370797, | |
| "loss": 0.2868, | |
| "mean_token_accuracy": 0.9008054211735725, | |
| "num_tokens": 55428748.0, | |
| "step": 813 | |
| }, | |
| { | |
| "epoch": 1.6450960566228514, | |
| "grad_norm": 0.1628302037715912, | |
| "learning_rate": 0.00020118487661182241, | |
| "loss": 0.2667, | |
| "mean_token_accuracy": 0.9067884795367718, | |
| "num_tokens": 55499100.0, | |
| "step": 814 | |
| }, | |
| { | |
| "epoch": 1.647118301314459, | |
| "grad_norm": 0.18391703069210052, | |
| "learning_rate": 0.00020079437386325032, | |
| "loss": 0.3138, | |
| "mean_token_accuracy": 0.8893741592764854, | |
| "num_tokens": 55570834.0, | |
| "step": 815 | |
| }, | |
| { | |
| "epoch": 1.6491405460060666, | |
| "grad_norm": 0.17336952686309814, | |
| "learning_rate": 0.00020040396252664642, | |
| "loss": 0.2778, | |
| "mean_token_accuracy": 0.9026199728250504, | |
| "num_tokens": 55640251.0, | |
| "step": 816 | |
| }, | |
| { | |
| "epoch": 1.6511627906976745, | |
| "grad_norm": 0.14611810445785522, | |
| "learning_rate": 0.00020001364446023002, | |
| "loss": 0.2433, | |
| "mean_token_accuracy": 0.9094121158123016, | |
| "num_tokens": 55722284.0, | |
| "step": 817 | |
| }, | |
| { | |
| "epoch": 1.653185035389282, | |
| "grad_norm": 0.15244677662849426, | |
| "learning_rate": 0.0001996234215217768, | |
| "loss": 0.2825, | |
| "mean_token_accuracy": 0.8958746008574963, | |
| "num_tokens": 55792901.0, | |
| "step": 818 | |
| }, | |
| { | |
| "epoch": 1.65520728008089, | |
| "grad_norm": 0.17220915853977203, | |
| "learning_rate": 0.00019923329556860954, | |
| "loss": 0.2887, | |
| "mean_token_accuracy": 0.8924598507583141, | |
| "num_tokens": 55854966.0, | |
| "step": 819 | |
| }, | |
| { | |
| "epoch": 1.6572295247724975, | |
| "grad_norm": 0.1598389595746994, | |
| "learning_rate": 0.00019884326845758925, | |
| "loss": 0.2718, | |
| "mean_token_accuracy": 0.9044957980513573, | |
| "num_tokens": 55929575.0, | |
| "step": 820 | |
| }, | |
| { | |
| "epoch": 1.6592517694641051, | |
| "grad_norm": 0.1721997857093811, | |
| "learning_rate": 0.0001984533420451068, | |
| "loss": 0.2944, | |
| "mean_token_accuracy": 0.8970884680747986, | |
| "num_tokens": 55997255.0, | |
| "step": 821 | |
| }, | |
| { | |
| "epoch": 1.6612740141557127, | |
| "grad_norm": 0.184437558054924, | |
| "learning_rate": 0.0001980635181870735, | |
| "loss": 0.2996, | |
| "mean_token_accuracy": 0.8938624709844589, | |
| "num_tokens": 56059407.0, | |
| "step": 822 | |
| }, | |
| { | |
| "epoch": 1.6632962588473206, | |
| "grad_norm": 0.17128629982471466, | |
| "learning_rate": 0.0001976737987389127, | |
| "loss": 0.2999, | |
| "mean_token_accuracy": 0.892108865082264, | |
| "num_tokens": 56124709.0, | |
| "step": 823 | |
| }, | |
| { | |
| "epoch": 1.6653185035389282, | |
| "grad_norm": 0.18039193749427795, | |
| "learning_rate": 0.00019728418555555068, | |
| "loss": 0.248, | |
| "mean_token_accuracy": 0.8995288237929344, | |
| "num_tokens": 56191016.0, | |
| "step": 824 | |
| }, | |
| { | |
| "epoch": 1.667340748230536, | |
| "grad_norm": 0.16719485819339752, | |
| "learning_rate": 0.00019689468049140802, | |
| "loss": 0.2673, | |
| "mean_token_accuracy": 0.9037236869335175, | |
| "num_tokens": 56259334.0, | |
| "step": 825 | |
| }, | |
| { | |
| "epoch": 1.6693629929221436, | |
| "grad_norm": 0.16847628355026245, | |
| "learning_rate": 0.00019650528540039077, | |
| "loss": 0.266, | |
| "mean_token_accuracy": 0.9061383940279484, | |
| "num_tokens": 56326477.0, | |
| "step": 826 | |
| }, | |
| { | |
| "epoch": 1.6713852376137512, | |
| "grad_norm": 0.17857936024665833, | |
| "learning_rate": 0.00019611600213588127, | |
| "loss": 0.3023, | |
| "mean_token_accuracy": 0.8900899365544319, | |
| "num_tokens": 56386327.0, | |
| "step": 827 | |
| }, | |
| { | |
| "epoch": 1.6734074823053589, | |
| "grad_norm": 0.18187786638736725, | |
| "learning_rate": 0.0001957268325507299, | |
| "loss": 0.3001, | |
| "mean_token_accuracy": 0.8927515000104904, | |
| "num_tokens": 56446400.0, | |
| "step": 828 | |
| }, | |
| { | |
| "epoch": 1.6754297269969667, | |
| "grad_norm": 0.15920601785182953, | |
| "learning_rate": 0.0001953377784972458, | |
| "loss": 0.2834, | |
| "mean_token_accuracy": 0.8984440118074417, | |
| "num_tokens": 56516627.0, | |
| "step": 829 | |
| }, | |
| { | |
| "epoch": 1.6774519716885743, | |
| "grad_norm": 0.16971920430660248, | |
| "learning_rate": 0.00019494884182718827, | |
| "loss": 0.2845, | |
| "mean_token_accuracy": 0.8991547487676144, | |
| "num_tokens": 56586404.0, | |
| "step": 830 | |
| }, | |
| { | |
| "epoch": 1.6794742163801821, | |
| "grad_norm": 0.16059236228466034, | |
| "learning_rate": 0.00019456002439175794, | |
| "loss": 0.2658, | |
| "mean_token_accuracy": 0.9038873426616192, | |
| "num_tokens": 56657253.0, | |
| "step": 831 | |
| }, | |
| { | |
| "epoch": 1.6814964610717897, | |
| "grad_norm": 0.16817672550678253, | |
| "learning_rate": 0.00019417132804158777, | |
| "loss": 0.2825, | |
| "mean_token_accuracy": 0.8981058970093727, | |
| "num_tokens": 56725926.0, | |
| "step": 832 | |
| }, | |
| { | |
| "epoch": 1.6835187057633973, | |
| "grad_norm": 0.15651072561740875, | |
| "learning_rate": 0.00019378275462673464, | |
| "loss": 0.2683, | |
| "mean_token_accuracy": 0.9055442661046982, | |
| "num_tokens": 56794928.0, | |
| "step": 833 | |
| }, | |
| { | |
| "epoch": 1.685540950455005, | |
| "grad_norm": 0.16662436723709106, | |
| "learning_rate": 0.00019339430599667009, | |
| "loss": 0.2795, | |
| "mean_token_accuracy": 0.9005163908004761, | |
| "num_tokens": 56861202.0, | |
| "step": 834 | |
| }, | |
| { | |
| "epoch": 1.6875631951466128, | |
| "grad_norm": 0.15520507097244263, | |
| "learning_rate": 0.0001930059840002719, | |
| "loss": 0.2789, | |
| "mean_token_accuracy": 0.9018525704741478, | |
| "num_tokens": 56940546.0, | |
| "step": 835 | |
| }, | |
| { | |
| "epoch": 1.6895854398382204, | |
| "grad_norm": 0.16705678403377533, | |
| "learning_rate": 0.00019261779048581498, | |
| "loss": 0.2817, | |
| "mean_token_accuracy": 0.9004562273621559, | |
| "num_tokens": 57010510.0, | |
| "step": 836 | |
| }, | |
| { | |
| "epoch": 1.6916076845298282, | |
| "grad_norm": 0.17928999662399292, | |
| "learning_rate": 0.00019222972730096281, | |
| "loss": 0.2898, | |
| "mean_token_accuracy": 0.8954050242900848, | |
| "num_tokens": 57076063.0, | |
| "step": 837 | |
| }, | |
| { | |
| "epoch": 1.6936299292214358, | |
| "grad_norm": 0.17176282405853271, | |
| "learning_rate": 0.00019184179629275842, | |
| "loss": 0.2784, | |
| "mean_token_accuracy": 0.9002024792134762, | |
| "num_tokens": 57139142.0, | |
| "step": 838 | |
| }, | |
| { | |
| "epoch": 1.6956521739130435, | |
| "grad_norm": 0.2011646181344986, | |
| "learning_rate": 0.00019145399930761592, | |
| "loss": 0.33, | |
| "mean_token_accuracy": 0.8854256272315979, | |
| "num_tokens": 57196478.0, | |
| "step": 839 | |
| }, | |
| { | |
| "epoch": 1.697674418604651, | |
| "grad_norm": 0.1873674988746643, | |
| "learning_rate": 0.00019106633819131132, | |
| "loss": 0.2956, | |
| "mean_token_accuracy": 0.8927418142557144, | |
| "num_tokens": 57257834.0, | |
| "step": 840 | |
| }, | |
| { | |
| "epoch": 1.699696663296259, | |
| "grad_norm": 0.15767039358615875, | |
| "learning_rate": 0.00019067881478897406, | |
| "loss": 0.2603, | |
| "mean_token_accuracy": 0.8997323326766491, | |
| "num_tokens": 57332979.0, | |
| "step": 841 | |
| }, | |
| { | |
| "epoch": 1.7017189079878665, | |
| "grad_norm": 0.1793752908706665, | |
| "learning_rate": 0.00019029143094507803, | |
| "loss": 0.2914, | |
| "mean_token_accuracy": 0.8960652127861977, | |
| "num_tokens": 57393796.0, | |
| "step": 842 | |
| }, | |
| { | |
| "epoch": 1.7037411526794743, | |
| "grad_norm": 0.17136353254318237, | |
| "learning_rate": 0.00018990418850343299, | |
| "loss": 0.2831, | |
| "mean_token_accuracy": 0.8985873088240623, | |
| "num_tokens": 57461020.0, | |
| "step": 843 | |
| }, | |
| { | |
| "epoch": 1.705763397371082, | |
| "grad_norm": 0.15511257946491241, | |
| "learning_rate": 0.0001895170893071756, | |
| "loss": 0.2648, | |
| "mean_token_accuracy": 0.9044994860887527, | |
| "num_tokens": 57534185.0, | |
| "step": 844 | |
| }, | |
| { | |
| "epoch": 1.7077856420626896, | |
| "grad_norm": 0.16191929578781128, | |
| "learning_rate": 0.00018913013519876066, | |
| "loss": 0.2689, | |
| "mean_token_accuracy": 0.9016621857881546, | |
| "num_tokens": 57602240.0, | |
| "step": 845 | |
| }, | |
| { | |
| "epoch": 1.7098078867542972, | |
| "grad_norm": 0.17678587138652802, | |
| "learning_rate": 0.00018874332801995257, | |
| "loss": 0.2791, | |
| "mean_token_accuracy": 0.9009885340929031, | |
| "num_tokens": 57665999.0, | |
| "step": 846 | |
| }, | |
| { | |
| "epoch": 1.7118301314459048, | |
| "grad_norm": 0.14890553057193756, | |
| "learning_rate": 0.0001883566696118162, | |
| "loss": 0.2469, | |
| "mean_token_accuracy": 0.9083396308124065, | |
| "num_tokens": 57745250.0, | |
| "step": 847 | |
| }, | |
| { | |
| "epoch": 1.7138523761375126, | |
| "grad_norm": 0.16399073600769043, | |
| "learning_rate": 0.00018797016181470856, | |
| "loss": 0.2699, | |
| "mean_token_accuracy": 0.9010614044964314, | |
| "num_tokens": 57820665.0, | |
| "step": 848 | |
| }, | |
| { | |
| "epoch": 1.7158746208291205, | |
| "grad_norm": 0.17773596942424774, | |
| "learning_rate": 0.00018758380646826943, | |
| "loss": 0.2801, | |
| "mean_token_accuracy": 0.9002369157969952, | |
| "num_tokens": 57882848.0, | |
| "step": 849 | |
| }, | |
| { | |
| "epoch": 1.717896865520728, | |
| "grad_norm": 0.17527812719345093, | |
| "learning_rate": 0.00018719760541141347, | |
| "loss": 0.285, | |
| "mean_token_accuracy": 0.8988419659435749, | |
| "num_tokens": 57956449.0, | |
| "step": 850 | |
| }, | |
| { | |
| "epoch": 1.7199191102123357, | |
| "grad_norm": 0.17075812816619873, | |
| "learning_rate": 0.00018681156048232063, | |
| "loss": 0.2797, | |
| "mean_token_accuracy": 0.8975733481347561, | |
| "num_tokens": 58019620.0, | |
| "step": 851 | |
| }, | |
| { | |
| "epoch": 1.7219413549039433, | |
| "grad_norm": 0.162892147898674, | |
| "learning_rate": 0.00018642567351842776, | |
| "loss": 0.3048, | |
| "mean_token_accuracy": 0.8936393298208714, | |
| "num_tokens": 58084770.0, | |
| "step": 852 | |
| }, | |
| { | |
| "epoch": 1.723963599595551, | |
| "grad_norm": 0.1569058746099472, | |
| "learning_rate": 0.0001860399463564201, | |
| "loss": 0.2779, | |
| "mean_token_accuracy": 0.90330421179533, | |
| "num_tokens": 58149930.0, | |
| "step": 853 | |
| }, | |
| { | |
| "epoch": 1.7259858442871587, | |
| "grad_norm": 0.15333376824855804, | |
| "learning_rate": 0.00018565438083222193, | |
| "loss": 0.2431, | |
| "mean_token_accuracy": 0.9056979790329933, | |
| "num_tokens": 58218764.0, | |
| "step": 854 | |
| }, | |
| { | |
| "epoch": 1.7280080889787666, | |
| "grad_norm": 0.18095627427101135, | |
| "learning_rate": 0.00018526897878098857, | |
| "loss": 0.2914, | |
| "mean_token_accuracy": 0.8964138776063919, | |
| "num_tokens": 58280108.0, | |
| "step": 855 | |
| }, | |
| { | |
| "epoch": 1.7300303336703742, | |
| "grad_norm": 0.17549115419387817, | |
| "learning_rate": 0.00018488374203709694, | |
| "loss": 0.2715, | |
| "mean_token_accuracy": 0.9019583091139793, | |
| "num_tokens": 58349603.0, | |
| "step": 856 | |
| }, | |
| { | |
| "epoch": 1.7320525783619818, | |
| "grad_norm": 0.14637798070907593, | |
| "learning_rate": 0.00018449867243413732, | |
| "loss": 0.2479, | |
| "mean_token_accuracy": 0.9110586978495121, | |
| "num_tokens": 58423158.0, | |
| "step": 857 | |
| }, | |
| { | |
| "epoch": 1.7340748230535894, | |
| "grad_norm": 0.18153415620326996, | |
| "learning_rate": 0.00018411377180490454, | |
| "loss": 0.2838, | |
| "mean_token_accuracy": 0.8981715328991413, | |
| "num_tokens": 58489878.0, | |
| "step": 858 | |
| }, | |
| { | |
| "epoch": 1.736097067745197, | |
| "grad_norm": 0.14081305265426636, | |
| "learning_rate": 0.00018372904198138895, | |
| "loss": 0.2421, | |
| "mean_token_accuracy": 0.9120564199984074, | |
| "num_tokens": 58567119.0, | |
| "step": 859 | |
| }, | |
| { | |
| "epoch": 1.7381193124368048, | |
| "grad_norm": 0.19423925876617432, | |
| "learning_rate": 0.0001833444847947681, | |
| "loss": 0.2827, | |
| "mean_token_accuracy": 0.8960412628948689, | |
| "num_tokens": 58629512.0, | |
| "step": 860 | |
| }, | |
| { | |
| "epoch": 1.7401415571284127, | |
| "grad_norm": 0.1835591346025467, | |
| "learning_rate": 0.00018296010207539775, | |
| "loss": 0.3066, | |
| "mean_token_accuracy": 0.8935861364006996, | |
| "num_tokens": 58692056.0, | |
| "step": 861 | |
| }, | |
| { | |
| "epoch": 1.7421638018200203, | |
| "grad_norm": 0.17017914354801178, | |
| "learning_rate": 0.00018257589565280337, | |
| "loss": 0.2839, | |
| "mean_token_accuracy": 0.8955631256103516, | |
| "num_tokens": 58757823.0, | |
| "step": 862 | |
| }, | |
| { | |
| "epoch": 1.744186046511628, | |
| "grad_norm": 0.17654229700565338, | |
| "learning_rate": 0.0001821918673556712, | |
| "loss": 0.2856, | |
| "mean_token_accuracy": 0.9003425352275372, | |
| "num_tokens": 58820185.0, | |
| "step": 863 | |
| }, | |
| { | |
| "epoch": 1.7462082912032355, | |
| "grad_norm": 0.18433596193790436, | |
| "learning_rate": 0.00018180801901183967, | |
| "loss": 0.276, | |
| "mean_token_accuracy": 0.9058196842670441, | |
| "num_tokens": 58888573.0, | |
| "step": 864 | |
| }, | |
| { | |
| "epoch": 1.7482305358948431, | |
| "grad_norm": 0.16942624747753143, | |
| "learning_rate": 0.0001814243524482909, | |
| "loss": 0.2676, | |
| "mean_token_accuracy": 0.9044988267123699, | |
| "num_tokens": 58953010.0, | |
| "step": 865 | |
| }, | |
| { | |
| "epoch": 1.750252780586451, | |
| "grad_norm": 0.1317698061466217, | |
| "learning_rate": 0.0001810408694911415, | |
| "loss": 0.2423, | |
| "mean_token_accuracy": 0.9077907241880894, | |
| "num_tokens": 59032037.0, | |
| "step": 866 | |
| }, | |
| { | |
| "epoch": 1.7522750252780588, | |
| "grad_norm": 0.17676536738872528, | |
| "learning_rate": 0.00018065757196563444, | |
| "loss": 0.2834, | |
| "mean_token_accuracy": 0.90084283426404, | |
| "num_tokens": 59102101.0, | |
| "step": 867 | |
| }, | |
| { | |
| "epoch": 1.7542972699696664, | |
| "grad_norm": 0.16460995376110077, | |
| "learning_rate": 0.00018027446169612983, | |
| "loss": 0.266, | |
| "mean_token_accuracy": 0.8994225487112999, | |
| "num_tokens": 59175507.0, | |
| "step": 868 | |
| }, | |
| { | |
| "epoch": 1.756319514661274, | |
| "grad_norm": 0.15954379737377167, | |
| "learning_rate": 0.0001798915405060968, | |
| "loss": 0.2613, | |
| "mean_token_accuracy": 0.9075300879776478, | |
| "num_tokens": 59241915.0, | |
| "step": 869 | |
| }, | |
| { | |
| "epoch": 1.7583417593528816, | |
| "grad_norm": 0.17243851721286774, | |
| "learning_rate": 0.00017950881021810435, | |
| "loss": 0.2653, | |
| "mean_token_accuracy": 0.9034992009401321, | |
| "num_tokens": 59305436.0, | |
| "step": 870 | |
| }, | |
| { | |
| "epoch": 1.7603640040444892, | |
| "grad_norm": 0.17760290205478668, | |
| "learning_rate": 0.00017912627265381285, | |
| "loss": 0.2885, | |
| "mean_token_accuracy": 0.8978960253298283, | |
| "num_tokens": 59370395.0, | |
| "step": 871 | |
| }, | |
| { | |
| "epoch": 1.762386248736097, | |
| "grad_norm": 0.17663156986236572, | |
| "learning_rate": 0.00017874392963396552, | |
| "loss": 0.2931, | |
| "mean_token_accuracy": 0.8978605940937996, | |
| "num_tokens": 59435634.0, | |
| "step": 872 | |
| }, | |
| { | |
| "epoch": 1.764408493427705, | |
| "grad_norm": 0.17674268782138824, | |
| "learning_rate": 0.00017836178297837938, | |
| "loss": 0.2717, | |
| "mean_token_accuracy": 0.9010186977684498, | |
| "num_tokens": 59500074.0, | |
| "step": 873 | |
| }, | |
| { | |
| "epoch": 1.7664307381193125, | |
| "grad_norm": 0.16433486342430115, | |
| "learning_rate": 0.0001779798345059371, | |
| "loss": 0.2598, | |
| "mean_token_accuracy": 0.9028237722814083, | |
| "num_tokens": 59574564.0, | |
| "step": 874 | |
| }, | |
| { | |
| "epoch": 1.7684529828109201, | |
| "grad_norm": 0.15955936908721924, | |
| "learning_rate": 0.0001775980860345778, | |
| "loss": 0.2677, | |
| "mean_token_accuracy": 0.9025723747909069, | |
| "num_tokens": 59650978.0, | |
| "step": 875 | |
| }, | |
| { | |
| "epoch": 1.7704752275025277, | |
| "grad_norm": 0.1750318855047226, | |
| "learning_rate": 0.00017721653938128888, | |
| "loss": 0.2866, | |
| "mean_token_accuracy": 0.8999117016792297, | |
| "num_tokens": 59714437.0, | |
| "step": 876 | |
| }, | |
| { | |
| "epoch": 1.7724974721941353, | |
| "grad_norm": 0.15407449007034302, | |
| "learning_rate": 0.00017683519636209707, | |
| "loss": 0.2586, | |
| "mean_token_accuracy": 0.9031764194369316, | |
| "num_tokens": 59795096.0, | |
| "step": 877 | |
| }, | |
| { | |
| "epoch": 1.7745197168857432, | |
| "grad_norm": 0.16260726749897003, | |
| "learning_rate": 0.00017645405879205983, | |
| "loss": 0.275, | |
| "mean_token_accuracy": 0.9040297567844391, | |
| "num_tokens": 59862394.0, | |
| "step": 878 | |
| }, | |
| { | |
| "epoch": 1.776541961577351, | |
| "grad_norm": 0.16649970412254333, | |
| "learning_rate": 0.0001760731284852568, | |
| "loss": 0.278, | |
| "mean_token_accuracy": 0.8974411375820637, | |
| "num_tokens": 59932031.0, | |
| "step": 879 | |
| }, | |
| { | |
| "epoch": 1.7785642062689586, | |
| "grad_norm": 0.1494332104921341, | |
| "learning_rate": 0.0001756924072547813, | |
| "loss": 0.2579, | |
| "mean_token_accuracy": 0.905670553445816, | |
| "num_tokens": 60011025.0, | |
| "step": 880 | |
| }, | |
| { | |
| "epoch": 1.7805864509605662, | |
| "grad_norm": 0.18167705833911896, | |
| "learning_rate": 0.00017531189691273106, | |
| "loss": 0.2776, | |
| "mean_token_accuracy": 0.8976808004081249, | |
| "num_tokens": 60068820.0, | |
| "step": 881 | |
| }, | |
| { | |
| "epoch": 1.7826086956521738, | |
| "grad_norm": 0.16186164319515228, | |
| "learning_rate": 0.00017493159927020054, | |
| "loss": 0.2811, | |
| "mean_token_accuracy": 0.9016175977885723, | |
| "num_tokens": 60130140.0, | |
| "step": 882 | |
| }, | |
| { | |
| "epoch": 1.7846309403437814, | |
| "grad_norm": 0.17380307614803314, | |
| "learning_rate": 0.0001745515161372716, | |
| "loss": 0.2945, | |
| "mean_token_accuracy": 0.8955324217677116, | |
| "num_tokens": 60193073.0, | |
| "step": 883 | |
| }, | |
| { | |
| "epoch": 1.7866531850353893, | |
| "grad_norm": 0.17945754528045654, | |
| "learning_rate": 0.00017417164932300502, | |
| "loss": 0.2722, | |
| "mean_token_accuracy": 0.8948768936097622, | |
| "num_tokens": 60255959.0, | |
| "step": 884 | |
| }, | |
| { | |
| "epoch": 1.7886754297269971, | |
| "grad_norm": 0.16201643645763397, | |
| "learning_rate": 0.00017379200063543225, | |
| "loss": 0.2761, | |
| "mean_token_accuracy": 0.8984379507601261, | |
| "num_tokens": 60331653.0, | |
| "step": 885 | |
| }, | |
| { | |
| "epoch": 1.7906976744186047, | |
| "grad_norm": 0.17004264891147614, | |
| "learning_rate": 0.00017341257188154625, | |
| "loss": 0.2785, | |
| "mean_token_accuracy": 0.902726124972105, | |
| "num_tokens": 60397891.0, | |
| "step": 886 | |
| }, | |
| { | |
| "epoch": 1.7927199191102123, | |
| "grad_norm": 0.17423401772975922, | |
| "learning_rate": 0.0001730333648672934, | |
| "loss": 0.2663, | |
| "mean_token_accuracy": 0.9040607661008835, | |
| "num_tokens": 60463271.0, | |
| "step": 887 | |
| }, | |
| { | |
| "epoch": 1.79474216380182, | |
| "grad_norm": 0.17113754153251648, | |
| "learning_rate": 0.00017265438139756455, | |
| "loss": 0.2754, | |
| "mean_token_accuracy": 0.901301734149456, | |
| "num_tokens": 60527757.0, | |
| "step": 888 | |
| }, | |
| { | |
| "epoch": 1.7967644084934276, | |
| "grad_norm": 0.1624325066804886, | |
| "learning_rate": 0.00017227562327618655, | |
| "loss": 0.264, | |
| "mean_token_accuracy": 0.8982259891927242, | |
| "num_tokens": 60590938.0, | |
| "step": 889 | |
| }, | |
| { | |
| "epoch": 1.7987866531850354, | |
| "grad_norm": 0.17364652454853058, | |
| "learning_rate": 0.00017189709230591376, | |
| "loss": 0.2768, | |
| "mean_token_accuracy": 0.8977219946682453, | |
| "num_tokens": 60666355.0, | |
| "step": 890 | |
| }, | |
| { | |
| "epoch": 1.8008088978766432, | |
| "grad_norm": 0.15866470336914062, | |
| "learning_rate": 0.00017151879028841935, | |
| "loss": 0.2556, | |
| "mean_token_accuracy": 0.9094675220549107, | |
| "num_tokens": 60734511.0, | |
| "step": 891 | |
| }, | |
| { | |
| "epoch": 1.8028311425682508, | |
| "grad_norm": 0.16959354281425476, | |
| "learning_rate": 0.0001711407190242867, | |
| "loss": 0.2827, | |
| "mean_token_accuracy": 0.9035419821739197, | |
| "num_tokens": 60800072.0, | |
| "step": 892 | |
| }, | |
| { | |
| "epoch": 1.8048533872598584, | |
| "grad_norm": 0.13593734800815582, | |
| "learning_rate": 0.00017076288031300086, | |
| "loss": 0.2233, | |
| "mean_token_accuracy": 0.9116230644285679, | |
| "num_tokens": 60877569.0, | |
| "step": 893 | |
| }, | |
| { | |
| "epoch": 1.806875631951466, | |
| "grad_norm": 0.159558966755867, | |
| "learning_rate": 0.00017038527595294016, | |
| "loss": 0.2713, | |
| "mean_token_accuracy": 0.9025290682911873, | |
| "num_tokens": 60946273.0, | |
| "step": 894 | |
| }, | |
| { | |
| "epoch": 1.8088978766430737, | |
| "grad_norm": 0.14993025362491608, | |
| "learning_rate": 0.00017000790774136744, | |
| "loss": 0.2563, | |
| "mean_token_accuracy": 0.906671367585659, | |
| "num_tokens": 61021490.0, | |
| "step": 895 | |
| }, | |
| { | |
| "epoch": 1.8109201213346815, | |
| "grad_norm": 0.16624176502227783, | |
| "learning_rate": 0.00016963077747442147, | |
| "loss": 0.285, | |
| "mean_token_accuracy": 0.9001613892614841, | |
| "num_tokens": 61087077.0, | |
| "step": 896 | |
| }, | |
| { | |
| "epoch": 1.8129423660262893, | |
| "grad_norm": 0.18598856031894684, | |
| "learning_rate": 0.00016925388694710857, | |
| "loss": 0.2816, | |
| "mean_token_accuracy": 0.8991341292858124, | |
| "num_tokens": 61155366.0, | |
| "step": 897 | |
| }, | |
| { | |
| "epoch": 1.814964610717897, | |
| "grad_norm": 0.15858127176761627, | |
| "learning_rate": 0.00016887723795329395, | |
| "loss": 0.259, | |
| "mean_token_accuracy": 0.9013683348894119, | |
| "num_tokens": 61227279.0, | |
| "step": 898 | |
| }, | |
| { | |
| "epoch": 1.8169868554095046, | |
| "grad_norm": 0.17217408120632172, | |
| "learning_rate": 0.00016850083228569327, | |
| "loss": 0.3001, | |
| "mean_token_accuracy": 0.8970577903091908, | |
| "num_tokens": 61294506.0, | |
| "step": 899 | |
| }, | |
| { | |
| "epoch": 1.8190091001011122, | |
| "grad_norm": 0.15035738050937653, | |
| "learning_rate": 0.00016812467173586395, | |
| "loss": 0.2645, | |
| "mean_token_accuracy": 0.9000033251941204, | |
| "num_tokens": 61365391.0, | |
| "step": 900 | |
| }, | |
| { | |
| "epoch": 1.8210313447927198, | |
| "grad_norm": 0.17095452547073364, | |
| "learning_rate": 0.0001677487580941968, | |
| "loss": 0.2723, | |
| "mean_token_accuracy": 0.9036833345890045, | |
| "num_tokens": 61430318.0, | |
| "step": 901 | |
| }, | |
| { | |
| "epoch": 1.8230535894843276, | |
| "grad_norm": 0.18995128571987152, | |
| "learning_rate": 0.00016737309314990742, | |
| "loss": 0.2963, | |
| "mean_token_accuracy": 0.897097785025835, | |
| "num_tokens": 61490667.0, | |
| "step": 902 | |
| }, | |
| { | |
| "epoch": 1.8250758341759354, | |
| "grad_norm": 0.15838812291622162, | |
| "learning_rate": 0.00016699767869102767, | |
| "loss": 0.2597, | |
| "mean_token_accuracy": 0.9020838551223278, | |
| "num_tokens": 61566103.0, | |
| "step": 903 | |
| }, | |
| { | |
| "epoch": 1.827098078867543, | |
| "grad_norm": 0.17972201108932495, | |
| "learning_rate": 0.00016662251650439725, | |
| "loss": 0.2853, | |
| "mean_token_accuracy": 0.899272233247757, | |
| "num_tokens": 61628595.0, | |
| "step": 904 | |
| }, | |
| { | |
| "epoch": 1.8291203235591507, | |
| "grad_norm": 0.1463383138179779, | |
| "learning_rate": 0.0001662476083756551, | |
| "loss": 0.26, | |
| "mean_token_accuracy": 0.9080706797540188, | |
| "num_tokens": 61703786.0, | |
| "step": 905 | |
| }, | |
| { | |
| "epoch": 1.8311425682507583, | |
| "grad_norm": 0.16255010664463043, | |
| "learning_rate": 0.00016587295608923088, | |
| "loss": 0.2805, | |
| "mean_token_accuracy": 0.9013442508876324, | |
| "num_tokens": 61776819.0, | |
| "step": 906 | |
| }, | |
| { | |
| "epoch": 1.8331648129423659, | |
| "grad_norm": 0.17431674897670746, | |
| "learning_rate": 0.0001654985614283366, | |
| "loss": 0.2877, | |
| "mean_token_accuracy": 0.9047906063497066, | |
| "num_tokens": 61846922.0, | |
| "step": 907 | |
| }, | |
| { | |
| "epoch": 1.8351870576339737, | |
| "grad_norm": 0.1731417030096054, | |
| "learning_rate": 0.00016512442617495804, | |
| "loss": 0.2809, | |
| "mean_token_accuracy": 0.8943095356225967, | |
| "num_tokens": 61913305.0, | |
| "step": 908 | |
| }, | |
| { | |
| "epoch": 1.8372093023255816, | |
| "grad_norm": 0.17473085224628448, | |
| "learning_rate": 0.00016475055210984641, | |
| "loss": 0.2765, | |
| "mean_token_accuracy": 0.9039146527647972, | |
| "num_tokens": 61974613.0, | |
| "step": 909 | |
| }, | |
| { | |
| "epoch": 1.8392315470171892, | |
| "grad_norm": 0.1697629690170288, | |
| "learning_rate": 0.00016437694101250952, | |
| "loss": 0.2672, | |
| "mean_token_accuracy": 0.9050569906830788, | |
| "num_tokens": 62042608.0, | |
| "step": 910 | |
| }, | |
| { | |
| "epoch": 1.8412537917087968, | |
| "grad_norm": 0.1614944189786911, | |
| "learning_rate": 0.00016400359466120366, | |
| "loss": 0.2737, | |
| "mean_token_accuracy": 0.9029634855687618, | |
| "num_tokens": 62112444.0, | |
| "step": 911 | |
| }, | |
| { | |
| "epoch": 1.8432760364004044, | |
| "grad_norm": 0.17687106132507324, | |
| "learning_rate": 0.00016363051483292513, | |
| "loss": 0.2648, | |
| "mean_token_accuracy": 0.9044081643223763, | |
| "num_tokens": 62181562.0, | |
| "step": 912 | |
| }, | |
| { | |
| "epoch": 1.845298281092012, | |
| "grad_norm": 0.1807907372713089, | |
| "learning_rate": 0.0001632577033034015, | |
| "loss": 0.274, | |
| "mean_token_accuracy": 0.9006009586155415, | |
| "num_tokens": 62245198.0, | |
| "step": 913 | |
| }, | |
| { | |
| "epoch": 1.8473205257836198, | |
| "grad_norm": 0.19810381531715393, | |
| "learning_rate": 0.00016288516184708346, | |
| "loss": 0.2893, | |
| "mean_token_accuracy": 0.8972717076539993, | |
| "num_tokens": 62308012.0, | |
| "step": 914 | |
| }, | |
| { | |
| "epoch": 1.8493427704752277, | |
| "grad_norm": 0.15699312090873718, | |
| "learning_rate": 0.00016251289223713616, | |
| "loss": 0.2744, | |
| "mean_token_accuracy": 0.9030490145087242, | |
| "num_tokens": 62379728.0, | |
| "step": 915 | |
| }, | |
| { | |
| "epoch": 1.8513650151668353, | |
| "grad_norm": 0.1709468960762024, | |
| "learning_rate": 0.000162140896245431, | |
| "loss": 0.2471, | |
| "mean_token_accuracy": 0.9071713648736477, | |
| "num_tokens": 62446402.0, | |
| "step": 916 | |
| }, | |
| { | |
| "epoch": 1.8533872598584429, | |
| "grad_norm": 0.152323380112648, | |
| "learning_rate": 0.00016176917564253679, | |
| "loss": 0.231, | |
| "mean_token_accuracy": 0.9163475334644318, | |
| "num_tokens": 62521000.0, | |
| "step": 917 | |
| }, | |
| { | |
| "epoch": 1.8554095045500505, | |
| "grad_norm": 0.1929645538330078, | |
| "learning_rate": 0.00016139773219771186, | |
| "loss": 0.2972, | |
| "mean_token_accuracy": 0.8963135108351707, | |
| "num_tokens": 62582288.0, | |
| "step": 918 | |
| }, | |
| { | |
| "epoch": 1.857431749241658, | |
| "grad_norm": 0.14357374608516693, | |
| "learning_rate": 0.00016102656767889522, | |
| "loss": 0.2525, | |
| "mean_token_accuracy": 0.905899915844202, | |
| "num_tokens": 62659214.0, | |
| "step": 919 | |
| }, | |
| { | |
| "epoch": 1.859453993933266, | |
| "grad_norm": 0.18147152662277222, | |
| "learning_rate": 0.00016065568385269834, | |
| "loss": 0.3062, | |
| "mean_token_accuracy": 0.891651626676321, | |
| "num_tokens": 62721172.0, | |
| "step": 920 | |
| }, | |
| { | |
| "epoch": 1.8614762386248738, | |
| "grad_norm": 0.15798717737197876, | |
| "learning_rate": 0.0001602850824843967, | |
| "loss": 0.2441, | |
| "mean_token_accuracy": 0.9044736139476299, | |
| "num_tokens": 62790693.0, | |
| "step": 921 | |
| }, | |
| { | |
| "epoch": 1.8634984833164814, | |
| "grad_norm": 0.20527228713035583, | |
| "learning_rate": 0.00015991476533792125, | |
| "loss": 0.2862, | |
| "mean_token_accuracy": 0.9020938500761986, | |
| "num_tokens": 62861978.0, | |
| "step": 922 | |
| }, | |
| { | |
| "epoch": 1.865520728008089, | |
| "grad_norm": 0.17211146652698517, | |
| "learning_rate": 0.00015954473417585042, | |
| "loss": 0.278, | |
| "mean_token_accuracy": 0.901647973805666, | |
| "num_tokens": 62928176.0, | |
| "step": 923 | |
| }, | |
| { | |
| "epoch": 1.8675429726996966, | |
| "grad_norm": 0.14656521379947662, | |
| "learning_rate": 0.00015917499075940116, | |
| "loss": 0.2436, | |
| "mean_token_accuracy": 0.9071595072746277, | |
| "num_tokens": 63008955.0, | |
| "step": 924 | |
| }, | |
| { | |
| "epoch": 1.8695652173913042, | |
| "grad_norm": 0.16858512163162231, | |
| "learning_rate": 0.000158805536848421, | |
| "loss": 0.2838, | |
| "mean_token_accuracy": 0.8976234942674637, | |
| "num_tokens": 63077352.0, | |
| "step": 925 | |
| }, | |
| { | |
| "epoch": 1.871587462082912, | |
| "grad_norm": 0.15110129117965698, | |
| "learning_rate": 0.00015843637420137965, | |
| "loss": 0.2491, | |
| "mean_token_accuracy": 0.9075470231473446, | |
| "num_tokens": 63155136.0, | |
| "step": 926 | |
| }, | |
| { | |
| "epoch": 1.8736097067745199, | |
| "grad_norm": 0.16917841136455536, | |
| "learning_rate": 0.00015806750457536016, | |
| "loss": 0.2777, | |
| "mean_token_accuracy": 0.9005607068538666, | |
| "num_tokens": 63228469.0, | |
| "step": 927 | |
| }, | |
| { | |
| "epoch": 1.8756319514661275, | |
| "grad_norm": 0.15289200842380524, | |
| "learning_rate": 0.00015769892972605125, | |
| "loss": 0.2535, | |
| "mean_token_accuracy": 0.9035063087940216, | |
| "num_tokens": 63299131.0, | |
| "step": 928 | |
| }, | |
| { | |
| "epoch": 1.877654196157735, | |
| "grad_norm": 0.16520720720291138, | |
| "learning_rate": 0.00015733065140773845, | |
| "loss": 0.2742, | |
| "mean_token_accuracy": 0.9034424312412739, | |
| "num_tokens": 63370295.0, | |
| "step": 929 | |
| }, | |
| { | |
| "epoch": 1.8796764408493427, | |
| "grad_norm": 0.15712064504623413, | |
| "learning_rate": 0.00015696267137329584, | |
| "loss": 0.2667, | |
| "mean_token_accuracy": 0.9040120244026184, | |
| "num_tokens": 63437736.0, | |
| "step": 930 | |
| }, | |
| { | |
| "epoch": 1.8816986855409503, | |
| "grad_norm": 0.1603911817073822, | |
| "learning_rate": 0.00015659499137417798, | |
| "loss": 0.2507, | |
| "mean_token_accuracy": 0.9087044671177864, | |
| "num_tokens": 63509676.0, | |
| "step": 931 | |
| }, | |
| { | |
| "epoch": 1.8837209302325582, | |
| "grad_norm": 0.16669879853725433, | |
| "learning_rate": 0.00015622761316041114, | |
| "loss": 0.278, | |
| "mean_token_accuracy": 0.8977540507912636, | |
| "num_tokens": 63576754.0, | |
| "step": 932 | |
| }, | |
| { | |
| "epoch": 1.885743174924166, | |
| "grad_norm": 0.17182767391204834, | |
| "learning_rate": 0.00015586053848058536, | |
| "loss": 0.2526, | |
| "mean_token_accuracy": 0.9016401395201683, | |
| "num_tokens": 63643843.0, | |
| "step": 933 | |
| }, | |
| { | |
| "epoch": 1.8877654196157736, | |
| "grad_norm": 0.17400912940502167, | |
| "learning_rate": 0.00015549376908184596, | |
| "loss": 0.282, | |
| "mean_token_accuracy": 0.8970470912754536, | |
| "num_tokens": 63712033.0, | |
| "step": 934 | |
| }, | |
| { | |
| "epoch": 1.8897876643073812, | |
| "grad_norm": 0.16362541913986206, | |
| "learning_rate": 0.00015512730670988508, | |
| "loss": 0.2794, | |
| "mean_token_accuracy": 0.9033955708146095, | |
| "num_tokens": 63783615.0, | |
| "step": 935 | |
| }, | |
| { | |
| "epoch": 1.8918099089989888, | |
| "grad_norm": 0.20413319766521454, | |
| "learning_rate": 0.00015476115310893374, | |
| "loss": 0.2973, | |
| "mean_token_accuracy": 0.8986438475549221, | |
| "num_tokens": 63837579.0, | |
| "step": 936 | |
| }, | |
| { | |
| "epoch": 1.8938321536905964, | |
| "grad_norm": 0.173280730843544, | |
| "learning_rate": 0.00015439531002175305, | |
| "loss": 0.2614, | |
| "mean_token_accuracy": 0.9053931087255478, | |
| "num_tokens": 63904296.0, | |
| "step": 937 | |
| }, | |
| { | |
| "epoch": 1.8958543983822043, | |
| "grad_norm": 0.16067558526992798, | |
| "learning_rate": 0.00015402977918962653, | |
| "loss": 0.2688, | |
| "mean_token_accuracy": 0.905962623655796, | |
| "num_tokens": 63982577.0, | |
| "step": 938 | |
| }, | |
| { | |
| "epoch": 1.897876643073812, | |
| "grad_norm": 0.18021517992019653, | |
| "learning_rate": 0.00015366456235235113, | |
| "loss": 0.2935, | |
| "mean_token_accuracy": 0.8951955139636993, | |
| "num_tokens": 64038048.0, | |
| "step": 939 | |
| }, | |
| { | |
| "epoch": 1.8998988877654197, | |
| "grad_norm": 0.14851278066635132, | |
| "learning_rate": 0.0001532996612482295, | |
| "loss": 0.2661, | |
| "mean_token_accuracy": 0.9066961444914341, | |
| "num_tokens": 64113768.0, | |
| "step": 940 | |
| }, | |
| { | |
| "epoch": 1.9019211324570273, | |
| "grad_norm": 0.17288359999656677, | |
| "learning_rate": 0.00015293507761406148, | |
| "loss": 0.271, | |
| "mean_token_accuracy": 0.9030660726130009, | |
| "num_tokens": 64178434.0, | |
| "step": 941 | |
| }, | |
| { | |
| "epoch": 1.903943377148635, | |
| "grad_norm": 0.16324573755264282, | |
| "learning_rate": 0.00015257081318513583, | |
| "loss": 0.274, | |
| "mean_token_accuracy": 0.9019493535161018, | |
| "num_tokens": 64249882.0, | |
| "step": 942 | |
| }, | |
| { | |
| "epoch": 1.9059656218402425, | |
| "grad_norm": 0.15509222447872162, | |
| "learning_rate": 0.0001522068696952221, | |
| "loss": 0.2354, | |
| "mean_token_accuracy": 0.9143304452300072, | |
| "num_tokens": 64322937.0, | |
| "step": 943 | |
| }, | |
| { | |
| "epoch": 1.9079878665318504, | |
| "grad_norm": 0.1547105610370636, | |
| "learning_rate": 0.00015184324887656208, | |
| "loss": 0.2553, | |
| "mean_token_accuracy": 0.9079734869301319, | |
| "num_tokens": 64393253.0, | |
| "step": 944 | |
| }, | |
| { | |
| "epoch": 1.910010111223458, | |
| "grad_norm": 0.15001994371414185, | |
| "learning_rate": 0.00015147995245986203, | |
| "loss": 0.2549, | |
| "mean_token_accuracy": 0.9065254330635071, | |
| "num_tokens": 64470294.0, | |
| "step": 945 | |
| }, | |
| { | |
| "epoch": 1.9120323559150658, | |
| "grad_norm": 0.17263031005859375, | |
| "learning_rate": 0.00015111698217428385, | |
| "loss": 0.2766, | |
| "mean_token_accuracy": 0.9019508697092533, | |
| "num_tokens": 64541359.0, | |
| "step": 946 | |
| }, | |
| { | |
| "epoch": 1.9140546006066734, | |
| "grad_norm": 0.19937334954738617, | |
| "learning_rate": 0.0001507543397474375, | |
| "loss": 0.2893, | |
| "mean_token_accuracy": 0.8960909508168697, | |
| "num_tokens": 64601687.0, | |
| "step": 947 | |
| }, | |
| { | |
| "epoch": 1.916076845298281, | |
| "grad_norm": 0.20299410820007324, | |
| "learning_rate": 0.00015039202690537233, | |
| "loss": 0.2875, | |
| "mean_token_accuracy": 0.8969489298760891, | |
| "num_tokens": 64662730.0, | |
| "step": 948 | |
| }, | |
| { | |
| "epoch": 1.9180990899898887, | |
| "grad_norm": 0.17673259973526, | |
| "learning_rate": 0.0001500300453725688, | |
| "loss": 0.285, | |
| "mean_token_accuracy": 0.8983747102320194, | |
| "num_tokens": 64726699.0, | |
| "step": 949 | |
| }, | |
| { | |
| "epoch": 1.9201213346814965, | |
| "grad_norm": 0.14203934371471405, | |
| "learning_rate": 0.00014966839687193074, | |
| "loss": 0.2413, | |
| "mean_token_accuracy": 0.9119373075664043, | |
| "num_tokens": 64804474.0, | |
| "step": 950 | |
| }, | |
| { | |
| "epoch": 1.922143579373104, | |
| "grad_norm": 0.18115116655826569, | |
| "learning_rate": 0.0001493070831247767, | |
| "loss": 0.2618, | |
| "mean_token_accuracy": 0.9050916060805321, | |
| "num_tokens": 64867023.0, | |
| "step": 951 | |
| }, | |
| { | |
| "epoch": 1.924165824064712, | |
| "grad_norm": 0.15658792853355408, | |
| "learning_rate": 0.00014894610585083196, | |
| "loss": 0.2539, | |
| "mean_token_accuracy": 0.9065564014017582, | |
| "num_tokens": 64933593.0, | |
| "step": 952 | |
| }, | |
| { | |
| "epoch": 1.9261880687563195, | |
| "grad_norm": 0.18066135048866272, | |
| "learning_rate": 0.00014858546676822023, | |
| "loss": 0.2731, | |
| "mean_token_accuracy": 0.9004339128732681, | |
| "num_tokens": 64997732.0, | |
| "step": 953 | |
| }, | |
| { | |
| "epoch": 1.9282103134479271, | |
| "grad_norm": 0.15237212181091309, | |
| "learning_rate": 0.0001482251675934557, | |
| "loss": 0.2476, | |
| "mean_token_accuracy": 0.9087250605225563, | |
| "num_tokens": 65080000.0, | |
| "step": 954 | |
| }, | |
| { | |
| "epoch": 1.9302325581395348, | |
| "grad_norm": 0.18067006766796112, | |
| "learning_rate": 0.00014786521004143467, | |
| "loss": 0.2712, | |
| "mean_token_accuracy": 0.9025260508060455, | |
| "num_tokens": 65148696.0, | |
| "step": 955 | |
| }, | |
| { | |
| "epoch": 1.9322548028311426, | |
| "grad_norm": 0.15837518870830536, | |
| "learning_rate": 0.00014750559582542736, | |
| "loss": 0.2606, | |
| "mean_token_accuracy": 0.9080248959362507, | |
| "num_tokens": 65223230.0, | |
| "step": 956 | |
| }, | |
| { | |
| "epoch": 1.9342770475227502, | |
| "grad_norm": 0.16518649458885193, | |
| "learning_rate": 0.00014714632665706985, | |
| "loss": 0.2539, | |
| "mean_token_accuracy": 0.9098630361258984, | |
| "num_tokens": 65292846.0, | |
| "step": 957 | |
| }, | |
| { | |
| "epoch": 1.936299292214358, | |
| "grad_norm": 0.18779224157333374, | |
| "learning_rate": 0.000146787404246356, | |
| "loss": 0.282, | |
| "mean_token_accuracy": 0.8994725160300732, | |
| "num_tokens": 65354948.0, | |
| "step": 958 | |
| }, | |
| { | |
| "epoch": 1.9383215369059656, | |
| "grad_norm": 0.16804009675979614, | |
| "learning_rate": 0.0001464288303016292, | |
| "loss": 0.2521, | |
| "mean_token_accuracy": 0.9077105298638344, | |
| "num_tokens": 65425082.0, | |
| "step": 959 | |
| }, | |
| { | |
| "epoch": 1.9403437815975733, | |
| "grad_norm": 0.17569729685783386, | |
| "learning_rate": 0.00014607060652957414, | |
| "loss": 0.2914, | |
| "mean_token_accuracy": 0.8924459666013718, | |
| "num_tokens": 65491402.0, | |
| "step": 960 | |
| }, | |
| { | |
| "epoch": 1.9423660262891809, | |
| "grad_norm": 0.14672434329986572, | |
| "learning_rate": 0.00014571273463520897, | |
| "loss": 0.2628, | |
| "mean_token_accuracy": 0.9076977856457233, | |
| "num_tokens": 65563535.0, | |
| "step": 961 | |
| }, | |
| { | |
| "epoch": 1.9443882709807887, | |
| "grad_norm": 0.1623447835445404, | |
| "learning_rate": 0.00014535521632187703, | |
| "loss": 0.2759, | |
| "mean_token_accuracy": 0.9029062166810036, | |
| "num_tokens": 65629601.0, | |
| "step": 962 | |
| }, | |
| { | |
| "epoch": 1.9464105156723963, | |
| "grad_norm": 0.1764685958623886, | |
| "learning_rate": 0.00014499805329123858, | |
| "loss": 0.3043, | |
| "mean_token_accuracy": 0.8929594941437244, | |
| "num_tokens": 65689192.0, | |
| "step": 963 | |
| }, | |
| { | |
| "epoch": 1.9484327603640041, | |
| "grad_norm": 0.1544012725353241, | |
| "learning_rate": 0.000144641247243263, | |
| "loss": 0.2664, | |
| "mean_token_accuracy": 0.9056011252105236, | |
| "num_tokens": 65761600.0, | |
| "step": 964 | |
| }, | |
| { | |
| "epoch": 1.9504550050556118, | |
| "grad_norm": 0.17178235948085785, | |
| "learning_rate": 0.00014428479987622055, | |
| "loss": 0.2688, | |
| "mean_token_accuracy": 0.9024265073239803, | |
| "num_tokens": 65824048.0, | |
| "step": 965 | |
| }, | |
| { | |
| "epoch": 1.9524772497472194, | |
| "grad_norm": 0.17977994680404663, | |
| "learning_rate": 0.00014392871288667415, | |
| "loss": 0.2762, | |
| "mean_token_accuracy": 0.9031669199466705, | |
| "num_tokens": 65889268.0, | |
| "step": 966 | |
| }, | |
| { | |
| "epoch": 1.954499494438827, | |
| "grad_norm": 0.15329943597316742, | |
| "learning_rate": 0.00014357298796947168, | |
| "loss": 0.2841, | |
| "mean_token_accuracy": 0.8999549075961113, | |
| "num_tokens": 65961165.0, | |
| "step": 967 | |
| }, | |
| { | |
| "epoch": 1.9565217391304348, | |
| "grad_norm": 0.17066965997219086, | |
| "learning_rate": 0.00014321762681773762, | |
| "loss": 0.2636, | |
| "mean_token_accuracy": 0.9027245566248894, | |
| "num_tokens": 66022951.0, | |
| "step": 968 | |
| }, | |
| { | |
| "epoch": 1.9585439838220424, | |
| "grad_norm": 0.1542961746454239, | |
| "learning_rate": 0.00014286263112286472, | |
| "loss": 0.2441, | |
| "mean_token_accuracy": 0.9134857915341854, | |
| "num_tokens": 66099844.0, | |
| "step": 969 | |
| }, | |
| { | |
| "epoch": 1.9605662285136503, | |
| "grad_norm": 0.17265184223651886, | |
| "learning_rate": 0.00014250800257450684, | |
| "loss": 0.2797, | |
| "mean_token_accuracy": 0.9043730795383453, | |
| "num_tokens": 66173153.0, | |
| "step": 970 | |
| }, | |
| { | |
| "epoch": 1.9625884732052579, | |
| "grad_norm": 0.1839493066072464, | |
| "learning_rate": 0.00014215374286057005, | |
| "loss": 0.2908, | |
| "mean_token_accuracy": 0.8951999023556709, | |
| "num_tokens": 66234689.0, | |
| "step": 971 | |
| }, | |
| { | |
| "epoch": 1.9646107178968655, | |
| "grad_norm": 0.15913142263889313, | |
| "learning_rate": 0.00014179985366720495, | |
| "loss": 0.2837, | |
| "mean_token_accuracy": 0.9044655375182629, | |
| "num_tokens": 66305941.0, | |
| "step": 972 | |
| }, | |
| { | |
| "epoch": 1.966632962588473, | |
| "grad_norm": 0.13867108523845673, | |
| "learning_rate": 0.0001414463366787984, | |
| "loss": 0.216, | |
| "mean_token_accuracy": 0.9169092550873756, | |
| "num_tokens": 66381037.0, | |
| "step": 973 | |
| }, | |
| { | |
| "epoch": 1.968655207280081, | |
| "grad_norm": 0.1802113801240921, | |
| "learning_rate": 0.00014109319357796606, | |
| "loss": 0.3038, | |
| "mean_token_accuracy": 0.893009040504694, | |
| "num_tokens": 66440797.0, | |
| "step": 974 | |
| }, | |
| { | |
| "epoch": 1.9706774519716885, | |
| "grad_norm": 0.17021583020687103, | |
| "learning_rate": 0.00014074042604554374, | |
| "loss": 0.2733, | |
| "mean_token_accuracy": 0.9027226865291595, | |
| "num_tokens": 66505699.0, | |
| "step": 975 | |
| }, | |
| { | |
| "epoch": 1.9726996966632964, | |
| "grad_norm": 0.19243739545345306, | |
| "learning_rate": 0.00014038803576057985, | |
| "loss": 0.3087, | |
| "mean_token_accuracy": 0.8891540095210075, | |
| "num_tokens": 66567155.0, | |
| "step": 976 | |
| }, | |
| { | |
| "epoch": 1.974721941354904, | |
| "grad_norm": 0.15427738428115845, | |
| "learning_rate": 0.00014003602440032693, | |
| "loss": 0.3055, | |
| "mean_token_accuracy": 0.8956649079918861, | |
| "num_tokens": 66644385.0, | |
| "step": 977 | |
| }, | |
| { | |
| "epoch": 1.9767441860465116, | |
| "grad_norm": 0.15167449414730072, | |
| "learning_rate": 0.00013968439364023442, | |
| "loss": 0.2612, | |
| "mean_token_accuracy": 0.9073714017868042, | |
| "num_tokens": 66714503.0, | |
| "step": 978 | |
| }, | |
| { | |
| "epoch": 1.9787664307381192, | |
| "grad_norm": 0.1547987014055252, | |
| "learning_rate": 0.00013933314515393995, | |
| "loss": 0.2721, | |
| "mean_token_accuracy": 0.9024667181074619, | |
| "num_tokens": 66779572.0, | |
| "step": 979 | |
| }, | |
| { | |
| "epoch": 1.980788675429727, | |
| "grad_norm": 0.14774559438228607, | |
| "learning_rate": 0.0001389822806132617, | |
| "loss": 0.2571, | |
| "mean_token_accuracy": 0.907380323857069, | |
| "num_tokens": 66855257.0, | |
| "step": 980 | |
| }, | |
| { | |
| "epoch": 1.9828109201213346, | |
| "grad_norm": 0.16043910384178162, | |
| "learning_rate": 0.00013863180168819048, | |
| "loss": 0.2523, | |
| "mean_token_accuracy": 0.9105048142373562, | |
| "num_tokens": 66919243.0, | |
| "step": 981 | |
| }, | |
| { | |
| "epoch": 1.9848331648129425, | |
| "grad_norm": 0.18005625903606415, | |
| "learning_rate": 0.0001382817100468816, | |
| "loss": 0.2914, | |
| "mean_token_accuracy": 0.8974611833691597, | |
| "num_tokens": 66987494.0, | |
| "step": 982 | |
| }, | |
| { | |
| "epoch": 1.98685540950455, | |
| "grad_norm": 0.1641789674758911, | |
| "learning_rate": 0.00013793200735564716, | |
| "loss": 0.2826, | |
| "mean_token_accuracy": 0.8971075974404812, | |
| "num_tokens": 67053248.0, | |
| "step": 983 | |
| }, | |
| { | |
| "epoch": 1.9888776541961577, | |
| "grad_norm": 0.15405279397964478, | |
| "learning_rate": 0.00013758269527894778, | |
| "loss": 0.2559, | |
| "mean_token_accuracy": 0.9002925455570221, | |
| "num_tokens": 67124156.0, | |
| "step": 984 | |
| }, | |
| { | |
| "epoch": 1.9908998988877653, | |
| "grad_norm": 0.15380239486694336, | |
| "learning_rate": 0.00013723377547938522, | |
| "loss": 0.2569, | |
| "mean_token_accuracy": 0.9101624749600887, | |
| "num_tokens": 67196079.0, | |
| "step": 985 | |
| }, | |
| { | |
| "epoch": 1.9929221435793731, | |
| "grad_norm": 0.16303087770938873, | |
| "learning_rate": 0.00013688524961769395, | |
| "loss": 0.2669, | |
| "mean_token_accuracy": 0.9014462493360043, | |
| "num_tokens": 67264659.0, | |
| "step": 986 | |
| }, | |
| { | |
| "epoch": 1.9949443882709808, | |
| "grad_norm": 0.17709141969680786, | |
| "learning_rate": 0.00013653711935273326, | |
| "loss": 0.2831, | |
| "mean_token_accuracy": 0.9000302441418171, | |
| "num_tokens": 67326567.0, | |
| "step": 987 | |
| }, | |
| { | |
| "epoch": 1.9969666329625886, | |
| "grad_norm": 0.16408245265483856, | |
| "learning_rate": 0.00013618938634147996, | |
| "loss": 0.2563, | |
| "mean_token_accuracy": 0.9081169851124287, | |
| "num_tokens": 67393706.0, | |
| "step": 988 | |
| }, | |
| { | |
| "epoch": 1.9989888776541962, | |
| "grad_norm": 0.1415863037109375, | |
| "learning_rate": 0.00013584205223901976, | |
| "loss": 0.2386, | |
| "mean_token_accuracy": 0.909894797950983, | |
| "num_tokens": 67472677.0, | |
| "step": 989 | |
| }, | |
| { | |
| "epoch": 2.0, | |
| "grad_norm": 0.19028227031230927, | |
| "learning_rate": 0.00013549511869853973, | |
| "loss": 0.2248, | |
| "mean_token_accuracy": 0.9155159220099449, | |
| "num_tokens": 67511282.0, | |
| "step": 990 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 1485, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 8.243396605005267e+18, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |