{
"best_global_step": 2360,
"best_metric": 0.71119624376297,
"best_model_checkpoint": "/mnt/shared-storage-user/zhangchenhao/work/LLaMA-Factory-own/LLaMA-Factory/saves/SFT_StepCount_all_with_plus_without_point_reasoning_optimized_no_prompt_answer80_point2p5/checkpoint-2360",
"epoch": 3.0,
"eval_steps": 295,
"global_step": 3537,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.004244031830238726,
"grad_norm": 297.75304987581995,
"learning_rate": 5.649717514124295e-07,
"loss": 5.8629,
"num_input_tokens_seen": 4530624,
"step": 5,
"train_runtime": 653.1761,
"train_tokens_per_second": 6936.298
},
{
"epoch": 0.008488063660477453,
"grad_norm": 131.78345996041816,
"learning_rate": 1.2711864406779662e-06,
"loss": 4.6976,
"num_input_tokens_seen": 9034496,
"step": 10,
"train_runtime": 1273.69,
"train_tokens_per_second": 7093.167
},
{
"epoch": 0.01273209549071618,
"grad_norm": 41.38402483800692,
"learning_rate": 1.977401129943503e-06,
"loss": 3.047,
"num_input_tokens_seen": 13435712,
"step": 15,
"train_runtime": 1887.0646,
"train_tokens_per_second": 7119.9
},
{
"epoch": 0.016976127320954906,
"grad_norm": 30.090022612502594,
"learning_rate": 2.6836158192090396e-06,
"loss": 2.1792,
"num_input_tokens_seen": 17831104,
"step": 20,
"train_runtime": 2539.5416,
"train_tokens_per_second": 7021.387
},
{
"epoch": 0.021220159151193633,
"grad_norm": 10.507378327830093,
"learning_rate": 3.3898305084745763e-06,
"loss": 1.806,
"num_input_tokens_seen": 22489280,
"step": 25,
"train_runtime": 3223.2496,
"train_tokens_per_second": 6977.207
},
{
"epoch": 0.02546419098143236,
"grad_norm": 7.849105701004419,
"learning_rate": 4.096045197740113e-06,
"loss": 1.5334,
"num_input_tokens_seen": 26938368,
"step": 30,
"train_runtime": 3838.47,
"train_tokens_per_second": 7017.996
},
{
"epoch": 0.029708222811671087,
"grad_norm": 7.864574931572695,
"learning_rate": 4.80225988700565e-06,
"loss": 1.377,
"num_input_tokens_seen": 31539968,
"step": 35,
"train_runtime": 4503.6915,
"train_tokens_per_second": 7003.137
},
{
"epoch": 0.03395225464190981,
"grad_norm": 7.3747833148860025,
"learning_rate": 5.508474576271187e-06,
"loss": 1.3089,
"num_input_tokens_seen": 35979392,
"step": 40,
"train_runtime": 5147.1318,
"train_tokens_per_second": 6990.183
},
{
"epoch": 0.03819628647214854,
"grad_norm": 8.361663447491122,
"learning_rate": 6.214689265536724e-06,
"loss": 1.2385,
"num_input_tokens_seen": 40533440,
"step": 45,
"train_runtime": 5799.5796,
"train_tokens_per_second": 6989.031
},
{
"epoch": 0.042440318302387266,
"grad_norm": 7.700383821460495,
"learning_rate": 6.92090395480226e-06,
"loss": 1.1772,
"num_input_tokens_seen": 45172288,
"step": 50,
"train_runtime": 6470.1331,
"train_tokens_per_second": 6981.663
},
{
"epoch": 0.04668435013262599,
"grad_norm": 7.153074169274688,
"learning_rate": 7.627118644067798e-06,
"loss": 1.1281,
"num_input_tokens_seen": 49855040,
"step": 55,
"train_runtime": 7138.9239,
"train_tokens_per_second": 6983.551
},
{
"epoch": 0.05092838196286472,
"grad_norm": 5.863677860447595,
"learning_rate": 8.333333333333334e-06,
"loss": 1.0659,
"num_input_tokens_seen": 54260032,
"step": 60,
"train_runtime": 7733.863,
"train_tokens_per_second": 7015.903
},
{
"epoch": 0.05517241379310345,
"grad_norm": 5.211482731988629,
"learning_rate": 9.039548022598871e-06,
"loss": 0.992,
"num_input_tokens_seen": 58804672,
"step": 65,
"train_runtime": 8385.1953,
"train_tokens_per_second": 7012.916
},
{
"epoch": 0.059416445623342175,
"grad_norm": 3.0025030899721137,
"learning_rate": 9.745762711864407e-06,
"loss": 0.9647,
"num_input_tokens_seen": 63162880,
"step": 70,
"train_runtime": 9001.3827,
"train_tokens_per_second": 7017.02
},
{
"epoch": 0.0636604774535809,
"grad_norm": 1.999806623127411,
"learning_rate": 1.0451977401129943e-05,
"loss": 0.9392,
"num_input_tokens_seen": 67642560,
"step": 75,
"train_runtime": 9659.9049,
"train_tokens_per_second": 7002.404
},
{
"epoch": 0.06790450928381962,
"grad_norm": 2.1507559480146172,
"learning_rate": 1.115819209039548e-05,
"loss": 0.9183,
"num_input_tokens_seen": 72091264,
"step": 80,
"train_runtime": 10244.6396,
"train_tokens_per_second": 7036.974
},
{
"epoch": 0.07214854111405836,
"grad_norm": 2.2495505770555297,
"learning_rate": 1.1864406779661018e-05,
"loss": 0.9072,
"num_input_tokens_seen": 76507136,
"step": 85,
"train_runtime": 10853.5827,
"train_tokens_per_second": 7049.021
},
{
"epoch": 0.07639257294429708,
"grad_norm": 2.121224741489407,
"learning_rate": 1.2570621468926556e-05,
"loss": 0.9108,
"num_input_tokens_seen": 80870400,
"step": 90,
"train_runtime": 11472.3492,
"train_tokens_per_second": 7049.158
},
{
"epoch": 0.08063660477453581,
"grad_norm": 2.588502611123473,
"learning_rate": 1.3276836158192092e-05,
"loss": 0.8924,
"num_input_tokens_seen": 85329024,
"step": 95,
"train_runtime": 12124.6375,
"train_tokens_per_second": 7037.656
},
{
"epoch": 0.08488063660477453,
"grad_norm": 2.168204787290024,
"learning_rate": 1.3983050847457627e-05,
"loss": 0.9035,
"num_input_tokens_seen": 89666944,
"step": 100,
"train_runtime": 12721.9573,
"train_tokens_per_second": 7048.203
},
{
"epoch": 0.08912466843501327,
"grad_norm": 2.039908089915845,
"learning_rate": 1.4689265536723165e-05,
"loss": 0.8925,
"num_input_tokens_seen": 94089920,
"step": 105,
"train_runtime": 13381.4037,
"train_tokens_per_second": 7031.394
},
{
"epoch": 0.09336870026525199,
"grad_norm": 120.4758045680371,
"learning_rate": 1.5395480225988703e-05,
"loss": 0.909,
"num_input_tokens_seen": 98437760,
"step": 110,
"train_runtime": 14013.3414,
"train_tokens_per_second": 7024.574
},
{
"epoch": 0.09761273209549072,
"grad_norm": 1.953705157702643,
"learning_rate": 1.6101694915254237e-05,
"loss": 0.9098,
"num_input_tokens_seen": 103115456,
"step": 115,
"train_runtime": 14689.3018,
"train_tokens_per_second": 7019.766
},
{
"epoch": 0.10185676392572944,
"grad_norm": 1.548682015852324,
"learning_rate": 1.6807909604519774e-05,
"loss": 0.8787,
"num_input_tokens_seen": 107696768,
"step": 120,
"train_runtime": 15307.4003,
"train_tokens_per_second": 7035.601
},
{
"epoch": 0.10610079575596817,
"grad_norm": 1.4155432758105517,
"learning_rate": 1.7514124293785312e-05,
"loss": 0.8971,
"num_input_tokens_seen": 112245632,
"step": 125,
"train_runtime": 15954.3044,
"train_tokens_per_second": 7035.445
},
{
"epoch": 0.1103448275862069,
"grad_norm": 1.5268658891716769,
"learning_rate": 1.8220338983050846e-05,
"loss": 0.8735,
"num_input_tokens_seen": 116619392,
"step": 130,
"train_runtime": 16586.124,
"train_tokens_per_second": 7031.142
},
{
"epoch": 0.11458885941644563,
"grad_norm": 1.4550371590475242,
"learning_rate": 1.8926553672316387e-05,
"loss": 0.8871,
"num_input_tokens_seen": 121198656,
"step": 135,
"train_runtime": 17239.014,
"train_tokens_per_second": 7030.487
},
{
"epoch": 0.11883289124668435,
"grad_norm": 1.4816395671279814,
"learning_rate": 1.963276836158192e-05,
"loss": 0.8735,
"num_input_tokens_seen": 125766592,
"step": 140,
"train_runtime": 17856.4497,
"train_tokens_per_second": 7043.203
},
{
"epoch": 0.12307692307692308,
"grad_norm": 1.8562529667931933,
"learning_rate": 2.033898305084746e-05,
"loss": 0.8841,
"num_input_tokens_seen": 130417216,
"step": 145,
"train_runtime": 18488.9056,
"train_tokens_per_second": 7053.809
},
{
"epoch": 0.1273209549071618,
"grad_norm": 1.3288711166284626,
"learning_rate": 2.1045197740112996e-05,
"loss": 0.8847,
"num_input_tokens_seen": 134684288,
"step": 150,
"train_runtime": 19070.7007,
"train_tokens_per_second": 7062.367
},
{
"epoch": 0.13156498673740052,
"grad_norm": 1.716843857648477,
"learning_rate": 2.175141242937853e-05,
"loss": 0.8699,
"num_input_tokens_seen": 139319872,
"step": 155,
"train_runtime": 19757.791,
"train_tokens_per_second": 7051.389
},
{
"epoch": 0.13580901856763924,
"grad_norm": 2.015691688957,
"learning_rate": 2.245762711864407e-05,
"loss": 0.8707,
"num_input_tokens_seen": 143717952,
"step": 160,
"train_runtime": 20371.2146,
"train_tokens_per_second": 7054.953
},
{
"epoch": 0.140053050397878,
"grad_norm": 1.787168466261934,
"learning_rate": 2.3163841807909606e-05,
"loss": 0.8796,
"num_input_tokens_seen": 148245632,
"step": 165,
"train_runtime": 21001.0169,
"train_tokens_per_second": 7058.974
},
{
"epoch": 0.1442970822281167,
"grad_norm": 4.713178832164353,
"learning_rate": 2.3870056497175143e-05,
"loss": 0.8977,
"num_input_tokens_seen": 152687296,
"step": 170,
"train_runtime": 21617.8243,
"train_tokens_per_second": 7063.028
},
{
"epoch": 0.14854111405835543,
"grad_norm": 1.4590858301256864,
"learning_rate": 2.457627118644068e-05,
"loss": 0.8959,
"num_input_tokens_seen": 157314368,
"step": 175,
"train_runtime": 22299.4419,
"train_tokens_per_second": 7054.633
},
{
"epoch": 0.15278514588859415,
"grad_norm": 1.4882611183761851,
"learning_rate": 2.5282485875706215e-05,
"loss": 0.8729,
"num_input_tokens_seen": 161815808,
"step": 180,
"train_runtime": 22965.2954,
"train_tokens_per_second": 7046.102
},
{
"epoch": 0.1570291777188329,
"grad_norm": 1.2937684487451329,
"learning_rate": 2.5988700564971752e-05,
"loss": 0.8801,
"num_input_tokens_seen": 166361792,
"step": 185,
"train_runtime": 23622.1066,
"train_tokens_per_second": 7042.631
},
{
"epoch": 0.16127320954907162,
"grad_norm": 1.326371860515381,
"learning_rate": 2.669491525423729e-05,
"loss": 0.8837,
"num_input_tokens_seen": 170963072,
"step": 190,
"train_runtime": 24288.8241,
"train_tokens_per_second": 7038.755
},
{
"epoch": 0.16551724137931034,
"grad_norm": 1.3881050133378776,
"learning_rate": 2.7401129943502824e-05,
"loss": 0.8848,
"num_input_tokens_seen": 175401600,
"step": 195,
"train_runtime": 24906.6505,
"train_tokens_per_second": 7042.36
},
{
"epoch": 0.16976127320954906,
"grad_norm": 1.5698900690401176,
"learning_rate": 2.8107344632768362e-05,
"loss": 0.8687,
"num_input_tokens_seen": 179779200,
"step": 200,
"train_runtime": 25515.4439,
"train_tokens_per_second": 7045.897
},
{
"epoch": 0.1740053050397878,
"grad_norm": 1.4897036851687533,
"learning_rate": 2.88135593220339e-05,
"loss": 0.8782,
"num_input_tokens_seen": 184224512,
"step": 205,
"train_runtime": 26153.0151,
"train_tokens_per_second": 7044.102
},
{
"epoch": 0.17824933687002653,
"grad_norm": 1.82238111934615,
"learning_rate": 2.951977401129944e-05,
"loss": 0.8838,
"num_input_tokens_seen": 188802432,
"step": 210,
"train_runtime": 26835.6565,
"train_tokens_per_second": 7035.506
},
{
"epoch": 0.18249336870026525,
"grad_norm": 1.653089989083378,
"learning_rate": 3.022598870056497e-05,
"loss": 0.8724,
"num_input_tokens_seen": 193510336,
"step": 215,
"train_runtime": 27522.5514,
"train_tokens_per_second": 7030.974
},
{
"epoch": 0.18673740053050397,
"grad_norm": 1.4627059373154376,
"learning_rate": 3.093220338983051e-05,
"loss": 0.8649,
"num_input_tokens_seen": 197993536,
"step": 220,
"train_runtime": 28129.6066,
"train_tokens_per_second": 7038.617
},
{
"epoch": 0.1909814323607427,
"grad_norm": 1.247186929390262,
"learning_rate": 3.1638418079096046e-05,
"loss": 0.8707,
"num_input_tokens_seen": 202834048,
"step": 225,
"train_runtime": 28804.0632,
"train_tokens_per_second": 7041.855
},
{
"epoch": 0.19522546419098144,
"grad_norm": 1.5240142785161233,
"learning_rate": 3.234463276836158e-05,
"loss": 0.8686,
"num_input_tokens_seen": 207360320,
"step": 230,
"train_runtime": 29445.3467,
"train_tokens_per_second": 7042.21
},
{
"epoch": 0.19946949602122016,
"grad_norm": 1.186249792586507,
"learning_rate": 3.305084745762712e-05,
"loss": 0.8591,
"num_input_tokens_seen": 211972800,
"step": 235,
"train_runtime": 30125.9512,
"train_tokens_per_second": 7036.219
},
{
"epoch": 0.20371352785145888,
"grad_norm": 1.5311530250979444,
"learning_rate": 3.375706214689266e-05,
"loss": 0.88,
"num_input_tokens_seen": 216741248,
"step": 240,
"train_runtime": 30816.4459,
"train_tokens_per_second": 7033.298
},
{
"epoch": 0.2079575596816976,
"grad_norm": 1.0601153174356541,
"learning_rate": 3.446327683615819e-05,
"loss": 0.8828,
"num_input_tokens_seen": 221190080,
"step": 245,
"train_runtime": 31439.204,
"train_tokens_per_second": 7035.486
},
{
"epoch": 0.21220159151193635,
"grad_norm": 1.75722517567588,
"learning_rate": 3.516949152542373e-05,
"loss": 0.8921,
"num_input_tokens_seen": 225847168,
"step": 250,
"train_runtime": 32147.0921,
"train_tokens_per_second": 7025.431
},
{
"epoch": 0.21644562334217507,
"grad_norm": 2.0274303550036263,
"learning_rate": 3.587570621468927e-05,
"loss": 0.9012,
"num_input_tokens_seen": 230582208,
"step": 255,
"train_runtime": 32842.8641,
"train_tokens_per_second": 7020.77
},
{
"epoch": 0.2206896551724138,
"grad_norm": 1.9253331362655286,
"learning_rate": 3.6581920903954806e-05,
"loss": 0.8999,
"num_input_tokens_seen": 235274688,
"step": 260,
"train_runtime": 33539.0669,
"train_tokens_per_second": 7014.944
},
{
"epoch": 0.2249336870026525,
"grad_norm": 2.2082649940934975,
"learning_rate": 3.728813559322034e-05,
"loss": 0.9029,
"num_input_tokens_seen": 239864000,
"step": 265,
"train_runtime": 34221.492,
"train_tokens_per_second": 7009.163
},
{
"epoch": 0.22917771883289126,
"grad_norm": 1.452691571968028,
"learning_rate": 3.799435028248588e-05,
"loss": 0.9046,
"num_input_tokens_seen": 244225344,
"step": 270,
"train_runtime": 34836.4135,
"train_tokens_per_second": 7010.634
},
{
"epoch": 0.23342175066312998,
"grad_norm": 2.730577739346072,
"learning_rate": 3.8700564971751415e-05,
"loss": 0.8874,
"num_input_tokens_seen": 248726272,
"step": 275,
"train_runtime": 35499.0341,
"train_tokens_per_second": 7006.565
},
{
"epoch": 0.2376657824933687,
"grad_norm": 1.093231645373866,
"learning_rate": 3.940677966101695e-05,
"loss": 0.8809,
"num_input_tokens_seen": 253245952,
"step": 280,
"train_runtime": 36135.7808,
"train_tokens_per_second": 7008.177
},
{
"epoch": 0.24190981432360742,
"grad_norm": 1.307461678503626,
"learning_rate": 4.011299435028249e-05,
"loss": 0.8598,
"num_input_tokens_seen": 257684480,
"step": 285,
"train_runtime": 36728.042,
"train_tokens_per_second": 7016.015
},
{
"epoch": 0.24615384615384617,
"grad_norm": 1.2991784389959953,
"learning_rate": 4.0819209039548024e-05,
"loss": 0.8981,
"num_input_tokens_seen": 262108992,
"step": 290,
"train_runtime": 37341.918,
"train_tokens_per_second": 7019.163
},
{
"epoch": 0.25039787798408486,
"grad_norm": 1.4781716766460902,
"learning_rate": 4.152542372881356e-05,
"loss": 0.9007,
"num_input_tokens_seen": 266677504,
"step": 295,
"train_runtime": 38006.4439,
"train_tokens_per_second": 7016.639
},
{
"epoch": 0.25039787798408486,
"eval_loss": 0.8760802745819092,
"eval_runtime": 1055.1289,
"eval_samples_per_second": 2.888,
"eval_steps_per_second": 0.091,
"num_input_tokens_seen": 266677504,
"step": 295
},
{
"epoch": 0.2546419098143236,
"grad_norm": 1.3728608285035016,
"learning_rate": 4.22316384180791e-05,
"loss": 0.8857,
"num_input_tokens_seen": 271486592,
"step": 300,
"train_runtime": 39774.4771,
"train_tokens_per_second": 6825.648
},
{
"epoch": 0.25888594164456236,
"grad_norm": 1.3128466580163847,
"learning_rate": 4.2937853107344634e-05,
"loss": 1.0976,
"num_input_tokens_seen": 276035072,
"step": 305,
"train_runtime": 40413.7335,
"train_tokens_per_second": 6830.229
},
{
"epoch": 0.26312997347480105,
"grad_norm": 1.9380613988266078,
"learning_rate": 4.3644067796610175e-05,
"loss": 0.8879,
"num_input_tokens_seen": 280424512,
"step": 310,
"train_runtime": 41016.3815,
"train_tokens_per_second": 6836.891
},
{
"epoch": 0.2673740053050398,
"grad_norm": 1.7436380468280226,
"learning_rate": 4.435028248587571e-05,
"loss": 0.894,
"num_input_tokens_seen": 284818432,
"step": 315,
"train_runtime": 41649.8882,
"train_tokens_per_second": 6838.396
},
{
"epoch": 0.2716180371352785,
"grad_norm": 1.81753757832679,
"learning_rate": 4.505649717514124e-05,
"loss": 0.8883,
"num_input_tokens_seen": 289399424,
"step": 320,
"train_runtime": 42320.1609,
"train_tokens_per_second": 6838.335
},
{
"epoch": 0.27586206896551724,
"grad_norm": 0.9404537896048348,
"learning_rate": 4.5762711864406784e-05,
"loss": 0.9124,
"num_input_tokens_seen": 293917312,
"step": 325,
"train_runtime": 42970.7229,
"train_tokens_per_second": 6839.943
},
{
"epoch": 0.280106100795756,
"grad_norm": 2.947005859311638,
"learning_rate": 4.646892655367232e-05,
"loss": 0.8892,
"num_input_tokens_seen": 298366272,
"step": 330,
"train_runtime": 43553.8539,
"train_tokens_per_second": 6850.514
},
{
"epoch": 0.2843501326259947,
"grad_norm": 1.6174994735745432,
"learning_rate": 4.717514124293785e-05,
"loss": 0.8881,
"num_input_tokens_seen": 302869952,
"step": 335,
"train_runtime": 44211.0204,
"train_tokens_per_second": 6850.553
},
{
"epoch": 0.2885941644562334,
"grad_norm": 1.7792646072660459,
"learning_rate": 4.788135593220339e-05,
"loss": 0.8803,
"num_input_tokens_seen": 307426560,
"step": 340,
"train_runtime": 44880.4175,
"train_tokens_per_second": 6849.904
},
{
"epoch": 0.2928381962864722,
"grad_norm": 1.0669701860722622,
"learning_rate": 4.8587570621468934e-05,
"loss": 0.897,
"num_input_tokens_seen": 312164928,
"step": 345,
"train_runtime": 45578.8582,
"train_tokens_per_second": 6848.897
},
{
"epoch": 0.29708222811671087,
"grad_norm": 1.2083166811125388,
"learning_rate": 4.929378531073446e-05,
"loss": 0.9033,
"num_input_tokens_seen": 316741824,
"step": 350,
"train_runtime": 46190.9574,
"train_tokens_per_second": 6857.226
},
{
"epoch": 0.3013262599469496,
"grad_norm": 2.667811908170323,
"learning_rate": 5e-05,
"loss": 0.8758,
"num_input_tokens_seen": 321123008,
"step": 355,
"train_runtime": 46809.8332,
"train_tokens_per_second": 6860.161
},
{
"epoch": 0.3055702917771883,
"grad_norm": 1.5315598777999704,
"learning_rate": 4.999969557829892e-05,
"loss": 0.8975,
"num_input_tokens_seen": 325636416,
"step": 360,
"train_runtime": 47466.7463,
"train_tokens_per_second": 6860.306
},
{
"epoch": 0.30981432360742706,
"grad_norm": 1.0061504802501977,
"learning_rate": 4.999878232060946e-05,
"loss": 0.8919,
"num_input_tokens_seen": 330217472,
"step": 365,
"train_runtime": 48143.739,
"train_tokens_per_second": 6858.991
},
{
"epoch": 0.3140583554376658,
"grad_norm": 1.0691191893512106,
"learning_rate": 4.999726024917288e-05,
"loss": 0.8775,
"num_input_tokens_seen": 334605888,
"step": 370,
"train_runtime": 48800.4766,
"train_tokens_per_second": 6856.611
},
{
"epoch": 0.3183023872679045,
"grad_norm": 1.1646960467870506,
"learning_rate": 4.99951294010573e-05,
"loss": 0.8944,
"num_input_tokens_seen": 339190016,
"step": 375,
"train_runtime": 49427.2381,
"train_tokens_per_second": 6862.411
},
{
"epoch": 0.32254641909814324,
"grad_norm": 13.073420853243217,
"learning_rate": 4.999238982815683e-05,
"loss": 0.908,
"num_input_tokens_seen": 343751808,
"step": 380,
"train_runtime": 50079.7525,
"train_tokens_per_second": 6864.088
},
{
"epoch": 0.32679045092838194,
"grad_norm": 1.030515508528764,
"learning_rate": 4.99890415971903e-05,
"loss": 0.9207,
"num_input_tokens_seen": 348170752,
"step": 385,
"train_runtime": 50717.2414,
"train_tokens_per_second": 6864.939
},
{
"epoch": 0.3310344827586207,
"grad_norm": 2.462002678202629,
"learning_rate": 4.9985084789699645e-05,
"loss": 0.8857,
"num_input_tokens_seen": 352658368,
"step": 390,
"train_runtime": 51341.856,
"train_tokens_per_second": 6868.828
},
{
"epoch": 0.33527851458885943,
"grad_norm": 1.1178056520913622,
"learning_rate": 4.998051950204792e-05,
"loss": 0.8942,
"num_input_tokens_seen": 357241472,
"step": 395,
"train_runtime": 51991.5243,
"train_tokens_per_second": 6871.148
},
{
"epoch": 0.3395225464190981,
"grad_norm": 12.174478000037658,
"learning_rate": 4.997534584541692e-05,
"loss": 0.9033,
"num_input_tokens_seen": 361706368,
"step": 400,
"train_runtime": 52622.4401,
"train_tokens_per_second": 6873.615
},
{
"epoch": 0.3437665782493369,
"grad_norm": 1.3129250294672423,
"learning_rate": 4.996956394580453e-05,
"loss": 0.9534,
"num_input_tokens_seen": 366301824,
"step": 405,
"train_runtime": 53287.8409,
"train_tokens_per_second": 6874.023
},
{
"epoch": 0.3480106100795756,
"grad_norm": 1.2922663914137134,
"learning_rate": 4.9963173944021604e-05,
"loss": 0.907,
"num_input_tokens_seen": 370813440,
"step": 410,
"train_runtime": 53932.276,
"train_tokens_per_second": 6875.539
},
{
"epoch": 0.3522546419098143,
"grad_norm": 3.97037390684764,
"learning_rate": 4.995617599568855e-05,
"loss": 0.8908,
"num_input_tokens_seen": 375343232,
"step": 415,
"train_runtime": 54587.718,
"train_tokens_per_second": 6875.965
},
{
"epoch": 0.35649867374005306,
"grad_norm": 1.198328142782024,
"learning_rate": 4.9948570271231553e-05,
"loss": 0.8871,
"num_input_tokens_seen": 379904000,
"step": 420,
"train_runtime": 55281.7215,
"train_tokens_per_second": 6872.145
},
{
"epoch": 0.36074270557029176,
"grad_norm": 0.971040029995864,
"learning_rate": 4.9940356955878436e-05,
"loss": 0.883,
"num_input_tokens_seen": 384479488,
"step": 425,
"train_runtime": 55946.6877,
"train_tokens_per_second": 6872.248
},
{
"epoch": 0.3649867374005305,
"grad_norm": 1.1690338997234486,
"learning_rate": 4.99315362496541e-05,
"loss": 0.8915,
"num_input_tokens_seen": 389250176,
"step": 430,
"train_runtime": 56652.1149,
"train_tokens_per_second": 6870.885
},
{
"epoch": 0.36923076923076925,
"grad_norm": 1.33505203332503,
"learning_rate": 4.9922108367375695e-05,
"loss": 0.8924,
"num_input_tokens_seen": 393810688,
"step": 435,
"train_runtime": 57300.4821,
"train_tokens_per_second": 6872.729
},
{
"epoch": 0.37347480106100794,
"grad_norm": 1.049216620404627,
"learning_rate": 4.991207353864739e-05,
"loss": 0.8777,
"num_input_tokens_seen": 398511168,
"step": 440,
"train_runtime": 57988.9682,
"train_tokens_per_second": 6872.189
},
{
"epoch": 0.3777188328912467,
"grad_norm": 1.1473154938029155,
"learning_rate": 4.9901432007854744e-05,
"loss": 0.8633,
"num_input_tokens_seen": 403089152,
"step": 445,
"train_runtime": 58633.6036,
"train_tokens_per_second": 6874.712
},
{
"epoch": 0.3819628647214854,
"grad_norm": 1.4204892310138295,
"learning_rate": 4.9890184034158794e-05,
"loss": 0.8873,
"num_input_tokens_seen": 407921792,
"step": 450,
"train_runtime": 59298.1247,
"train_tokens_per_second": 6879.169
},
{
"epoch": 0.38620689655172413,
"grad_norm": 1.1936514251690153,
"learning_rate": 4.987832989148973e-05,
"loss": 0.8795,
"num_input_tokens_seen": 412324096,
"step": 455,
"train_runtime": 59926.4345,
"train_tokens_per_second": 6880.504
},
{
"epoch": 0.3904509283819629,
"grad_norm": 1.10287952418463,
"learning_rate": 4.986586986854019e-05,
"loss": 0.8606,
"num_input_tokens_seen": 416742912,
"step": 460,
"train_runtime": 60568.403,
"train_tokens_per_second": 6880.533
},
{
"epoch": 0.3946949602122016,
"grad_norm": 1.072213338502524,
"learning_rate": 4.985280426875831e-05,
"loss": 0.872,
"num_input_tokens_seen": 421138880,
"step": 465,
"train_runtime": 61260.5252,
"train_tokens_per_second": 6874.555
},
{
"epoch": 0.3989389920424403,
"grad_norm": 1.0219296197838135,
"learning_rate": 4.983913341034026e-05,
"loss": 0.8775,
"num_input_tokens_seen": 425727936,
"step": 470,
"train_runtime": 61904.3467,
"train_tokens_per_second": 6877.19
},
{
"epoch": 0.40318302387267907,
"grad_norm": 1.3843761799310907,
"learning_rate": 4.98248576262225e-05,
"loss": 0.8775,
"num_input_tokens_seen": 430157696,
"step": 475,
"train_runtime": 62556.696,
"train_tokens_per_second": 6876.285
},
{
"epoch": 0.40742705570291776,
"grad_norm": 1.1025487338096294,
"learning_rate": 4.980997726407371e-05,
"loss": 0.8504,
"num_input_tokens_seen": 434654208,
"step": 480,
"train_runtime": 63190.0546,
"train_tokens_per_second": 6878.522
},
{
"epoch": 0.4116710875331565,
"grad_norm": 1.2747087605024068,
"learning_rate": 4.979449268628632e-05,
"loss": 0.8666,
"num_input_tokens_seen": 439274752,
"step": 485,
"train_runtime": 63846.4067,
"train_tokens_per_second": 6880.18
},
{
"epoch": 0.4159151193633952,
"grad_norm": 1.1710609815467128,
"learning_rate": 4.977840426996763e-05,
"loss": 0.8805,
"num_input_tokens_seen": 443719872,
"step": 490,
"train_runtime": 64497.93,
"train_tokens_per_second": 6879.599
},
{
"epoch": 0.42015915119363395,
"grad_norm": 1.0097086187416695,
"learning_rate": 4.97617124069307e-05,
"loss": 0.8903,
"num_input_tokens_seen": 448255296,
"step": 495,
"train_runtime": 65132.19,
"train_tokens_per_second": 6882.239
},
{
"epoch": 0.4244031830238727,
"grad_norm": 37.867408485972554,
"learning_rate": 4.974441750368476e-05,
"loss": 0.8397,
"num_input_tokens_seen": 452923520,
"step": 500,
"train_runtime": 65815.3494,
"train_tokens_per_second": 6881.731
},
{
"epoch": 0.4286472148541114,
"grad_norm": 1.0330730902667171,
"learning_rate": 4.97265199814253e-05,
"loss": 0.8865,
"num_input_tokens_seen": 457377280,
"step": 505,
"train_runtime": 66451.0204,
"train_tokens_per_second": 6882.923
},
{
"epoch": 0.43289124668435014,
"grad_norm": 1.3051062489077976,
"learning_rate": 4.9708020276023874e-05,
"loss": 0.86,
"num_input_tokens_seen": 461956224,
"step": 510,
"train_runtime": 67114.9924,
"train_tokens_per_second": 6883.056
},
{
"epoch": 0.43713527851458883,
"grad_norm": 2.1187078081806012,
"learning_rate": 4.968891883801742e-05,
"loss": 0.8749,
"num_input_tokens_seen": 466374976,
"step": 515,
"train_runtime": 67739.4567,
"train_tokens_per_second": 6884.835
},
{
"epoch": 0.4413793103448276,
"grad_norm": 1.4438973622990432,
"learning_rate": 4.966921613259731e-05,
"loss": 0.871,
"num_input_tokens_seen": 470742528,
"step": 520,
"train_runtime": 68365.462,
"train_tokens_per_second": 6885.678
},
{
"epoch": 0.44562334217506633,
"grad_norm": 1.53355639196128,
"learning_rate": 4.964891263959803e-05,
"loss": 0.8369,
"num_input_tokens_seen": 475324480,
"step": 525,
"train_runtime": 69025.3358,
"train_tokens_per_second": 6886.232
},
{
"epoch": 0.449867374005305,
"grad_norm": 1.128289481595987,
"learning_rate": 4.962800885348551e-05,
"loss": 0.863,
"num_input_tokens_seen": 479877312,
"step": 530,
"train_runtime": 69684.2916,
"train_tokens_per_second": 6886.449
},
{
"epoch": 0.45411140583554377,
"grad_norm": 1.0503072430304274,
"learning_rate": 4.960650528334502e-05,
"loss": 0.8667,
"num_input_tokens_seen": 484343232,
"step": 535,
"train_runtime": 70344.4784,
"train_tokens_per_second": 6885.306
},
{
"epoch": 0.4583554376657825,
"grad_norm": 0.9545521304763791,
"learning_rate": 4.958440245286884e-05,
"loss": 0.8696,
"num_input_tokens_seen": 488876416,
"step": 540,
"train_runtime": 70972.8212,
"train_tokens_per_second": 6888.22
},
{
"epoch": 0.4625994694960212,
"grad_norm": 1.3295608584891012,
"learning_rate": 4.956170090034346e-05,
"loss": 0.8349,
"num_input_tokens_seen": 493485888,
"step": 545,
"train_runtime": 71650.7674,
"train_tokens_per_second": 6887.378
},
{
"epoch": 0.46684350132625996,
"grad_norm": 1.1735342027871698,
"learning_rate": 4.953840117863652e-05,
"loss": 0.8458,
"num_input_tokens_seen": 498090432,
"step": 550,
"train_runtime": 72292.0675,
"train_tokens_per_second": 6889.974
},
{
"epoch": 0.47108753315649865,
"grad_norm": 1.2695672366224662,
"learning_rate": 4.951450385518328e-05,
"loss": 0.8423,
"num_input_tokens_seen": 502546368,
"step": 555,
"train_runtime": 72919.7187,
"train_tokens_per_second": 6891.776
},
{
"epoch": 0.4753315649867374,
"grad_norm": 1.0194113412118773,
"learning_rate": 4.9490009511972856e-05,
"loss": 0.8536,
"num_input_tokens_seen": 507353920,
"step": 560,
"train_runtime": 73610.9277,
"train_tokens_per_second": 6892.372
},
{
"epoch": 0.47957559681697615,
"grad_norm": 1.0743184753428263,
"learning_rate": 4.9464918745534e-05,
"loss": 0.8325,
"num_input_tokens_seen": 511882560,
"step": 565,
"train_runtime": 74223.3431,
"train_tokens_per_second": 6896.517
},
{
"epoch": 0.48381962864721484,
"grad_norm": 1.1038161960566173,
"learning_rate": 4.943923216692064e-05,
"loss": 0.834,
"num_input_tokens_seen": 516353792,
"step": 570,
"train_runtime": 74834.7805,
"train_tokens_per_second": 6899.917
},
{
"epoch": 0.4880636604774536,
"grad_norm": 1.0619822713768314,
"learning_rate": 4.941295040169692e-05,
"loss": 0.8388,
"num_input_tokens_seen": 520893376,
"step": 575,
"train_runtime": 75515.5945,
"train_tokens_per_second": 6897.825
},
{
"epoch": 0.49230769230769234,
"grad_norm": 0.9968217355531681,
"learning_rate": 4.938607408992201e-05,
"loss": 0.8393,
"num_input_tokens_seen": 525369600,
"step": 580,
"train_runtime": 76150.219,
"train_tokens_per_second": 6899.121
},
{
"epoch": 0.496551724137931,
"grad_norm": 1.081156576705322,
"learning_rate": 4.9358603886134516e-05,
"loss": 0.8227,
"num_input_tokens_seen": 529878080,
"step": 585,
"train_runtime": 76814.7653,
"train_tokens_per_second": 6898.128
},
{
"epoch": 0.5007957559681697,
"grad_norm": 0.9811791489788025,
"learning_rate": 4.9330540459336536e-05,
"loss": 0.8409,
"num_input_tokens_seen": 534499648,
"step": 590,
"train_runtime": 77463.1501,
"train_tokens_per_second": 6900.051
},
{
"epoch": 0.5007957559681697,
"eval_loss": 0.8492689728736877,
"eval_runtime": 1055.1977,
"eval_samples_per_second": 2.888,
"eval_steps_per_second": 0.091,
"num_input_tokens_seen": 534499648,
"step": 590
},
{
"epoch": 0.5050397877984085,
"grad_norm": 1.0876099733444793,
"learning_rate": 4.930188449297737e-05,
"loss": 0.8384,
"num_input_tokens_seen": 538899968,
"step": 595,
"train_runtime": 79158.0223,
"train_tokens_per_second": 6807.901
},
{
"epoch": 0.5092838196286472,
"grad_norm": 0.999155054979559,
"learning_rate": 4.927263668493683e-05,
"loss": 0.8359,
"num_input_tokens_seen": 543296704,
"step": 600,
"train_runtime": 79818.1806,
"train_tokens_per_second": 6806.679
},
{
"epoch": 0.5135278514588859,
"grad_norm": 1.3228294516057693,
"learning_rate": 4.924279774750835e-05,
"loss": 0.8315,
"num_input_tokens_seen": 548007296,
"step": 605,
"train_runtime": 80478.8962,
"train_tokens_per_second": 6809.329
},
{
"epoch": 0.5177718832891247,
"grad_norm": 1.1178865175204313,
"learning_rate": 4.9212368407381515e-05,
"loss": 0.8577,
"num_input_tokens_seen": 552534656,
"step": 610,
"train_runtime": 81169.77,
"train_tokens_per_second": 6807.148
},
{
"epoch": 0.5220159151193634,
"grad_norm": 1.1717734488513787,
"learning_rate": 4.9181349405624444e-05,
"loss": 0.8449,
"num_input_tokens_seen": 557040512,
"step": 615,
"train_runtime": 81818.0476,
"train_tokens_per_second": 6808.284
},
{
"epoch": 0.5262599469496021,
"grad_norm": 1.281506485794031,
"learning_rate": 4.9149741497665724e-05,
"loss": 0.8236,
"num_input_tokens_seen": 561632640,
"step": 620,
"train_runtime": 82448.1767,
"train_tokens_per_second": 6811.947
},
{
"epoch": 0.5305039787798409,
"grad_norm": 1.2084706718767035,
"learning_rate": 4.9117545453276016e-05,
"loss": 0.8396,
"num_input_tokens_seen": 566108032,
"step": 625,
"train_runtime": 83077.9364,
"train_tokens_per_second": 6814.18
},
{
"epoch": 0.5347480106100796,
"grad_norm": 0.9983384620282137,
"learning_rate": 4.908476205654926e-05,
"loss": 0.8534,
"num_input_tokens_seen": 570380992,
"step": 630,
"train_runtime": 83704.2336,
"train_tokens_per_second": 6814.243
},
{
"epoch": 0.5389920424403183,
"grad_norm": 0.9073500421909143,
"learning_rate": 4.905139210588367e-05,
"loss": 0.8345,
"num_input_tokens_seen": 574750656,
"step": 635,
"train_runtime": 84320.0888,
"train_tokens_per_second": 6816.296
},
{
"epoch": 0.543236074270557,
"grad_norm": 0.8991742233452803,
"learning_rate": 4.9017436413962214e-05,
"loss": 0.8238,
"num_input_tokens_seen": 579340672,
"step": 640,
"train_runtime": 84974.8594,
"train_tokens_per_second": 6817.789
},
{
"epoch": 0.5474801061007958,
"grad_norm": 4.653712662762566,
"learning_rate": 4.898289580773284e-05,
"loss": 0.8246,
"num_input_tokens_seen": 583953984,
"step": 645,
"train_runtime": 85620.5078,
"train_tokens_per_second": 6820.258
},
{
"epoch": 0.5517241379310345,
"grad_norm": 1.2360522636755376,
"learning_rate": 4.8947771128388375e-05,
"loss": 0.8556,
"num_input_tokens_seen": 588613760,
"step": 650,
"train_runtime": 86304.8212,
"train_tokens_per_second": 6820.172
},
{
"epoch": 0.5559681697612732,
"grad_norm": 1.012520096736992,
"learning_rate": 4.891206323134598e-05,
"loss": 0.8536,
"num_input_tokens_seen": 593383872,
"step": 655,
"train_runtime": 87061.5623,
"train_tokens_per_second": 6815.681
},
{
"epoch": 0.560212201591512,
"grad_norm": 0.8267761442768032,
"learning_rate": 4.887577298622635e-05,
"loss": 0.8353,
"num_input_tokens_seen": 597805376,
"step": 660,
"train_runtime": 87694.9854,
"train_tokens_per_second": 6816.871
},
{
"epoch": 0.5644562334217507,
"grad_norm": 0.7725220551193656,
"learning_rate": 4.883890127683255e-05,
"loss": 0.8328,
"num_input_tokens_seen": 602285312,
"step": 665,
"train_runtime": 88327.7078,
"train_tokens_per_second": 6818.759
},
{
"epoch": 0.5687002652519894,
"grad_norm": 1.0161234053244246,
"learning_rate": 4.8801449001128455e-05,
"loss": 0.8292,
"num_input_tokens_seen": 606832384,
"step": 670,
"train_runtime": 88954.121,
"train_tokens_per_second": 6821.858
},
{
"epoch": 0.5729442970822282,
"grad_norm": 0.9710515463242312,
"learning_rate": 4.87634170712169e-05,
"loss": 0.8299,
"num_input_tokens_seen": 611343936,
"step": 675,
"train_runtime": 89596.2203,
"train_tokens_per_second": 6823.323
},
{
"epoch": 0.5771883289124669,
"grad_norm": 0.877400582973328,
"learning_rate": 4.872480641331747e-05,
"loss": 0.8233,
"num_input_tokens_seen": 615967936,
"step": 680,
"train_runtime": 90249.7743,
"train_tokens_per_second": 6825.147
},
{
"epoch": 0.5814323607427055,
"grad_norm": 1.088037474445821,
"learning_rate": 4.868561796774394e-05,
"loss": 0.834,
"num_input_tokens_seen": 620611584,
"step": 685,
"train_runtime": 90936.5491,
"train_tokens_per_second": 6824.666
},
{
"epoch": 0.5856763925729443,
"grad_norm": 1.1907419217852628,
"learning_rate": 4.8645852688881355e-05,
"loss": 0.8409,
"num_input_tokens_seen": 624875392,
"step": 690,
"train_runtime": 91496.14,
"train_tokens_per_second": 6829.527
},
{
"epoch": 0.589920424403183,
"grad_norm": 1.050185541897206,
"learning_rate": 4.860551154516285e-05,
"loss": 0.8312,
"num_input_tokens_seen": 629393280,
"step": 695,
"train_runtime": 92108.2917,
"train_tokens_per_second": 6833.188
},
{
"epoch": 0.5941644562334217,
"grad_norm": 1.010783386950393,
"learning_rate": 4.856459551904597e-05,
"loss": 0.8435,
"num_input_tokens_seen": 633982080,
"step": 700,
"train_runtime": 92773.246,
"train_tokens_per_second": 6833.674
},
{
"epoch": 0.5984084880636604,
"grad_norm": 0.8655815203324599,
"learning_rate": 4.8523105606988835e-05,
"loss": 0.8341,
"num_input_tokens_seen": 638403328,
"step": 705,
"train_runtime": 93409.0451,
"train_tokens_per_second": 6834.492
},
{
"epoch": 0.6026525198938992,
"grad_norm": 1.0223867573550975,
"learning_rate": 4.84810428194258e-05,
"loss": 0.8298,
"num_input_tokens_seen": 643227008,
"step": 710,
"train_runtime": 94093.6295,
"train_tokens_per_second": 6836.031
},
{
"epoch": 0.6068965517241379,
"grad_norm": 0.9038681896334841,
"learning_rate": 4.8438408180742894e-05,
"loss": 0.8236,
"num_input_tokens_seen": 647670528,
"step": 715,
"train_runtime": 94754.1626,
"train_tokens_per_second": 6835.273
},
{
"epoch": 0.6111405835543766,
"grad_norm": 0.8915747409475175,
"learning_rate": 4.839520272925286e-05,
"loss": 0.8321,
"num_input_tokens_seen": 652249152,
"step": 720,
"train_runtime": 95432.8253,
"train_tokens_per_second": 6834.642
},
{
"epoch": 0.6153846153846154,
"grad_norm": 0.97865694250563,
"learning_rate": 4.835142751716986e-05,
"loss": 0.8209,
"num_input_tokens_seen": 656865472,
"step": 725,
"train_runtime": 96098.174,
"train_tokens_per_second": 6835.359
},
{
"epoch": 0.6196286472148541,
"grad_norm": 1.059303689963251,
"learning_rate": 4.8307083610583846e-05,
"loss": 0.8402,
"num_input_tokens_seen": 661394048,
"step": 730,
"train_runtime": 96740.313,
"train_tokens_per_second": 6836.799
},
{
"epoch": 0.6238726790450928,
"grad_norm": 1.0997482317899427,
"learning_rate": 4.8262172089434635e-05,
"loss": 0.8172,
"num_input_tokens_seen": 666086336,
"step": 735,
"train_runtime": 97449.6172,
"train_tokens_per_second": 6835.187
},
{
"epoch": 0.6281167108753316,
"grad_norm": 1.0533663604386134,
"learning_rate": 4.8216694047485554e-05,
"loss": 0.8044,
"num_input_tokens_seen": 670654912,
"step": 740,
"train_runtime": 98101.755,
"train_tokens_per_second": 6836.319
},
{
"epoch": 0.6323607427055703,
"grad_norm": 0.9308590706085377,
"learning_rate": 4.817065059229682e-05,
"loss": 0.8285,
"num_input_tokens_seen": 675148480,
"step": 745,
"train_runtime": 98698.5544,
"train_tokens_per_second": 6840.51
},
{
"epoch": 0.636604774535809,
"grad_norm": 1.3308817219838736,
"learning_rate": 4.812404284519861e-05,
"loss": 0.8158,
"num_input_tokens_seen": 679716288,
"step": 750,
"train_runtime": 99373.7407,
"train_tokens_per_second": 6839.999
},
{
"epoch": 0.6408488063660478,
"grad_norm": 0.9405621214215828,
"learning_rate": 4.8076871941263676e-05,
"loss": 0.8149,
"num_input_tokens_seen": 684227584,
"step": 755,
"train_runtime": 100016.4536,
"train_tokens_per_second": 6841.15
},
{
"epoch": 0.6450928381962865,
"grad_norm": 0.8872116598330962,
"learning_rate": 4.8029139029279785e-05,
"loss": 0.826,
"num_input_tokens_seen": 688566720,
"step": 760,
"train_runtime": 100660.3191,
"train_tokens_per_second": 6840.498
},
{
"epoch": 0.6493368700265252,
"grad_norm": 0.9148948434386133,
"learning_rate": 4.798084527172167e-05,
"loss": 0.8166,
"num_input_tokens_seen": 693048704,
"step": 765,
"train_runtime": 101289.7336,
"train_tokens_per_second": 6842.24
},
{
"epoch": 0.6535809018567639,
"grad_norm": 1.0036481967491233,
"learning_rate": 4.793199184472274e-05,
"loss": 0.7923,
"num_input_tokens_seen": 697787008,
"step": 770,
"train_runtime": 101976.2461,
"train_tokens_per_second": 6842.643
},
{
"epoch": 0.6578249336870027,
"grad_norm": 1.2742858954042269,
"learning_rate": 4.7882579938046485e-05,
"loss": 0.8215,
"num_input_tokens_seen": 702239936,
"step": 775,
"train_runtime": 102610.041,
"train_tokens_per_second": 6843.774
},
{
"epoch": 0.6620689655172414,
"grad_norm": 0.8006816309417074,
"learning_rate": 4.783261075505743e-05,
"loss": 0.8246,
"num_input_tokens_seen": 706860352,
"step": 780,
"train_runtime": 103308.8254,
"train_tokens_per_second": 6842.207
},
{
"epoch": 0.6663129973474801,
"grad_norm": 0.965961260253803,
"learning_rate": 4.7782085512691875e-05,
"loss": 0.8038,
"num_input_tokens_seen": 711270720,
"step": 785,
"train_runtime": 103950.3781,
"train_tokens_per_second": 6842.406
},
{
"epoch": 0.6705570291777189,
"grad_norm": 1.0034852088994206,
"learning_rate": 4.7731005441428233e-05,
"loss": 0.8138,
"num_input_tokens_seen": 715836544,
"step": 790,
"train_runtime": 104619.9901,
"train_tokens_per_second": 6842.254
},
{
"epoch": 0.6748010610079576,
"grad_norm": 1.0147223046467029,
"learning_rate": 4.767937178525709e-05,
"loss": 0.8025,
"num_input_tokens_seen": 720186176,
"step": 795,
"train_runtime": 105243.2657,
"train_tokens_per_second": 6843.062
},
{
"epoch": 0.6790450928381963,
"grad_norm": 0.926457311442124,
"learning_rate": 4.7627185801650856e-05,
"loss": 0.7936,
"num_input_tokens_seen": 724606016,
"step": 800,
"train_runtime": 105887.5277,
"train_tokens_per_second": 6843.167
},
{
"epoch": 0.683289124668435,
"grad_norm": 0.8015225346610259,
"learning_rate": 4.757444876153323e-05,
"loss": 0.8111,
"num_input_tokens_seen": 729128640,
"step": 805,
"train_runtime": 106518.7609,
"train_tokens_per_second": 6845.072
},
{
"epoch": 0.6875331564986737,
"grad_norm": 0.9590554887765099,
"learning_rate": 4.752116194924816e-05,
"loss": 0.817,
"num_input_tokens_seen": 733513856,
"step": 810,
"train_runtime": 107154.5457,
"train_tokens_per_second": 6845.383
},
{
"epoch": 0.6917771883289124,
"grad_norm": 1.252406369714219,
"learning_rate": 4.746732666252861e-05,
"loss": 0.8036,
"num_input_tokens_seen": 737837376,
"step": 815,
"train_runtime": 107786.6851,
"train_tokens_per_second": 6845.348
},
{
"epoch": 0.6960212201591512,
"grad_norm": 1.1913694647387847,
"learning_rate": 4.7412944212464935e-05,
"loss": 0.8184,
"num_input_tokens_seen": 742147072,
"step": 820,
"train_runtime": 108401.4826,
"train_tokens_per_second": 6846.282
},
{
"epoch": 0.7002652519893899,
"grad_norm": 0.9763850021800689,
"learning_rate": 4.7358015923472986e-05,
"loss": 0.8044,
"num_input_tokens_seen": 747045952,
"step": 825,
"train_runtime": 109094.785,
"train_tokens_per_second": 6847.678
},
{
"epoch": 0.7045092838196286,
"grad_norm": 0.9052886472757133,
"learning_rate": 4.730254313326181e-05,
"loss": 0.8081,
"num_input_tokens_seen": 751393984,
"step": 830,
"train_runtime": 109732.5528,
"train_tokens_per_second": 6847.503
},
{
"epoch": 0.7087533156498673,
"grad_norm": 0.8281556672805458,
"learning_rate": 4.724652719280111e-05,
"loss": 0.7982,
"num_input_tokens_seen": 756075328,
"step": 835,
"train_runtime": 110411.1601,
"train_tokens_per_second": 6847.816
},
{
"epoch": 0.7129973474801061,
"grad_norm": 0.971171145956504,
"learning_rate": 4.718996946628829e-05,
"loss": 0.7825,
"num_input_tokens_seen": 760528320,
"step": 840,
"train_runtime": 111055.3652,
"train_tokens_per_second": 6848.191
},
{
"epoch": 0.7172413793103448,
"grad_norm": 0.9931373968227002,
"learning_rate": 4.713287133111533e-05,
"loss": 0.8096,
"num_input_tokens_seen": 765244928,
"step": 845,
"train_runtime": 111727.8513,
"train_tokens_per_second": 6849.187
},
{
"epoch": 0.7214854111405835,
"grad_norm": 1.2834237882216515,
"learning_rate": 4.707523417783511e-05,
"loss": 0.7948,
"num_input_tokens_seen": 769642624,
"step": 850,
"train_runtime": 112389.6421,
"train_tokens_per_second": 6847.985
},
{
"epoch": 0.7257294429708223,
"grad_norm": 0.8486465229852926,
"learning_rate": 4.701705941012767e-05,
"loss": 0.8044,
"num_input_tokens_seen": 774147136,
"step": 855,
"train_runtime": 113005.1043,
"train_tokens_per_second": 6850.55
},
{
"epoch": 0.729973474801061,
"grad_norm": 0.7791940973514704,
"learning_rate": 4.6958348444765954e-05,
"loss": 0.7998,
"num_input_tokens_seen": 778752064,
"step": 860,
"train_runtime": 113685.5974,
"train_tokens_per_second": 6850.05
},
{
"epoch": 0.7342175066312997,
"grad_norm": 1.1164402590095137,
"learning_rate": 4.689910271158131e-05,
"loss": 0.8177,
"num_input_tokens_seen": 783091968,
"step": 865,
"train_runtime": 114295.4264,
"train_tokens_per_second": 6851.472
},
{
"epoch": 0.7384615384615385,
"grad_norm": 0.9094005817671243,
"learning_rate": 4.6839323653428693e-05,
"loss": 0.8154,
"num_input_tokens_seen": 787572544,
"step": 870,
"train_runtime": 114937.7188,
"train_tokens_per_second": 6852.168
},
{
"epoch": 0.7427055702917772,
"grad_norm": 1.0703160970060077,
"learning_rate": 4.677901272615149e-05,
"loss": 0.8013,
"num_input_tokens_seen": 791977152,
"step": 875,
"train_runtime": 115587.8811,
"train_tokens_per_second": 6851.732
},
{
"epoch": 0.7469496021220159,
"grad_norm": 0.749724050960587,
"learning_rate": 4.6718171398546136e-05,
"loss": 0.7849,
"num_input_tokens_seen": 796372864,
"step": 880,
"train_runtime": 116239.6888,
"train_tokens_per_second": 6851.127
},
{
"epoch": 0.7511936339522547,
"grad_norm": 0.8931070149936695,
"learning_rate": 4.6656801152326244e-05,
"loss": 0.7947,
"num_input_tokens_seen": 800903424,
"step": 885,
"train_runtime": 116882.2446,
"train_tokens_per_second": 6852.225
},
{
"epoch": 0.7511936339522547,
"eval_loss": 0.8023512363433838,
"eval_runtime": 1055.9576,
"eval_samples_per_second": 2.886,
"eval_steps_per_second": 0.091,
"num_input_tokens_seen": 800903424,
"step": 885
},
{
"epoch": 0.7554376657824934,
"grad_norm": 0.8326667759990252,
"learning_rate": 4.6594903482086605e-05,
"loss": 0.7865,
"num_input_tokens_seen": 805358784,
"step": 890,
"train_runtime": 118552.9366,
"train_tokens_per_second": 6793.242
},
{
"epoch": 0.7596816976127321,
"grad_norm": 0.8171872300696933,
"learning_rate": 4.653247989526675e-05,
"loss": 0.811,
"num_input_tokens_seen": 809917248,
"step": 895,
"train_runtime": 119241.0553,
"train_tokens_per_second": 6792.268
},
{
"epoch": 0.7639257294429708,
"grad_norm": 0.9066917659881738,
"learning_rate": 4.646953191211422e-05,
"loss": 0.7936,
"num_input_tokens_seen": 814265600,
"step": 900,
"train_runtime": 119851.6659,
"train_tokens_per_second": 6793.945
},
{
"epoch": 0.7681697612732096,
"grad_norm": 0.9338261305365108,
"learning_rate": 4.640606106564759e-05,
"loss": 0.8104,
"num_input_tokens_seen": 818678144,
"step": 905,
"train_runtime": 120471.0976,
"train_tokens_per_second": 6795.639
},
{
"epoch": 0.7724137931034483,
"grad_norm": 0.8719603501589874,
"learning_rate": 4.6342068901619115e-05,
"loss": 0.7913,
"num_input_tokens_seen": 823195328,
"step": 910,
"train_runtime": 121127.4303,
"train_tokens_per_second": 6796.11
},
{
"epoch": 0.776657824933687,
"grad_norm": 1.0213034444078577,
"learning_rate": 4.6277556978477063e-05,
"loss": 0.8081,
"num_input_tokens_seen": 827899840,
"step": 915,
"train_runtime": 121853.2081,
"train_tokens_per_second": 6794.239
},
{
"epoch": 0.7809018567639258,
"grad_norm": 1.0336867967280456,
"learning_rate": 4.6212526867327785e-05,
"loss": 0.8118,
"num_input_tokens_seen": 832394688,
"step": 920,
"train_runtime": 122480.5431,
"train_tokens_per_second": 6796.138
},
{
"epoch": 0.7851458885941645,
"grad_norm": 1.0694313116089975,
"learning_rate": 4.614698015189744e-05,
"loss": 0.8067,
"num_input_tokens_seen": 837091072,
"step": 925,
"train_runtime": 123144.408,
"train_tokens_per_second": 6797.638
},
{
"epoch": 0.7893899204244031,
"grad_norm": 0.9268002608848666,
"learning_rate": 4.6080918428493447e-05,
"loss": 0.7948,
"num_input_tokens_seen": 841442112,
"step": 930,
"train_runtime": 123780.7152,
"train_tokens_per_second": 6797.845
},
{
"epoch": 0.793633952254642,
"grad_norm": 0.9823980696238214,
"learning_rate": 4.601434330596557e-05,
"loss": 0.7926,
"num_input_tokens_seen": 845885504,
"step": 935,
"train_runtime": 124463.32,
"train_tokens_per_second": 6796.263
},
{
"epoch": 0.7978779840848806,
"grad_norm": 1.0766192496117053,
"learning_rate": 4.594725640566679e-05,
"loss": 0.8019,
"num_input_tokens_seen": 850419456,
"step": 940,
"train_runtime": 125130.694,
"train_tokens_per_second": 6796.25
},
{
"epoch": 0.8021220159151193,
"grad_norm": 1.0293248834632993,
"learning_rate": 4.5879659361413754e-05,
"loss": 0.8065,
"num_input_tokens_seen": 854885120,
"step": 945,
"train_runtime": 125769.8721,
"train_tokens_per_second": 6797.217
},
{
"epoch": 0.8063660477453581,
"grad_norm": 1.0089948918046479,
"learning_rate": 4.581155381944705e-05,
"loss": 0.8084,
"num_input_tokens_seen": 859518400,
"step": 950,
"train_runtime": 126468.1103,
"train_tokens_per_second": 6796.325
},
{
"epoch": 0.8106100795755968,
"grad_norm": 0.9051441031845476,
"learning_rate": 4.574294143839107e-05,
"loss": 0.7832,
"num_input_tokens_seen": 863890816,
"step": 955,
"train_runtime": 127085.4612,
"train_tokens_per_second": 6797.716
},
{
"epoch": 0.8148541114058355,
"grad_norm": 0.8826069009760195,
"learning_rate": 4.567382388921363e-05,
"loss": 0.8055,
"num_input_tokens_seen": 868430208,
"step": 960,
"train_runtime": 127753.0201,
"train_tokens_per_second": 6797.727
},
{
"epoch": 0.8190981432360742,
"grad_norm": 0.9189235994594633,
"learning_rate": 4.560420285518529e-05,
"loss": 0.8076,
"num_input_tokens_seen": 873261376,
"step": 965,
"train_runtime": 128445.48,
"train_tokens_per_second": 6798.693
},
{
"epoch": 0.823342175066313,
"grad_norm": 0.6649154972034895,
"learning_rate": 4.5534080031838336e-05,
"loss": 0.8748,
"num_input_tokens_seen": 877848320,
"step": 970,
"train_runtime": 129117.949,
"train_tokens_per_second": 6798.809
},
{
"epoch": 0.8275862068965517,
"grad_norm": 0.7645668184363056,
"learning_rate": 4.5463457126925493e-05,
"loss": 0.7949,
"num_input_tokens_seen": 882236288,
"step": 975,
"train_runtime": 129732.9954,
"train_tokens_per_second": 6800.4
},
{
"epoch": 0.8318302387267904,
"grad_norm": 0.9335431624509639,
"learning_rate": 4.539233586037836e-05,
"loss": 0.7904,
"num_input_tokens_seen": 886710592,
"step": 980,
"train_runtime": 130375.8011,
"train_tokens_per_second": 6801.19
},
{
"epoch": 0.8360742705570292,
"grad_norm": 0.8454411533111347,
"learning_rate": 4.532071796426549e-05,
"loss": 0.7944,
"num_input_tokens_seen": 891233088,
"step": 985,
"train_runtime": 131026.9147,
"train_tokens_per_second": 6801.909
},
{
"epoch": 0.8403183023872679,
"grad_norm": 0.9294276165792987,
"learning_rate": 4.5248605182750224e-05,
"loss": 0.8052,
"num_input_tokens_seen": 895797056,
"step": 990,
"train_runtime": 131628.0839,
"train_tokens_per_second": 6805.516
},
{
"epoch": 0.8445623342175066,
"grad_norm": 1.0688757710466947,
"learning_rate": 4.5175999272048205e-05,
"loss": 0.7871,
"num_input_tokens_seen": 900477248,
"step": 995,
"train_runtime": 132307.0252,
"train_tokens_per_second": 6805.967
},
{
"epoch": 0.8488063660477454,
"grad_norm": 0.8274182681903438,
"learning_rate": 4.510290200038463e-05,
"loss": 0.8022,
"num_input_tokens_seen": 905019392,
"step": 1000,
"train_runtime": 132955.1592,
"train_tokens_per_second": 6806.952
},
{
"epoch": 0.8530503978779841,
"grad_norm": 1.0546997932680735,
"learning_rate": 4.502931514795116e-05,
"loss": 0.7817,
"num_input_tokens_seen": 909356352,
"step": 1005,
"train_runtime": 133539.5693,
"train_tokens_per_second": 6809.64
},
{
"epoch": 0.8572944297082228,
"grad_norm": 1.2588102790900502,
"learning_rate": 4.495524050686257e-05,
"loss": 0.788,
"num_input_tokens_seen": 913771904,
"step": 1010,
"train_runtime": 134129.5941,
"train_tokens_per_second": 6812.605
},
{
"epoch": 0.8615384615384616,
"grad_norm": 1.1408803201963393,
"learning_rate": 4.488067988111313e-05,
"loss": 0.8001,
"num_input_tokens_seen": 918194944,
"step": 1015,
"train_runtime": 134742.2212,
"train_tokens_per_second": 6814.456
},
{
"epoch": 0.8657824933687003,
"grad_norm": 1.1899291730699249,
"learning_rate": 4.480563508653264e-05,
"loss": 0.7955,
"num_input_tokens_seen": 922666688,
"step": 1020,
"train_runtime": 135362.6791,
"train_tokens_per_second": 6816.256
},
{
"epoch": 0.870026525198939,
"grad_norm": 0.8130281973510006,
"learning_rate": 4.473010795074221e-05,
"loss": 0.7979,
"num_input_tokens_seen": 927156672,
"step": 1025,
"train_runtime": 136033.28,
"train_tokens_per_second": 6815.661
},
{
"epoch": 0.8742705570291777,
"grad_norm": 0.7570121360764169,
"learning_rate": 4.465410031310979e-05,
"loss": 0.8073,
"num_input_tokens_seen": 931890368,
"step": 1030,
"train_runtime": 136755.15,
"train_tokens_per_second": 6814.298
},
{
"epoch": 0.8785145888594165,
"grad_norm": 0.9159403793732956,
"learning_rate": 4.457761402470532e-05,
"loss": 0.8,
"num_input_tokens_seen": 936323264,
"step": 1035,
"train_runtime": 137383.018,
"train_tokens_per_second": 6815.422
},
{
"epoch": 0.8827586206896552,
"grad_norm": 1.0377351854287133,
"learning_rate": 4.450065094825567e-05,
"loss": 0.801,
"num_input_tokens_seen": 940907840,
"step": 1040,
"train_runtime": 138076.6951,
"train_tokens_per_second": 6814.386
},
{
"epoch": 0.8870026525198939,
"grad_norm": 0.8502067435449593,
"learning_rate": 4.442321295809932e-05,
"loss": 0.7884,
"num_input_tokens_seen": 945377920,
"step": 1045,
"train_runtime": 138693.9405,
"train_tokens_per_second": 6816.289
},
{
"epoch": 0.8912466843501327,
"grad_norm": 1.0008412143003285,
"learning_rate": 4.4345301940140625e-05,
"loss": 0.794,
"num_input_tokens_seen": 949653760,
"step": 1050,
"train_runtime": 139252.8781,
"train_tokens_per_second": 6819.635
},
{
"epoch": 0.8954907161803713,
"grad_norm": 0.7701062680223436,
"learning_rate": 4.426691979180395e-05,
"loss": 0.7879,
"num_input_tokens_seen": 953995840,
"step": 1055,
"train_runtime": 139848.6279,
"train_tokens_per_second": 6821.632
},
{
"epoch": 0.89973474801061,
"grad_norm": 0.7657676098694198,
"learning_rate": 4.4188068421987475e-05,
"loss": 0.78,
"num_input_tokens_seen": 958380160,
"step": 1060,
"train_runtime": 140471.8443,
"train_tokens_per_second": 6822.578
},
{
"epoch": 0.9039787798408488,
"grad_norm": 0.9443174140002766,
"learning_rate": 4.410874975101662e-05,
"loss": 0.7975,
"num_input_tokens_seen": 962938624,
"step": 1065,
"train_runtime": 141147.4618,
"train_tokens_per_second": 6822.217
},
{
"epoch": 0.9082228116710875,
"grad_norm": 1.0109837282388179,
"learning_rate": 4.402896571059738e-05,
"loss": 0.7979,
"num_input_tokens_seen": 967324608,
"step": 1070,
"train_runtime": 141768.6116,
"train_tokens_per_second": 6823.264
},
{
"epoch": 0.9124668435013262,
"grad_norm": 0.8120979404556385,
"learning_rate": 4.394871824376923e-05,
"loss": 0.7889,
"num_input_tokens_seen": 971853824,
"step": 1075,
"train_runtime": 142397.8849,
"train_tokens_per_second": 6824.918
},
{
"epoch": 0.916710875331565,
"grad_norm": 0.7674620546551851,
"learning_rate": 4.386800930485777e-05,
"loss": 0.7872,
"num_input_tokens_seen": 976342336,
"step": 1080,
"train_runtime": 143033.2803,
"train_tokens_per_second": 6825.98
},
{
"epoch": 0.9209549071618037,
"grad_norm": 0.8642492390279295,
"learning_rate": 4.378684085942722e-05,
"loss": 0.7968,
"num_input_tokens_seen": 980950016,
"step": 1085,
"train_runtime": 143727.3251,
"train_tokens_per_second": 6825.077
},
{
"epoch": 0.9251989389920424,
"grad_norm": 0.7868533581684258,
"learning_rate": 4.370521488423248e-05,
"loss": 0.7723,
"num_input_tokens_seen": 985579968,
"step": 1090,
"train_runtime": 144414.0117,
"train_tokens_per_second": 6824.684
},
{
"epoch": 0.9294429708222812,
"grad_norm": 0.9784190030448764,
"learning_rate": 4.3623133367171e-05,
"loss": 0.7657,
"num_input_tokens_seen": 990242240,
"step": 1095,
"train_runtime": 145104.4504,
"train_tokens_per_second": 6824.341
},
{
"epoch": 0.9336870026525199,
"grad_norm": 0.8416491217730794,
"learning_rate": 4.354059830723439e-05,
"loss": 0.7762,
"num_input_tokens_seen": 994700352,
"step": 1100,
"train_runtime": 145724.9371,
"train_tokens_per_second": 6825.876
},
{
"epoch": 0.9379310344827586,
"grad_norm": 0.8128690749017204,
"learning_rate": 4.34576117144597e-05,
"loss": 0.7872,
"num_input_tokens_seen": 999373568,
"step": 1105,
"train_runtime": 146429.9411,
"train_tokens_per_second": 6824.926
},
{
"epoch": 0.9421750663129973,
"grad_norm": 0.8795752491352558,
"learning_rate": 4.337417560988053e-05,
"loss": 0.7907,
"num_input_tokens_seen": 1003937216,
"step": 1110,
"train_runtime": 147068.6613,
"train_tokens_per_second": 6826.316
},
{
"epoch": 0.9464190981432361,
"grad_norm": 1.0803855481863844,
"learning_rate": 4.329029202547774e-05,
"loss": 0.7802,
"num_input_tokens_seen": 1008544768,
"step": 1115,
"train_runtime": 147756.5973,
"train_tokens_per_second": 6825.717
},
{
"epoch": 0.9506631299734748,
"grad_norm": 0.9249709902148427,
"learning_rate": 4.3205963004130016e-05,
"loss": 0.7835,
"num_input_tokens_seen": 1013050048,
"step": 1120,
"train_runtime": 148402.3493,
"train_tokens_per_second": 6826.375
},
{
"epoch": 0.9549071618037135,
"grad_norm": 0.8556253260160931,
"learning_rate": 4.3121190599564075e-05,
"loss": 0.7797,
"num_input_tokens_seen": 1017459840,
"step": 1125,
"train_runtime": 149027.5856,
"train_tokens_per_second": 6827.326
},
{
"epoch": 0.9591511936339523,
"grad_norm": 1.0166848790525587,
"learning_rate": 4.30359768763047e-05,
"loss": 0.7676,
"num_input_tokens_seen": 1022257088,
"step": 1130,
"train_runtime": 149712.8732,
"train_tokens_per_second": 6828.117
},
{
"epoch": 0.963395225464191,
"grad_norm": 0.9097894747416901,
"learning_rate": 4.2950323909624404e-05,
"loss": 0.7736,
"num_input_tokens_seen": 1026797248,
"step": 1135,
"train_runtime": 150374.5156,
"train_tokens_per_second": 6828.266
},
{
"epoch": 0.9676392572944297,
"grad_norm": 0.7704218910218441,
"learning_rate": 4.286423378549294e-05,
"loss": 0.7899,
"num_input_tokens_seen": 1031489344,
"step": 1140,
"train_runtime": 151058.329,
"train_tokens_per_second": 6828.418
},
{
"epoch": 0.9718832891246685,
"grad_norm": 0.7852167446402514,
"learning_rate": 4.2777708600526475e-05,
"loss": 0.7825,
"num_input_tokens_seen": 1035924096,
"step": 1145,
"train_runtime": 151677.6959,
"train_tokens_per_second": 6829.772
},
{
"epoch": 0.9761273209549072,
"grad_norm": 0.8709541370781978,
"learning_rate": 4.269075046193651e-05,
"loss": 0.7853,
"num_input_tokens_seen": 1040557120,
"step": 1150,
"train_runtime": 152331.7174,
"train_tokens_per_second": 6830.863
},
{
"epoch": 0.9803713527851459,
"grad_norm": 0.7354772314266763,
"learning_rate": 4.2603361487478635e-05,
"loss": 0.7796,
"num_input_tokens_seen": 1045138240,
"step": 1155,
"train_runtime": 153014.033,
"train_tokens_per_second": 6830.342
},
{
"epoch": 0.9846153846153847,
"grad_norm": 0.8929842185397294,
"learning_rate": 4.2515543805400845e-05,
"loss": 0.7931,
"num_input_tokens_seen": 1049637440,
"step": 1160,
"train_runtime": 153673.9541,
"train_tokens_per_second": 6830.288
},
{
"epoch": 0.9888594164456234,
"grad_norm": 0.8843306924811901,
"learning_rate": 4.2427299554391795e-05,
"loss": 0.7818,
"num_input_tokens_seen": 1054084800,
"step": 1165,
"train_runtime": 154301.843,
"train_tokens_per_second": 6831.317
},
{
"epoch": 0.993103448275862,
"grad_norm": 0.7260294542114571,
"learning_rate": 4.2338630883528694e-05,
"loss": 0.7868,
"num_input_tokens_seen": 1058576128,
"step": 1170,
"train_runtime": 154951.0655,
"train_tokens_per_second": 6831.68
},
{
"epoch": 0.9973474801061007,
"grad_norm": 1.0310167390412928,
"learning_rate": 4.224953995222495e-05,
"loss": 0.7913,
"num_input_tokens_seen": 1063234944,
"step": 1175,
"train_runtime": 155625.4902,
"train_tokens_per_second": 6832.01
},
{
"epoch": 1.0008488063660477,
"grad_norm": 0.9546542980993455,
"learning_rate": 4.2160028930177586e-05,
"loss": 0.6435,
"num_input_tokens_seen": 1066978304,
"step": 1180,
"train_runtime": 156195.3002,
"train_tokens_per_second": 6831.053
},
{
"epoch": 1.0008488063660477,
"eval_loss": 0.7810727953910828,
"eval_runtime": 1060.3774,
"eval_samples_per_second": 2.874,
"eval_steps_per_second": 0.091,
"num_input_tokens_seen": 1066978304,
"step": 1180
},
{
"epoch": 1.0050928381962865,
"grad_norm": 1.0559718016156323,
"learning_rate": 4.207009999731441e-05,
"loss": 0.724,
"num_input_tokens_seen": 1071397376,
"step": 1185,
"train_runtime": 157954.8381,
"train_tokens_per_second": 6782.935
},
{
"epoch": 1.0093368700265253,
"grad_norm": 0.8356946223535584,
"learning_rate": 4.1979755343740936e-05,
"loss": 0.7198,
"num_input_tokens_seen": 1075925056,
"step": 1190,
"train_runtime": 158621.7481,
"train_tokens_per_second": 6782.961
},
{
"epoch": 1.0135809018567639,
"grad_norm": 0.9532551565257256,
"learning_rate": 4.188899716968699e-05,
"loss": 0.7137,
"num_input_tokens_seen": 1080308416,
"step": 1195,
"train_runtime": 159243.7038,
"train_tokens_per_second": 6783.995
},
{
"epoch": 1.0178249336870027,
"grad_norm": 0.886163908743495,
"learning_rate": 4.179782768545321e-05,
"loss": 0.6985,
"num_input_tokens_seen": 1084861632,
"step": 1200,
"train_runtime": 159917.1941,
"train_tokens_per_second": 6783.896
},
{
"epoch": 1.0220689655172415,
"grad_norm": 0.8306981011679802,
"learning_rate": 4.170624911135713e-05,
"loss": 0.7246,
"num_input_tokens_seen": 1089273600,
"step": 1205,
"train_runtime": 160556.71,
"train_tokens_per_second": 6784.354
},
{
"epoch": 1.02631299734748,
"grad_norm": 0.9829055718597421,
"learning_rate": 4.161426367767921e-05,
"loss": 0.7219,
"num_input_tokens_seen": 1093625472,
"step": 1210,
"train_runtime": 161182.2827,
"train_tokens_per_second": 6785.023
},
{
"epoch": 1.0305570291777189,
"grad_norm": 0.9263411832733288,
"learning_rate": 4.1521873624608396e-05,
"loss": 0.7293,
"num_input_tokens_seen": 1098036992,
"step": 1215,
"train_runtime": 161813.3681,
"train_tokens_per_second": 6785.824
},
{
"epoch": 1.0348010610079577,
"grad_norm": 1.3238936418341176,
"learning_rate": 4.1429081202187667e-05,
"loss": 0.7214,
"num_input_tokens_seen": 1102560128,
"step": 1220,
"train_runtime": 162471.5936,
"train_tokens_per_second": 6786.172
},
{
"epoch": 1.0390450928381962,
"grad_norm": 0.822682923114858,
"learning_rate": 4.1335888670259196e-05,
"loss": 0.704,
"num_input_tokens_seen": 1107129920,
"step": 1225,
"train_runtime": 163123.1296,
"train_tokens_per_second": 6787.081
},
{
"epoch": 1.043289124668435,
"grad_norm": 0.9140492461674421,
"learning_rate": 4.12422982984093e-05,
"loss": 0.7199,
"num_input_tokens_seen": 1111822080,
"step": 1230,
"train_runtime": 163825.6741,
"train_tokens_per_second": 6786.617
},
{
"epoch": 1.0475331564986738,
"grad_norm": 0.8760621488535013,
"learning_rate": 4.11483123659132e-05,
"loss": 0.7055,
"num_input_tokens_seen": 1116287808,
"step": 1235,
"train_runtime": 164439.6584,
"train_tokens_per_second": 6788.434
},
{
"epoch": 1.0517771883289124,
"grad_norm": 0.8280642864840718,
"learning_rate": 4.1053933161679494e-05,
"loss": 0.7235,
"num_input_tokens_seen": 1120773120,
"step": 1240,
"train_runtime": 165072.1322,
"train_tokens_per_second": 6789.596
},
{
"epoch": 1.0560212201591512,
"grad_norm": 0.9389411578202482,
"learning_rate": 4.095916298419441e-05,
"loss": 0.7058,
"num_input_tokens_seen": 1125450432,
"step": 1245,
"train_runtime": 165746.363,
"train_tokens_per_second": 6790.197
},
{
"epoch": 1.06026525198939,
"grad_norm": 0.8674975290576171,
"learning_rate": 4.0864004141465844e-05,
"loss": 0.7144,
"num_input_tokens_seen": 1129770880,
"step": 1250,
"train_runtime": 166381.7549,
"train_tokens_per_second": 6790.233
},
{
"epoch": 1.0645092838196286,
"grad_norm": 0.8423637185173262,
"learning_rate": 4.0768458950967135e-05,
"loss": 0.6924,
"num_input_tokens_seen": 1134325824,
"step": 1255,
"train_runtime": 167041.841,
"train_tokens_per_second": 6790.669
},
{
"epoch": 1.0687533156498674,
"grad_norm": 0.8463731702794449,
"learning_rate": 4.067252973958064e-05,
"loss": 0.7022,
"num_input_tokens_seen": 1138890880,
"step": 1260,
"train_runtime": 167698.4028,
"train_tokens_per_second": 6791.304
},
{
"epoch": 1.072997347480106,
"grad_norm": 1.2814601777565326,
"learning_rate": 4.0576218843541046e-05,
"loss": 0.7244,
"num_input_tokens_seen": 1143446080,
"step": 1265,
"train_runtime": 168389.1765,
"train_tokens_per_second": 6790.496
},
{
"epoch": 1.0772413793103448,
"grad_norm": 0.7804384260432371,
"learning_rate": 4.0479528608378515e-05,
"loss": 0.7118,
"num_input_tokens_seen": 1148022848,
"step": 1270,
"train_runtime": 169068.4692,
"train_tokens_per_second": 6790.284
},
{
"epoch": 1.0814854111405836,
"grad_norm": 0.8440032711255191,
"learning_rate": 4.0382461388861505e-05,
"loss": 0.7069,
"num_input_tokens_seen": 1152678080,
"step": 1275,
"train_runtime": 169749.9181,
"train_tokens_per_second": 6790.449
},
{
"epoch": 1.0857294429708222,
"grad_norm": 1.0462601786941845,
"learning_rate": 4.0285019548939464e-05,
"loss": 0.7009,
"num_input_tokens_seen": 1157385088,
"step": 1280,
"train_runtime": 170412.1829,
"train_tokens_per_second": 6791.68
},
{
"epoch": 1.089973474801061,
"grad_norm": 0.7949896906145127,
"learning_rate": 4.018720546168524e-05,
"loss": 0.714,
"num_input_tokens_seen": 1161843200,
"step": 1285,
"train_runtime": 171052.7364,
"train_tokens_per_second": 6792.31
},
{
"epoch": 1.0942175066312998,
"grad_norm": 0.8723394657733861,
"learning_rate": 4.008902150923731e-05,
"loss": 0.7173,
"num_input_tokens_seen": 1166275008,
"step": 1290,
"train_runtime": 171633.6694,
"train_tokens_per_second": 6795.141
},
{
"epoch": 1.0984615384615384,
"grad_norm": 1.271199939246554,
"learning_rate": 3.999047008274173e-05,
"loss": 0.718,
"num_input_tokens_seen": 1170805952,
"step": 1295,
"train_runtime": 172293.1957,
"train_tokens_per_second": 6795.428
},
{
"epoch": 1.1027055702917772,
"grad_norm": 0.7123662125520419,
"learning_rate": 3.989155358229394e-05,
"loss": 0.7326,
"num_input_tokens_seen": 1175398720,
"step": 1300,
"train_runtime": 172948.1235,
"train_tokens_per_second": 6796.25
},
{
"epoch": 1.106949602122016,
"grad_norm": 0.838546216114602,
"learning_rate": 3.979227441688028e-05,
"loss": 0.7096,
"num_input_tokens_seen": 1179790336,
"step": 1305,
"train_runtime": 173532.3842,
"train_tokens_per_second": 6798.675
},
{
"epoch": 1.1111936339522546,
"grad_norm": 1.0578594741413954,
"learning_rate": 3.969263500431935e-05,
"loss": 0.736,
"num_input_tokens_seen": 1184330304,
"step": 1310,
"train_runtime": 174200.2849,
"train_tokens_per_second": 6798.67
},
{
"epoch": 1.1154376657824934,
"grad_norm": 1.0727364156307728,
"learning_rate": 3.9592637771203114e-05,
"loss": 0.7149,
"num_input_tokens_seen": 1188880384,
"step": 1315,
"train_runtime": 174872.9271,
"train_tokens_per_second": 6798.539
},
{
"epoch": 1.1196816976127322,
"grad_norm": 1.4213471650077618,
"learning_rate": 3.949228515283777e-05,
"loss": 0.7044,
"num_input_tokens_seen": 1193170816,
"step": 1320,
"train_runtime": 175486.1393,
"train_tokens_per_second": 6799.231
},
{
"epoch": 1.1239257294429708,
"grad_norm": 0.9528308642512087,
"learning_rate": 3.9391579593184525e-05,
"loss": 0.7046,
"num_input_tokens_seen": 1197641344,
"step": 1325,
"train_runtime": 176147.7116,
"train_tokens_per_second": 6799.074
},
{
"epoch": 1.1281697612732096,
"grad_norm": 0.8240250587692688,
"learning_rate": 3.929052354479999e-05,
"loss": 0.7073,
"num_input_tokens_seen": 1202042432,
"step": 1330,
"train_runtime": 176809.4403,
"train_tokens_per_second": 6798.52
},
{
"epoch": 1.1324137931034484,
"grad_norm": 0.9575820680101333,
"learning_rate": 3.918911946877651e-05,
"loss": 0.7123,
"num_input_tokens_seen": 1206438080,
"step": 1335,
"train_runtime": 177399.569,
"train_tokens_per_second": 6800.682
},
{
"epoch": 1.136657824933687,
"grad_norm": 1.0494568271450921,
"learning_rate": 3.908736983468219e-05,
"loss": 0.7037,
"num_input_tokens_seen": 1211039616,
"step": 1340,
"train_runtime": 178060.7312,
"train_tokens_per_second": 6801.273
},
{
"epoch": 1.1409018567639257,
"grad_norm": 1.3953016773438447,
"learning_rate": 3.898527712050074e-05,
"loss": 0.6992,
"num_input_tokens_seen": 1215405568,
"step": 1345,
"train_runtime": 178700.2333,
"train_tokens_per_second": 6801.365
},
{
"epoch": 1.1451458885941646,
"grad_norm": 0.9502009725403173,
"learning_rate": 3.88828438125712e-05,
"loss": 0.7273,
"num_input_tokens_seen": 1220023168,
"step": 1350,
"train_runtime": 179354.7367,
"train_tokens_per_second": 6802.291
},
{
"epoch": 1.1493899204244031,
"grad_norm": 1.1556987622554753,
"learning_rate": 3.878007240552732e-05,
"loss": 0.6946,
"num_input_tokens_seen": 1224574464,
"step": 1355,
"train_runtime": 180001.4614,
"train_tokens_per_second": 6803.136
},
{
"epoch": 1.153633952254642,
"grad_norm": 0.792466643427966,
"learning_rate": 3.867696540223681e-05,
"loss": 0.708,
"num_input_tokens_seen": 1229115520,
"step": 1360,
"train_runtime": 180638.8559,
"train_tokens_per_second": 6804.27
},
{
"epoch": 1.1578779840848807,
"grad_norm": 0.9099009827745455,
"learning_rate": 3.8573525313740435e-05,
"loss": 0.7198,
"num_input_tokens_seen": 1233652160,
"step": 1365,
"train_runtime": 181281.2594,
"train_tokens_per_second": 6805.183
},
{
"epoch": 1.1621220159151193,
"grad_norm": 0.8873710969043223,
"learning_rate": 3.846975465919079e-05,
"loss": 0.7047,
"num_input_tokens_seen": 1238186112,
"step": 1370,
"train_runtime": 181914.6873,
"train_tokens_per_second": 6806.411
},
{
"epoch": 1.1663660477453581,
"grad_norm": 1.0769227041949128,
"learning_rate": 3.836565596579103e-05,
"loss": 0.7363,
"num_input_tokens_seen": 1242761728,
"step": 1375,
"train_runtime": 182587.2454,
"train_tokens_per_second": 6806.399
},
{
"epoch": 1.1706100795755967,
"grad_norm": 0.9446612826256684,
"learning_rate": 3.826123176873324e-05,
"loss": 0.7001,
"num_input_tokens_seen": 1247182656,
"step": 1380,
"train_runtime": 183248.8336,
"train_tokens_per_second": 6805.951
},
{
"epoch": 1.1748541114058355,
"grad_norm": 0.7930410617767145,
"learning_rate": 3.8156484611136774e-05,
"loss": 0.7121,
"num_input_tokens_seen": 1251653056,
"step": 1385,
"train_runtime": 183867.2318,
"train_tokens_per_second": 6807.374
},
{
"epoch": 1.1790981432360743,
"grad_norm": 1.2141311456918997,
"learning_rate": 3.805141704398626e-05,
"loss": 0.7085,
"num_input_tokens_seen": 1256043584,
"step": 1390,
"train_runtime": 184507.3826,
"train_tokens_per_second": 6807.552
},
{
"epoch": 1.1833421750663131,
"grad_norm": 0.8727742652201835,
"learning_rate": 3.794603162606949e-05,
"loss": 0.7021,
"num_input_tokens_seen": 1260434688,
"step": 1395,
"train_runtime": 185139.6011,
"train_tokens_per_second": 6808.023
},
{
"epoch": 1.1875862068965517,
"grad_norm": 0.7037237894630392,
"learning_rate": 3.784033092391513e-05,
"loss": 0.732,
"num_input_tokens_seen": 1264932736,
"step": 1400,
"train_runtime": 185754.0941,
"train_tokens_per_second": 6809.717
},
{
"epoch": 1.1918302387267905,
"grad_norm": 0.8398911891859556,
"learning_rate": 3.773431751173018e-05,
"loss": 0.7254,
"num_input_tokens_seen": 1269425664,
"step": 1405,
"train_runtime": 186452.0318,
"train_tokens_per_second": 6808.323
},
{
"epoch": 1.196074270557029,
"grad_norm": 1.046439278032978,
"learning_rate": 3.76279939713373e-05,
"loss": 0.7034,
"num_input_tokens_seen": 1273725056,
"step": 1410,
"train_runtime": 187095.6204,
"train_tokens_per_second": 6807.883
},
{
"epoch": 1.2003183023872679,
"grad_norm": 0.7473279729847268,
"learning_rate": 3.7521362892111945e-05,
"loss": 0.7002,
"num_input_tokens_seen": 1278142592,
"step": 1415,
"train_runtime": 187703.911,
"train_tokens_per_second": 6809.355
},
{
"epoch": 1.2045623342175067,
"grad_norm": 1.0374275407130875,
"learning_rate": 3.741442687091926e-05,
"loss": 0.7204,
"num_input_tokens_seen": 1282692032,
"step": 1420,
"train_runtime": 188345.3351,
"train_tokens_per_second": 6810.32
},
{
"epoch": 1.2088063660477453,
"grad_norm": 0.7440268721036309,
"learning_rate": 3.730718851205089e-05,
"loss": 0.7114,
"num_input_tokens_seen": 1287034560,
"step": 1425,
"train_runtime": 188944.1141,
"train_tokens_per_second": 6811.721
},
{
"epoch": 1.213050397877984,
"grad_norm": 0.776479873397123,
"learning_rate": 3.719965042716154e-05,
"loss": 0.7081,
"num_input_tokens_seen": 1291460416,
"step": 1430,
"train_runtime": 189556.4755,
"train_tokens_per_second": 6813.064
},
{
"epoch": 1.2172944297082229,
"grad_norm": 0.7404149511150007,
"learning_rate": 3.709181523520532e-05,
"loss": 0.7022,
"num_input_tokens_seen": 1296144576,
"step": 1435,
"train_runtime": 190275.4375,
"train_tokens_per_second": 6811.938
},
{
"epoch": 1.2215384615384615,
"grad_norm": 0.7950074266734066,
"learning_rate": 3.698368556237206e-05,
"loss": 0.7245,
"num_input_tokens_seen": 1300612352,
"step": 1440,
"train_runtime": 190917.0602,
"train_tokens_per_second": 6812.447
},
{
"epoch": 1.2257824933687003,
"grad_norm": 0.8330406088802553,
"learning_rate": 3.687526404202326e-05,
"loss": 0.6876,
"num_input_tokens_seen": 1305227776,
"step": 1445,
"train_runtime": 191618.6772,
"train_tokens_per_second": 6811.59
},
{
"epoch": 1.230026525198939,
"grad_norm": 0.7986919467015979,
"learning_rate": 3.6766553314628016e-05,
"loss": 0.6882,
"num_input_tokens_seen": 1309815616,
"step": 1450,
"train_runtime": 192268.2878,
"train_tokens_per_second": 6812.437
},
{
"epoch": 1.2342705570291777,
"grad_norm": 0.8194811456779011,
"learning_rate": 3.66575560276987e-05,
"loss": 0.6992,
"num_input_tokens_seen": 1314293440,
"step": 1455,
"train_runtime": 192911.152,
"train_tokens_per_second": 6812.947
},
{
"epoch": 1.2385145888594165,
"grad_norm": 0.8920784814801206,
"learning_rate": 3.654827483572647e-05,
"loss": 0.7034,
"num_input_tokens_seen": 1318678784,
"step": 1460,
"train_runtime": 193534.093,
"train_tokens_per_second": 6813.677
},
{
"epoch": 1.2427586206896553,
"grad_norm": 0.7947592511270455,
"learning_rate": 3.6438712400116626e-05,
"loss": 0.7277,
"num_input_tokens_seen": 1323485248,
"step": 1465,
"train_runtime": 194275.8066,
"train_tokens_per_second": 6812.404
},
{
"epoch": 1.2470026525198938,
"grad_norm": 0.9177270797271538,
"learning_rate": 3.6328871389123817e-05,
"loss": 0.7177,
"num_input_tokens_seen": 1327989184,
"step": 1470,
"train_runtime": 194920.646,
"train_tokens_per_second": 6812.973
},
{
"epoch": 1.2512466843501326,
"grad_norm": 0.9095775155270419,
"learning_rate": 3.6218754477787034e-05,
"loss": 0.69,
"num_input_tokens_seen": 1332484288,
"step": 1475,
"train_runtime": 195594.2663,
"train_tokens_per_second": 6812.492
},
{
"epoch": 1.2512466843501326,
"eval_loss": 0.7701402306556702,
"eval_runtime": 1058.7996,
"eval_samples_per_second": 2.878,
"eval_steps_per_second": 0.091,
"num_input_tokens_seen": 1332484288,
"step": 1475
},
{
"epoch": 1.2554907161803714,
"grad_norm": 1.0025427441776165,
"learning_rate": 3.610836434786448e-05,
"loss": 0.7226,
"num_input_tokens_seen": 1336834944,
"step": 1480,
"train_runtime": 197298.3845,
"train_tokens_per_second": 6775.701
},
{
"epoch": 1.25973474801061,
"grad_norm": 1.1051405618486907,
"learning_rate": 3.599770368776824e-05,
"loss": 0.717,
"num_input_tokens_seen": 1341307904,
"step": 1485,
"train_runtime": 197946.5126,
"train_tokens_per_second": 6776.113
},
{
"epoch": 1.2639787798408488,
"grad_norm": 0.7277747101019799,
"learning_rate": 3.588677519249883e-05,
"loss": 0.7129,
"num_input_tokens_seen": 1345945600,
"step": 1490,
"train_runtime": 198583.5461,
"train_tokens_per_second": 6777.73
},
{
"epoch": 1.2682228116710874,
"grad_norm": 1.0644077159168257,
"learning_rate": 3.577558156357954e-05,
"loss": 0.6964,
"num_input_tokens_seen": 1350721856,
"step": 1495,
"train_runtime": 199281.1546,
"train_tokens_per_second": 6777.971
},
{
"epoch": 1.2724668435013262,
"grad_norm": 0.7336344093486956,
"learning_rate": 3.566412550899067e-05,
"loss": 0.7085,
"num_input_tokens_seen": 1355297856,
"step": 1500,
"train_runtime": 199944.5213,
"train_tokens_per_second": 6778.37
},
{
"epoch": 1.276710875331565,
"grad_norm": 0.8133225375902521,
"learning_rate": 3.5552409743103556e-05,
"loss": 0.6867,
"num_input_tokens_seen": 1359679104,
"step": 1505,
"train_runtime": 200577.4067,
"train_tokens_per_second": 6778.825
},
{
"epoch": 1.2809549071618038,
"grad_norm": 0.697351763291778,
"learning_rate": 3.5440436986614475e-05,
"loss": 0.6982,
"num_input_tokens_seen": 1364232960,
"step": 1510,
"train_runtime": 201233.0814,
"train_tokens_per_second": 6779.367
},
{
"epoch": 1.2851989389920424,
"grad_norm": 0.7278903758794186,
"learning_rate": 3.53282099664784e-05,
"loss": 0.6908,
"num_input_tokens_seen": 1368631360,
"step": 1515,
"train_runtime": 201880.3686,
"train_tokens_per_second": 6779.418
},
{
"epoch": 1.2894429708222812,
"grad_norm": 0.8347309141959418,
"learning_rate": 3.521573141584254e-05,
"loss": 0.6901,
"num_input_tokens_seen": 1373082240,
"step": 1520,
"train_runtime": 202488.6455,
"train_tokens_per_second": 6781.033
},
{
"epoch": 1.2936870026525198,
"grad_norm": 0.8283488230820615,
"learning_rate": 3.5103004073979854e-05,
"loss": 0.6807,
"num_input_tokens_seen": 1377433792,
"step": 1525,
"train_runtime": 203113.9684,
"train_tokens_per_second": 6781.581
},
{
"epoch": 1.2979310344827586,
"grad_norm": 0.7419000730982289,
"learning_rate": 3.499003068622226e-05,
"loss": 0.6995,
"num_input_tokens_seen": 1382143808,
"step": 1530,
"train_runtime": 203848.007,
"train_tokens_per_second": 6780.266
},
{
"epoch": 1.3021750663129974,
"grad_norm": 0.8544362640648647,
"learning_rate": 3.487681400389384e-05,
"loss": 0.6932,
"num_input_tokens_seen": 1386581504,
"step": 1535,
"train_runtime": 204454.9055,
"train_tokens_per_second": 6781.845
},
{
"epoch": 1.3064190981432362,
"grad_norm": 0.7409589690489501,
"learning_rate": 3.4763356784243784e-05,
"loss": 0.6943,
"num_input_tokens_seen": 1391187968,
"step": 1540,
"train_runtime": 205112.5105,
"train_tokens_per_second": 6782.56
},
{
"epoch": 1.3106631299734748,
"grad_norm": 0.8626742457912966,
"learning_rate": 3.4649661790379285e-05,
"loss": 0.6894,
"num_input_tokens_seen": 1395756992,
"step": 1545,
"train_runtime": 205751.9562,
"train_tokens_per_second": 6783.688
},
{
"epoch": 1.3149071618037136,
"grad_norm": 0.9751075736447785,
"learning_rate": 3.453573179119821e-05,
"loss": 0.6919,
"num_input_tokens_seen": 1400210880,
"step": 1550,
"train_runtime": 206360.6402,
"train_tokens_per_second": 6785.261
},
{
"epoch": 1.3191511936339522,
"grad_norm": 0.839491541858112,
"learning_rate": 3.4421569561321705e-05,
"loss": 0.7069,
"num_input_tokens_seen": 1404544896,
"step": 1555,
"train_runtime": 206988.7608,
"train_tokens_per_second": 6785.609
},
{
"epoch": 1.323395225464191,
"grad_norm": 0.7686049267619298,
"learning_rate": 3.4307177881026574e-05,
"loss": 0.7102,
"num_input_tokens_seen": 1409135360,
"step": 1560,
"train_runtime": 207645.0724,
"train_tokens_per_second": 6786.269
},
{
"epoch": 1.3276392572944298,
"grad_norm": 0.7897081067547944,
"learning_rate": 3.419255953617762e-05,
"loss": 0.7095,
"num_input_tokens_seen": 1413494272,
"step": 1565,
"train_runtime": 208306.3313,
"train_tokens_per_second": 6785.652
},
{
"epoch": 1.3318832891246684,
"grad_norm": 0.9461779222467549,
"learning_rate": 3.407771731815975e-05,
"loss": 0.685,
"num_input_tokens_seen": 1417846272,
"step": 1570,
"train_runtime": 208904.186,
"train_tokens_per_second": 6787.065
},
{
"epoch": 1.3361273209549072,
"grad_norm": 0.8934996824542671,
"learning_rate": 3.3962654023810056e-05,
"loss": 0.6814,
"num_input_tokens_seen": 1422340672,
"step": 1575,
"train_runtime": 209567.9077,
"train_tokens_per_second": 6787.016
},
{
"epoch": 1.340371352785146,
"grad_norm": 0.8661495630419888,
"learning_rate": 3.384737245534962e-05,
"loss": 0.7181,
"num_input_tokens_seen": 1426955904,
"step": 1580,
"train_runtime": 210237.9687,
"train_tokens_per_second": 6787.337
},
{
"epoch": 1.3446153846153845,
"grad_norm": 0.8719974075076845,
"learning_rate": 3.373187542031534e-05,
"loss": 0.6959,
"num_input_tokens_seen": 1431489088,
"step": 1585,
"train_runtime": 210873.0161,
"train_tokens_per_second": 6788.394
},
{
"epoch": 1.3488594164456233,
"grad_norm": 1.1626368748489013,
"learning_rate": 3.361616573149153e-05,
"loss": 0.6832,
"num_input_tokens_seen": 1435997504,
"step": 1590,
"train_runtime": 211532.587,
"train_tokens_per_second": 6788.54
},
{
"epoch": 1.3531034482758622,
"grad_norm": 0.8538479857974732,
"learning_rate": 3.350024620684142e-05,
"loss": 0.7099,
"num_input_tokens_seen": 1440263232,
"step": 1595,
"train_runtime": 212121.5231,
"train_tokens_per_second": 6789.802
},
{
"epoch": 1.3573474801061007,
"grad_norm": 0.8056069140705914,
"learning_rate": 3.338411966943852e-05,
"loss": 0.69,
"num_input_tokens_seen": 1444712192,
"step": 1600,
"train_runtime": 212749.667,
"train_tokens_per_second": 6790.667
},
{
"epoch": 1.3615915119363395,
"grad_norm": 0.7748917218818391,
"learning_rate": 3.326778894739787e-05,
"loss": 0.7012,
"num_input_tokens_seen": 1449143872,
"step": 1605,
"train_runtime": 213375.0723,
"train_tokens_per_second": 6791.533
},
{
"epoch": 1.3658355437665781,
"grad_norm": 0.7420765913634999,
"learning_rate": 3.3151256873807166e-05,
"loss": 0.7044,
"num_input_tokens_seen": 1453720384,
"step": 1610,
"train_runtime": 214016.224,
"train_tokens_per_second": 6792.571
},
{
"epoch": 1.370079575596817,
"grad_norm": 0.8236976518371382,
"learning_rate": 3.3034526286657784e-05,
"loss": 0.6767,
"num_input_tokens_seen": 1458310144,
"step": 1615,
"train_runtime": 214691.52,
"train_tokens_per_second": 6792.584
},
{
"epoch": 1.3743236074270557,
"grad_norm": 1.234048869991443,
"learning_rate": 3.291760002877563e-05,
"loss": 0.694,
"num_input_tokens_seen": 1462804672,
"step": 1620,
"train_runtime": 215323.1611,
"train_tokens_per_second": 6793.531
},
{
"epoch": 1.3785676392572945,
"grad_norm": 1.1190238309471685,
"learning_rate": 3.280048094775194e-05,
"loss": 0.6912,
"num_input_tokens_seen": 1467354688,
"step": 1625,
"train_runtime": 215957.7438,
"train_tokens_per_second": 6794.638
},
{
"epoch": 1.3828116710875331,
"grad_norm": 0.7424932571208389,
"learning_rate": 3.268317189587389e-05,
"loss": 0.6772,
"num_input_tokens_seen": 1471765312,
"step": 1630,
"train_runtime": 216588.9755,
"train_tokens_per_second": 6795.2
},
{
"epoch": 1.387055702917772,
"grad_norm": 0.8756485744320969,
"learning_rate": 3.256567573005519e-05,
"loss": 0.7056,
"num_input_tokens_seen": 1476461312,
"step": 1635,
"train_runtime": 217239.6368,
"train_tokens_per_second": 6796.464
},
{
"epoch": 1.3912997347480105,
"grad_norm": 0.8539432184293334,
"learning_rate": 3.2447995311766426e-05,
"loss": 0.6921,
"num_input_tokens_seen": 1481070080,
"step": 1640,
"train_runtime": 217902.9575,
"train_tokens_per_second": 6796.925
},
{
"epoch": 1.3955437665782493,
"grad_norm": 0.9984531979139321,
"learning_rate": 3.233013350696547e-05,
"loss": 0.6788,
"num_input_tokens_seen": 1485426304,
"step": 1645,
"train_runtime": 218524.3978,
"train_tokens_per_second": 6797.531
},
{
"epoch": 1.399787798408488,
"grad_norm": 0.8306599513500448,
"learning_rate": 3.22120931860276e-05,
"loss": 0.7068,
"num_input_tokens_seen": 1489976064,
"step": 1650,
"train_runtime": 219164.0872,
"train_tokens_per_second": 6798.45
},
{
"epoch": 1.404031830238727,
"grad_norm": 0.8165110526157475,
"learning_rate": 3.2093877223675657e-05,
"loss": 0.7055,
"num_input_tokens_seen": 1494425408,
"step": 1655,
"train_runtime": 219817.6974,
"train_tokens_per_second": 6798.476
},
{
"epoch": 1.4082758620689655,
"grad_norm": 0.9181922831479999,
"learning_rate": 3.197548849890997e-05,
"loss": 0.6919,
"num_input_tokens_seen": 1499022912,
"step": 1660,
"train_runtime": 220540.8581,
"train_tokens_per_second": 6797.03
},
{
"epoch": 1.4125198938992043,
"grad_norm": 0.8451299000501948,
"learning_rate": 3.1856929894938294e-05,
"loss": 0.6851,
"num_input_tokens_seen": 1503165184,
"step": 1665,
"train_runtime": 221107.0695,
"train_tokens_per_second": 6798.359
},
{
"epoch": 1.4167639257294429,
"grad_norm": 0.9309089115614846,
"learning_rate": 3.17382042991056e-05,
"loss": 0.6779,
"num_input_tokens_seen": 1507912704,
"step": 1670,
"train_runtime": 221826.3413,
"train_tokens_per_second": 6797.717
},
{
"epoch": 1.4210079575596817,
"grad_norm": 0.897730430026796,
"learning_rate": 3.16193146028237e-05,
"loss": 0.6916,
"num_input_tokens_seen": 1512406912,
"step": 1675,
"train_runtime": 222474.5449,
"train_tokens_per_second": 6798.112
},
{
"epoch": 1.4252519893899205,
"grad_norm": 1.0914616744302021,
"learning_rate": 3.1500263701500896e-05,
"loss": 0.7087,
"num_input_tokens_seen": 1516995328,
"step": 1680,
"train_runtime": 223170.4553,
"train_tokens_per_second": 6797.474
},
{
"epoch": 1.4294960212201593,
"grad_norm": 0.7658163870719906,
"learning_rate": 3.1381054494471405e-05,
"loss": 0.703,
"num_input_tokens_seen": 1521406976,
"step": 1685,
"train_runtime": 223818.5244,
"train_tokens_per_second": 6797.502
},
{
"epoch": 1.4337400530503979,
"grad_norm": 0.7295564226365354,
"learning_rate": 3.12616898849248e-05,
"loss": 0.7035,
"num_input_tokens_seen": 1526055168,
"step": 1690,
"train_runtime": 224524.94,
"train_tokens_per_second": 6796.818
},
{
"epoch": 1.4379840848806367,
"grad_norm": 0.8439956680094854,
"learning_rate": 3.1142172779835274e-05,
"loss": 0.6746,
"num_input_tokens_seen": 1530635200,
"step": 1695,
"train_runtime": 225153.441,
"train_tokens_per_second": 6798.187
},
{
"epoch": 1.4422281167108753,
"grad_norm": 0.8397468802586634,
"learning_rate": 3.1022506089890876e-05,
"loss": 0.7068,
"num_input_tokens_seen": 1535012288,
"step": 1700,
"train_runtime": 225781.6539,
"train_tokens_per_second": 6798.658
},
{
"epoch": 1.446472148541114,
"grad_norm": 0.7474048463993876,
"learning_rate": 3.0902692729422575e-05,
"loss": 0.6865,
"num_input_tokens_seen": 1539284736,
"step": 1705,
"train_runtime": 226344.0602,
"train_tokens_per_second": 6800.641
},
{
"epoch": 1.4507161803713529,
"grad_norm": 0.9736770454301451,
"learning_rate": 3.078273561633335e-05,
"loss": 0.6763,
"num_input_tokens_seen": 1543699904,
"step": 1710,
"train_runtime": 226957.3024,
"train_tokens_per_second": 6801.719
},
{
"epoch": 1.4549602122015914,
"grad_norm": 1.1147200850975938,
"learning_rate": 3.066263767202706e-05,
"loss": 0.6914,
"num_input_tokens_seen": 1548275328,
"step": 1715,
"train_runtime": 227614.0923,
"train_tokens_per_second": 6802.195
},
{
"epoch": 1.4592042440318302,
"grad_norm": 0.9261552445682865,
"learning_rate": 3.0542401821337346e-05,
"loss": 0.6895,
"num_input_tokens_seen": 1552716864,
"step": 1720,
"train_runtime": 228231.3018,
"train_tokens_per_second": 6803.26
},
{
"epoch": 1.463448275862069,
"grad_norm": 0.7494164761692941,
"learning_rate": 3.042203099245639e-05,
"loss": 0.6871,
"num_input_tokens_seen": 1557269760,
"step": 1725,
"train_runtime": 228914.9513,
"train_tokens_per_second": 6802.831
},
{
"epoch": 1.4676923076923076,
"grad_norm": 0.7383133192878851,
"learning_rate": 3.0301528116863592e-05,
"loss": 0.6914,
"num_input_tokens_seen": 1561556608,
"step": 1730,
"train_runtime": 229541.9414,
"train_tokens_per_second": 6802.925
},
{
"epoch": 1.4719363395225464,
"grad_norm": 0.877542891400688,
"learning_rate": 3.0180896129254182e-05,
"loss": 0.6962,
"num_input_tokens_seen": 1565974592,
"step": 1735,
"train_runtime": 230156.1279,
"train_tokens_per_second": 6803.967
},
{
"epoch": 1.4761803713527852,
"grad_norm": 0.7394328578918072,
"learning_rate": 3.006013796746774e-05,
"loss": 0.6763,
"num_input_tokens_seen": 1570370368,
"step": 1740,
"train_runtime": 230776.2675,
"train_tokens_per_second": 6804.731
},
{
"epoch": 1.4804244031830238,
"grad_norm": 0.8032649294789167,
"learning_rate": 2.993925657241668e-05,
"loss": 0.6904,
"num_input_tokens_seen": 1574874432,
"step": 1745,
"train_runtime": 231438.1989,
"train_tokens_per_second": 6804.73
},
{
"epoch": 1.4846684350132626,
"grad_norm": 0.9191103442108757,
"learning_rate": 2.9818254888014586e-05,
"loss": 0.6809,
"num_input_tokens_seen": 1579401664,
"step": 1750,
"train_runtime": 232077.6612,
"train_tokens_per_second": 6805.488
},
{
"epoch": 1.4889124668435012,
"grad_norm": 0.86131262301876,
"learning_rate": 2.9697135861104546e-05,
"loss": 0.6976,
"num_input_tokens_seen": 1584000064,
"step": 1755,
"train_runtime": 232725.1511,
"train_tokens_per_second": 6806.312
},
{
"epoch": 1.49315649867374,
"grad_norm": 0.7493354049181269,
"learning_rate": 2.9575902441387393e-05,
"loss": 0.693,
"num_input_tokens_seen": 1588529152,
"step": 1760,
"train_runtime": 233378.3571,
"train_tokens_per_second": 6806.669
},
{
"epoch": 1.4974005305039788,
"grad_norm": 0.790057237962092,
"learning_rate": 2.9454557581349818e-05,
"loss": 0.6793,
"num_input_tokens_seen": 1593390656,
"step": 1765,
"train_runtime": 234055.6771,
"train_tokens_per_second": 6807.742
},
{
"epoch": 1.5016445623342176,
"grad_norm": 0.8848325086982859,
"learning_rate": 2.933310423619252e-05,
"loss": 0.6963,
"num_input_tokens_seen": 1597966720,
"step": 1770,
"train_runtime": 234761.2586,
"train_tokens_per_second": 6806.774
},
{
"epoch": 1.5016445623342176,
"eval_loss": 0.7488037943840027,
"eval_runtime": 1056.893,
"eval_samples_per_second": 2.883,
"eval_steps_per_second": 0.091,
"num_input_tokens_seen": 1597966720,
"step": 1770
},
{
"epoch": 1.5058885941644562,
"grad_norm": 0.8733912053492838,
"learning_rate": 2.9211545363758214e-05,
"loss": 0.6861,
"num_input_tokens_seen": 1602346944,
"step": 1775,
"train_runtime": 236413.8812,
"train_tokens_per_second": 6777.719
},
{
"epoch": 1.510132625994695,
"grad_norm": 0.9213477294170037,
"learning_rate": 2.9089883924459603e-05,
"loss": 0.6802,
"num_input_tokens_seen": 1606861888,
"step": 1780,
"train_runtime": 237090.594,
"train_tokens_per_second": 6777.417
},
{
"epoch": 1.5143766578249336,
"grad_norm": 0.8533906385009806,
"learning_rate": 2.8968122881207272e-05,
"loss": 0.6926,
"num_input_tokens_seen": 1611490176,
"step": 1785,
"train_runtime": 237783.8661,
"train_tokens_per_second": 6777.122
},
{
"epoch": 1.5186206896551724,
"grad_norm": 0.700075097629672,
"learning_rate": 2.884626519933753e-05,
"loss": 0.6809,
"num_input_tokens_seen": 1616104256,
"step": 1790,
"train_runtime": 238446.2141,
"train_tokens_per_second": 6777.647
},
{
"epoch": 1.5228647214854112,
"grad_norm": 0.7529188790563152,
"learning_rate": 2.872431384654021e-05,
"loss": 0.6744,
"num_input_tokens_seen": 1620585216,
"step": 1795,
"train_runtime": 239099.2559,
"train_tokens_per_second": 6777.876
},
{
"epoch": 1.52710875331565,
"grad_norm": 0.9180471978156958,
"learning_rate": 2.8602271792786355e-05,
"loss": 0.6979,
"num_input_tokens_seen": 1625263744,
"step": 1800,
"train_runtime": 239770.0872,
"train_tokens_per_second": 6778.426
},
{
"epoch": 1.5313527851458886,
"grad_norm": 0.936376426158768,
"learning_rate": 2.8480142010255956e-05,
"loss": 0.6701,
"num_input_tokens_seen": 1629558400,
"step": 1805,
"train_runtime": 240351.8577,
"train_tokens_per_second": 6779.887
},
{
"epoch": 1.5355968169761272,
"grad_norm": 0.6813164585163979,
"learning_rate": 2.835792747326549e-05,
"loss": 0.6846,
"num_input_tokens_seen": 1633885760,
"step": 1810,
"train_runtime": 240960.0435,
"train_tokens_per_second": 6780.733
},
{
"epoch": 1.539840848806366,
"grad_norm": 0.9012352213500389,
"learning_rate": 2.8235631158195542e-05,
"loss": 0.6752,
"num_input_tokens_seen": 1638485184,
"step": 1815,
"train_runtime": 241606.7877,
"train_tokens_per_second": 6781.619
},
{
"epoch": 1.5440848806366048,
"grad_norm": 1.0501209838121786,
"learning_rate": 2.8113256043418296e-05,
"loss": 0.6786,
"num_input_tokens_seen": 1643127424,
"step": 1820,
"train_runtime": 242328.0234,
"train_tokens_per_second": 6780.592
},
{
"epoch": 1.5483289124668436,
"grad_norm": 0.7301282294890542,
"learning_rate": 2.7990805109224994e-05,
"loss": 0.7052,
"num_input_tokens_seen": 1647584256,
"step": 1825,
"train_runtime": 242957.2381,
"train_tokens_per_second": 6781.375
},
{
"epoch": 1.5525729442970824,
"grad_norm": 0.7180423134088882,
"learning_rate": 2.786828133775337e-05,
"loss": 0.6862,
"num_input_tokens_seen": 1651979520,
"step": 1830,
"train_runtime": 243571.1176,
"train_tokens_per_second": 6782.329
},
{
"epoch": 1.556816976127321,
"grad_norm": 1.0492539609549594,
"learning_rate": 2.774568771291503e-05,
"loss": 0.6832,
"num_input_tokens_seen": 1656516672,
"step": 1835,
"train_runtime": 244227.2168,
"train_tokens_per_second": 6782.687
},
{
"epoch": 1.5610610079575595,
"grad_norm": 1.1159044633198913,
"learning_rate": 2.7623027220322757e-05,
"loss": 0.6695,
"num_input_tokens_seen": 1661151360,
"step": 1840,
"train_runtime": 244907.8876,
"train_tokens_per_second": 6782.76
},
{
"epoch": 1.5653050397877983,
"grad_norm": 0.7897737642307381,
"learning_rate": 2.75003028472178e-05,
"loss": 0.6781,
"num_input_tokens_seen": 1665702272,
"step": 1845,
"train_runtime": 245544.1767,
"train_tokens_per_second": 6783.717
},
{
"epoch": 1.5695490716180371,
"grad_norm": 0.8021337355935967,
"learning_rate": 2.737751758239717e-05,
"loss": 0.6872,
"num_input_tokens_seen": 1670142848,
"step": 1850,
"train_runtime": 246145.0482,
"train_tokens_per_second": 6785.198
},
{
"epoch": 1.573793103448276,
"grad_norm": 0.9146806729788793,
"learning_rate": 2.7254674416140796e-05,
"loss": 0.6674,
"num_input_tokens_seen": 1674686336,
"step": 1855,
"train_runtime": 246790.7905,
"train_tokens_per_second": 6785.854
},
{
"epoch": 1.5780371352785147,
"grad_norm": 0.7370948567058712,
"learning_rate": 2.7131776340138732e-05,
"loss": 0.6835,
"num_input_tokens_seen": 1679386880,
"step": 1860,
"train_runtime": 247484.4332,
"train_tokens_per_second": 6785.828
},
{
"epoch": 1.5822811671087533,
"grad_norm": 0.8275204157097062,
"learning_rate": 2.700882634741828e-05,
"loss": 0.6633,
"num_input_tokens_seen": 1683943488,
"step": 1865,
"train_runtime": 248132.5186,
"train_tokens_per_second": 6786.468
},
{
"epoch": 1.586525198938992,
"grad_norm": 0.9040723037619556,
"learning_rate": 2.688582743227112e-05,
"loss": 0.6687,
"num_input_tokens_seen": 1688602624,
"step": 1870,
"train_runtime": 248805.7127,
"train_tokens_per_second": 6786.832
},
{
"epoch": 1.5907692307692307,
"grad_norm": 0.760985868201272,
"learning_rate": 2.676278259018037e-05,
"loss": 0.6978,
"num_input_tokens_seen": 1693144960,
"step": 1875,
"train_runtime": 249443.0497,
"train_tokens_per_second": 6787.701
},
{
"epoch": 1.5950132625994695,
"grad_norm": 0.6622361089518702,
"learning_rate": 2.663969481774764e-05,
"loss": 0.6809,
"num_input_tokens_seen": 1697705216,
"step": 1880,
"train_runtime": 250088.7537,
"train_tokens_per_second": 6788.411
},
{
"epoch": 1.5992572944297083,
"grad_norm": 0.9742697876211484,
"learning_rate": 2.6516567112620057e-05,
"loss": 0.6955,
"num_input_tokens_seen": 1702328000,
"step": 1885,
"train_runtime": 250811.4006,
"train_tokens_per_second": 6787.283
},
{
"epoch": 1.603501326259947,
"grad_norm": 0.8432142437312786,
"learning_rate": 2.6393402473417257e-05,
"loss": 0.6891,
"num_input_tokens_seen": 1706848704,
"step": 1890,
"train_runtime": 251465.4682,
"train_tokens_per_second": 6787.607
},
{
"epoch": 1.6077453580901857,
"grad_norm": 1.1461032434751868,
"learning_rate": 2.627020389965835e-05,
"loss": 0.6813,
"num_input_tokens_seen": 1711334336,
"step": 1895,
"train_runtime": 252107.1822,
"train_tokens_per_second": 6788.122
},
{
"epoch": 1.6119893899204243,
"grad_norm": 0.7667390099087915,
"learning_rate": 2.61469743916889e-05,
"loss": 0.6809,
"num_input_tokens_seen": 1715695488,
"step": 1900,
"train_runtime": 252733.9504,
"train_tokens_per_second": 6788.544
},
{
"epoch": 1.616233421750663,
"grad_norm": 1.319936113059114,
"learning_rate": 2.6023716950607814e-05,
"loss": 0.6773,
"num_input_tokens_seen": 1720293184,
"step": 1905,
"train_runtime": 253411.1635,
"train_tokens_per_second": 6788.545
},
{
"epoch": 1.620477453580902,
"grad_norm": 0.8696242756135805,
"learning_rate": 2.590043457819428e-05,
"loss": 0.6858,
"num_input_tokens_seen": 1724818304,
"step": 1910,
"train_runtime": 254091.2449,
"train_tokens_per_second": 6788.185
},
{
"epoch": 1.6247214854111407,
"grad_norm": 0.6984051732842884,
"learning_rate": 2.5777130276834677e-05,
"loss": 0.6558,
"num_input_tokens_seen": 1729429824,
"step": 1915,
"train_runtime": 254736.195,
"train_tokens_per_second": 6789.101
},
{
"epoch": 1.6289655172413793,
"grad_norm": 0.7277968832920597,
"learning_rate": 2.56538070494494e-05,
"loss": 0.6816,
"num_input_tokens_seen": 1733884032,
"step": 1920,
"train_runtime": 255336.8051,
"train_tokens_per_second": 6790.576
},
{
"epoch": 1.633209549071618,
"grad_norm": 0.9589871027299995,
"learning_rate": 2.5530467899419792e-05,
"loss": 0.6529,
"num_input_tokens_seen": 1738507328,
"step": 1925,
"train_runtime": 256022.2293,
"train_tokens_per_second": 6790.455
},
{
"epoch": 1.6374535809018567,
"grad_norm": 0.8788636504147925,
"learning_rate": 2.5407115830514955e-05,
"loss": 0.6613,
"num_input_tokens_seen": 1743139584,
"step": 1930,
"train_runtime": 256649.7946,
"train_tokens_per_second": 6791.899
},
{
"epoch": 1.6416976127320955,
"grad_norm": 0.751730530930981,
"learning_rate": 2.5283753846818626e-05,
"loss": 0.6688,
"num_input_tokens_seen": 1747799104,
"step": 1935,
"train_runtime": 257356.5973,
"train_tokens_per_second": 6791.351
},
{
"epoch": 1.6459416445623343,
"grad_norm": 1.0663719817746726,
"learning_rate": 2.516038495265599e-05,
"loss": 0.6806,
"num_input_tokens_seen": 1752473536,
"step": 1940,
"train_runtime": 258018.0708,
"train_tokens_per_second": 6792.057
},
{
"epoch": 1.650185676392573,
"grad_norm": 0.8682191978665272,
"learning_rate": 2.503701215252056e-05,
"loss": 0.6834,
"num_input_tokens_seen": 1757236416,
"step": 1945,
"train_runtime": 258721.276,
"train_tokens_per_second": 6792.006
},
{
"epoch": 1.6544297082228117,
"grad_norm": 0.7561744190987955,
"learning_rate": 2.4913638451000926e-05,
"loss": 0.6723,
"num_input_tokens_seen": 1761509184,
"step": 1950,
"train_runtime": 259302.524,
"train_tokens_per_second": 6793.259
},
{
"epoch": 1.6586737400530502,
"grad_norm": 0.9925925496141875,
"learning_rate": 2.479026685270767e-05,
"loss": 0.652,
"num_input_tokens_seen": 1766119104,
"step": 1955,
"train_runtime": 259978.5543,
"train_tokens_per_second": 6793.326
},
{
"epoch": 1.662917771883289,
"grad_norm": 0.6823275549351902,
"learning_rate": 2.4666900362200124e-05,
"loss": 0.6702,
"num_input_tokens_seen": 1770634688,
"step": 1960,
"train_runtime": 260619.4472,
"train_tokens_per_second": 6793.947
},
{
"epoch": 1.6671618037135278,
"grad_norm": 0.8309834963725057,
"learning_rate": 2.4543541983913257e-05,
"loss": 0.6498,
"num_input_tokens_seen": 1775127616,
"step": 1965,
"train_runtime": 261263.6563,
"train_tokens_per_second": 6794.392
},
{
"epoch": 1.6714058355437666,
"grad_norm": 0.916270148214916,
"learning_rate": 2.4420194722084438e-05,
"loss": 0.6637,
"num_input_tokens_seen": 1779681280,
"step": 1970,
"train_runtime": 261910.4255,
"train_tokens_per_second": 6795.0
},
{
"epoch": 1.6756498673740055,
"grad_norm": 0.8052511706103775,
"learning_rate": 2.4296861580680348e-05,
"loss": 0.6941,
"num_input_tokens_seen": 1784311040,
"step": 1975,
"train_runtime": 262520.958,
"train_tokens_per_second": 6796.833
},
{
"epoch": 1.679893899204244,
"grad_norm": 0.9680550812544997,
"learning_rate": 2.4173545563323745e-05,
"loss": 0.6812,
"num_input_tokens_seen": 1788858240,
"step": 1980,
"train_runtime": 263164.4906,
"train_tokens_per_second": 6797.491
},
{
"epoch": 1.6841379310344826,
"grad_norm": 0.854153010692225,
"learning_rate": 2.4050249673220394e-05,
"loss": 0.6798,
"num_input_tokens_seen": 1793492672,
"step": 1985,
"train_runtime": 263823.0171,
"train_tokens_per_second": 6798.09
},
{
"epoch": 1.6883819628647214,
"grad_norm": 0.9065907609733993,
"learning_rate": 2.3926976913085848e-05,
"loss": 0.6844,
"num_input_tokens_seen": 1798141312,
"step": 1990,
"train_runtime": 264461.4917,
"train_tokens_per_second": 6799.256
},
{
"epoch": 1.6926259946949602,
"grad_norm": 0.964561250811086,
"learning_rate": 2.3803730285072366e-05,
"loss": 0.6795,
"num_input_tokens_seen": 1802590528,
"step": 1995,
"train_runtime": 265120.6177,
"train_tokens_per_second": 6799.134
},
{
"epoch": 1.696870026525199,
"grad_norm": 0.9694821325807637,
"learning_rate": 2.3680512790695818e-05,
"loss": 0.6863,
"num_input_tokens_seen": 1806991488,
"step": 2000,
"train_runtime": 265757.2174,
"train_tokens_per_second": 6799.407
},
{
"epoch": 1.7011140583554378,
"grad_norm": 0.8677648954151779,
"learning_rate": 2.3557327430762528e-05,
"loss": 0.6698,
"num_input_tokens_seen": 1811461056,
"step": 2005,
"train_runtime": 266396.2479,
"train_tokens_per_second": 6799.875
},
{
"epoch": 1.7053580901856764,
"grad_norm": 0.7613369253339761,
"learning_rate": 2.3434177205296257e-05,
"loss": 0.6613,
"num_input_tokens_seen": 1816142272,
"step": 2010,
"train_runtime": 267054.6771,
"train_tokens_per_second": 6800.638
},
{
"epoch": 1.709602122015915,
"grad_norm": 1.0622307149303414,
"learning_rate": 2.3311065113465083e-05,
"loss": 0.6602,
"num_input_tokens_seen": 1820555008,
"step": 2015,
"train_runtime": 267688.1492,
"train_tokens_per_second": 6801.03
},
{
"epoch": 1.7138461538461538,
"grad_norm": 0.8964492847174279,
"learning_rate": 2.3187994153508397e-05,
"loss": 0.658,
"num_input_tokens_seen": 1825223808,
"step": 2020,
"train_runtime": 268333.947,
"train_tokens_per_second": 6802.061
},
{
"epoch": 1.7180901856763926,
"grad_norm": 1.177224036456473,
"learning_rate": 2.3064967322663893e-05,
"loss": 0.6932,
"num_input_tokens_seen": 1829789568,
"step": 2025,
"train_runtime": 268969.5993,
"train_tokens_per_second": 6802.961
},
{
"epoch": 1.7223342175066314,
"grad_norm": 1.0226100260638311,
"learning_rate": 2.2941987617094527e-05,
"loss": 0.6721,
"num_input_tokens_seen": 1834277632,
"step": 2030,
"train_runtime": 269616.3957,
"train_tokens_per_second": 6803.287
},
{
"epoch": 1.72657824933687,
"grad_norm": 0.776158652036907,
"learning_rate": 2.2819058031815606e-05,
"loss": 0.685,
"num_input_tokens_seen": 1838997504,
"step": 2035,
"train_runtime": 270312.8678,
"train_tokens_per_second": 6803.219
},
{
"epoch": 1.7308222811671088,
"grad_norm": 0.8255759535290057,
"learning_rate": 2.26961815606218e-05,
"loss": 0.6708,
"num_input_tokens_seen": 1843497088,
"step": 2040,
"train_runtime": 270987.9158,
"train_tokens_per_second": 6802.876
},
{
"epoch": 1.7350663129973474,
"grad_norm": 0.9846063905318818,
"learning_rate": 2.2573361196014245e-05,
"loss": 0.68,
"num_input_tokens_seen": 1848067968,
"step": 2045,
"train_runtime": 271662.3336,
"train_tokens_per_second": 6802.813
},
{
"epoch": 1.7393103448275862,
"grad_norm": 0.8104965013143679,
"learning_rate": 2.2450599929127715e-05,
"loss": 0.6681,
"num_input_tokens_seen": 1852536512,
"step": 2050,
"train_runtime": 272308.8148,
"train_tokens_per_second": 6803.072
},
{
"epoch": 1.743554376657825,
"grad_norm": 0.7512768919734117,
"learning_rate": 2.2327900749657677e-05,
"loss": 0.6608,
"num_input_tokens_seen": 1856969408,
"step": 2055,
"train_runtime": 272924.5407,
"train_tokens_per_second": 6803.966
},
{
"epoch": 1.7477984084880638,
"grad_norm": 0.938563336869354,
"learning_rate": 2.2205266645787588e-05,
"loss": 0.6436,
"num_input_tokens_seen": 1861364032,
"step": 2060,
"train_runtime": 273525.8351,
"train_tokens_per_second": 6805.076
},
{
"epoch": 1.7520424403183024,
"grad_norm": 0.9168142161151926,
"learning_rate": 2.2082700604116046e-05,
"loss": 0.6734,
"num_input_tokens_seen": 1866079936,
"step": 2065,
"train_runtime": 274238.6936,
"train_tokens_per_second": 6804.583
},
{
"epoch": 1.7520424403183024,
"eval_loss": 0.729947566986084,
"eval_runtime": 1057.9673,
"eval_samples_per_second": 2.88,
"eval_steps_per_second": 0.091,
"num_input_tokens_seen": 1866079936,
"step": 2065
},
{
"epoch": 1.7562864721485412,
"grad_norm": 0.7781219629120125,
"learning_rate": 2.1960205609584066e-05,
"loss": 0.6555,
"num_input_tokens_seen": 1870563904,
"step": 2070,
"train_runtime": 275928.5055,
"train_tokens_per_second": 6779.162
},
{
"epoch": 1.7605305039787797,
"grad_norm": 0.8980636445872143,
"learning_rate": 2.183778464540244e-05,
"loss": 0.6756,
"num_input_tokens_seen": 1874859840,
"step": 2075,
"train_runtime": 276546.0381,
"train_tokens_per_second": 6779.558
},
{
"epoch": 1.7647745358090186,
"grad_norm": 0.8923077727556372,
"learning_rate": 2.1715440692978994e-05,
"loss": 0.6779,
"num_input_tokens_seen": 1879558656,
"step": 2080,
"train_runtime": 277240.8007,
"train_tokens_per_second": 6779.517
},
{
"epoch": 1.7690185676392574,
"grad_norm": 1.1974687941835145,
"learning_rate": 2.159317673184608e-05,
"loss": 0.6671,
"num_input_tokens_seen": 1883979904,
"step": 2085,
"train_runtime": 277861.5498,
"train_tokens_per_second": 6780.283
},
{
"epoch": 1.7732625994694962,
"grad_norm": 0.7953214594587166,
"learning_rate": 2.1470995739587944e-05,
"loss": 0.6731,
"num_input_tokens_seen": 1888448384,
"step": 2090,
"train_runtime": 278500.3438,
"train_tokens_per_second": 6780.776
},
{
"epoch": 1.7775066312997347,
"grad_norm": 0.7353368817381706,
"learning_rate": 2.13489006917682e-05,
"loss": 0.6567,
"num_input_tokens_seen": 1892962880,
"step": 2095,
"train_runtime": 279180.0365,
"train_tokens_per_second": 6780.438
},
{
"epoch": 1.7817506631299733,
"grad_norm": 0.7817976445897892,
"learning_rate": 2.1226894561857447e-05,
"loss": 0.6645,
"num_input_tokens_seen": 1897595968,
"step": 2100,
"train_runtime": 279829.9966,
"train_tokens_per_second": 6781.246
},
{
"epoch": 1.7859946949602121,
"grad_norm": 0.8309895762650132,
"learning_rate": 2.1104980321160752e-05,
"loss": 0.6734,
"num_input_tokens_seen": 1902109888,
"step": 2105,
"train_runtime": 280489.1904,
"train_tokens_per_second": 6781.402
},
{
"epoch": 1.790238726790451,
"grad_norm": 0.6339748804576945,
"learning_rate": 2.0983160938745382e-05,
"loss": 0.6526,
"num_input_tokens_seen": 1906705216,
"step": 2110,
"train_runtime": 281135.8565,
"train_tokens_per_second": 6782.149
},
{
"epoch": 1.7944827586206897,
"grad_norm": 0.8552208011108713,
"learning_rate": 2.086143938136841e-05,
"loss": 0.6563,
"num_input_tokens_seen": 1911218304,
"step": 2115,
"train_runtime": 281788.5646,
"train_tokens_per_second": 6782.455
},
{
"epoch": 1.7987267904509285,
"grad_norm": 0.9732618024212317,
"learning_rate": 2.0739818613404513e-05,
"loss": 0.6619,
"num_input_tokens_seen": 1915723008,
"step": 2120,
"train_runtime": 282459.2265,
"train_tokens_per_second": 6782.299
},
{
"epoch": 1.8029708222811671,
"grad_norm": 1.0431409571591543,
"learning_rate": 2.06183015967738e-05,
"loss": 0.6451,
"num_input_tokens_seen": 1920464320,
"step": 2125,
"train_runtime": 283127.6969,
"train_tokens_per_second": 6783.032
},
{
"epoch": 1.8072148541114057,
"grad_norm": 0.874966660194592,
"learning_rate": 2.0496891290869595e-05,
"loss": 0.6679,
"num_input_tokens_seen": 1924942528,
"step": 2130,
"train_runtime": 283780.283,
"train_tokens_per_second": 6783.214
},
{
"epoch": 1.8114588859416445,
"grad_norm": 0.7749558949940442,
"learning_rate": 2.0375590652486482e-05,
"loss": 0.6803,
"num_input_tokens_seen": 1929745408,
"step": 2135,
"train_runtime": 284477.4061,
"train_tokens_per_second": 6783.475
},
{
"epoch": 1.8157029177718833,
"grad_norm": 0.7228243017653365,
"learning_rate": 2.025440263574817e-05,
"loss": 0.6338,
"num_input_tokens_seen": 1934284800,
"step": 2140,
"train_runtime": 285111.2012,
"train_tokens_per_second": 6784.317
},
{
"epoch": 1.819946949602122,
"grad_norm": 0.7810955976714484,
"learning_rate": 2.013333019203563e-05,
"loss": 0.6532,
"num_input_tokens_seen": 1938877184,
"step": 2145,
"train_runtime": 285771.2844,
"train_tokens_per_second": 6784.717
},
{
"epoch": 1.8241909814323607,
"grad_norm": 0.8174083310669077,
"learning_rate": 2.001237626991523e-05,
"loss": 0.6511,
"num_input_tokens_seen": 1943391872,
"step": 2150,
"train_runtime": 286412.497,
"train_tokens_per_second": 6785.29
},
{
"epoch": 1.8284350132625995,
"grad_norm": 0.795144872361272,
"learning_rate": 1.989154381506684e-05,
"loss": 0.6598,
"num_input_tokens_seen": 1947919808,
"step": 2155,
"train_runtime": 287052.841,
"train_tokens_per_second": 6785.928
},
{
"epoch": 1.832679045092838,
"grad_norm": 0.7485422633125285,
"learning_rate": 1.9770835770212198e-05,
"loss": 0.6566,
"num_input_tokens_seen": 1952470976,
"step": 2160,
"train_runtime": 287718.7456,
"train_tokens_per_second": 6786.04
},
{
"epoch": 1.8369230769230769,
"grad_norm": 1.1308794437672134,
"learning_rate": 1.9650255075043163e-05,
"loss": 0.6559,
"num_input_tokens_seen": 1957140480,
"step": 2165,
"train_runtime": 288394.0615,
"train_tokens_per_second": 6786.341
},
{
"epoch": 1.8411671087533157,
"grad_norm": 0.8804300984321736,
"learning_rate": 1.9529804666150157e-05,
"loss": 0.6628,
"num_input_tokens_seen": 1961650176,
"step": 2170,
"train_runtime": 289025.4848,
"train_tokens_per_second": 6787.118
},
{
"epoch": 1.8454111405835545,
"grad_norm": 0.7255223951059633,
"learning_rate": 1.940948747695066e-05,
"loss": 0.6394,
"num_input_tokens_seen": 1966166336,
"step": 2175,
"train_runtime": 289641.3116,
"train_tokens_per_second": 6788.28
},
{
"epoch": 1.849655172413793,
"grad_norm": 0.7679331866159973,
"learning_rate": 1.9289306437617734e-05,
"loss": 0.6643,
"num_input_tokens_seen": 1970829888,
"step": 2180,
"train_runtime": 290308.436,
"train_tokens_per_second": 6788.745
},
{
"epoch": 1.8538992042440319,
"grad_norm": 1.142785592894065,
"learning_rate": 1.916926447500871e-05,
"loss": 0.6499,
"num_input_tokens_seen": 1975190528,
"step": 2185,
"train_runtime": 290918.3543,
"train_tokens_per_second": 6789.501
},
{
"epoch": 1.8581432360742705,
"grad_norm": 0.9065635652001467,
"learning_rate": 1.904936451259384e-05,
"loss": 0.6607,
"num_input_tokens_seen": 1979864704,
"step": 2190,
"train_runtime": 291624.09,
"train_tokens_per_second": 6789.099
},
{
"epoch": 1.8623872679045093,
"grad_norm": 2.2396906301840747,
"learning_rate": 1.892960947038519e-05,
"loss": 0.671,
"num_input_tokens_seen": 1984425600,
"step": 2195,
"train_runtime": 292249.7481,
"train_tokens_per_second": 6790.17
},
{
"epoch": 1.866631299734748,
"grad_norm": 0.821508733622114,
"learning_rate": 1.8810002264865444e-05,
"loss": 0.6556,
"num_input_tokens_seen": 1988847360,
"step": 2200,
"train_runtime": 292879.9609,
"train_tokens_per_second": 6790.657
},
{
"epoch": 1.8708753315649869,
"grad_norm": 0.7742604167377043,
"learning_rate": 1.8690545808916908e-05,
"loss": 0.6713,
"num_input_tokens_seen": 1993346432,
"step": 2205,
"train_runtime": 293530.2207,
"train_tokens_per_second": 6790.941
},
{
"epoch": 1.8751193633952254,
"grad_norm": 0.7780945672181276,
"learning_rate": 1.8571243011750604e-05,
"loss": 0.6511,
"num_input_tokens_seen": 1997950144,
"step": 2210,
"train_runtime": 294160.5008,
"train_tokens_per_second": 6792.041
},
{
"epoch": 1.879363395225464,
"grad_norm": 0.7883576009365799,
"learning_rate": 1.8452096778835348e-05,
"loss": 0.6611,
"num_input_tokens_seen": 2002450688,
"step": 2215,
"train_runtime": 294809.6886,
"train_tokens_per_second": 6792.35
},
{
"epoch": 1.8836074270557028,
"grad_norm": 1.3753307055301716,
"learning_rate": 1.833311001182707e-05,
"loss": 0.6566,
"num_input_tokens_seen": 2006911360,
"step": 2220,
"train_runtime": 295455.3095,
"train_tokens_per_second": 6792.605
},
{
"epoch": 1.8878514588859416,
"grad_norm": 0.7211010767842578,
"learning_rate": 1.821428560849809e-05,
"loss": 0.6493,
"num_input_tokens_seen": 2011488384,
"step": 2225,
"train_runtime": 296132.4569,
"train_tokens_per_second": 6792.529
},
{
"epoch": 1.8920954907161804,
"grad_norm": 0.7499760395080998,
"learning_rate": 1.8095626462666548e-05,
"loss": 0.6688,
"num_input_tokens_seen": 2016013248,
"step": 2230,
"train_runtime": 296757.4534,
"train_tokens_per_second": 6793.471
},
{
"epoch": 1.8963395225464192,
"grad_norm": 0.7281088629603852,
"learning_rate": 1.797713546412598e-05,
"loss": 0.6691,
"num_input_tokens_seen": 2020582592,
"step": 2235,
"train_runtime": 297412.3677,
"train_tokens_per_second": 6793.875
},
{
"epoch": 1.9005835543766578,
"grad_norm": 0.6418455823156107,
"learning_rate": 1.78588154985749e-05,
"loss": 0.6638,
"num_input_tokens_seen": 2025185600,
"step": 2240,
"train_runtime": 298049.559,
"train_tokens_per_second": 6794.795
},
{
"epoch": 1.9048275862068964,
"grad_norm": 0.8741945049532132,
"learning_rate": 1.7740669447546513e-05,
"loss": 0.6691,
"num_input_tokens_seen": 2029829952,
"step": 2245,
"train_runtime": 298722.6372,
"train_tokens_per_second": 6795.032
},
{
"epoch": 1.9090716180371352,
"grad_norm": 0.841478880460149,
"learning_rate": 1.762270018833857e-05,
"loss": 0.6789,
"num_input_tokens_seen": 2034547456,
"step": 2250,
"train_runtime": 299449.1091,
"train_tokens_per_second": 6794.301
},
{
"epoch": 1.913315649867374,
"grad_norm": 0.8723647428014106,
"learning_rate": 1.7504910593943267e-05,
"loss": 0.6579,
"num_input_tokens_seen": 2039144640,
"step": 2255,
"train_runtime": 300085.9181,
"train_tokens_per_second": 6795.203
},
{
"epoch": 1.9175596816976128,
"grad_norm": 0.7892235650709017,
"learning_rate": 1.738730353297732e-05,
"loss": 0.6824,
"num_input_tokens_seen": 2043803392,
"step": 2260,
"train_runtime": 300788.7082,
"train_tokens_per_second": 6794.814
},
{
"epoch": 1.9218037135278516,
"grad_norm": 0.7722394088543671,
"learning_rate": 1.726988186961202e-05,
"loss": 0.6557,
"num_input_tokens_seen": 2048335872,
"step": 2265,
"train_runtime": 301420.1009,
"train_tokens_per_second": 6795.618
},
{
"epoch": 1.9260477453580902,
"grad_norm": 0.922769500095772,
"learning_rate": 1.7152648463503605e-05,
"loss": 0.6614,
"num_input_tokens_seen": 2053131840,
"step": 2270,
"train_runtime": 302141.3095,
"train_tokens_per_second": 6795.27
},
{
"epoch": 1.9302917771883288,
"grad_norm": 0.7824828630386448,
"learning_rate": 1.7035606169723488e-05,
"loss": 0.6478,
"num_input_tokens_seen": 2057792768,
"step": 2275,
"train_runtime": 302806.6263,
"train_tokens_per_second": 6795.732
},
{
"epoch": 1.9345358090185676,
"grad_norm": 0.8688413702016398,
"learning_rate": 1.69187578386888e-05,
"loss": 0.6524,
"num_input_tokens_seen": 2062355392,
"step": 2280,
"train_runtime": 303443.8425,
"train_tokens_per_second": 6796.498
},
{
"epoch": 1.9387798408488064,
"grad_norm": 0.9595105345229777,
"learning_rate": 1.6802106316092966e-05,
"loss": 0.6603,
"num_input_tokens_seen": 2066871424,
"step": 2285,
"train_runtime": 304131.5685,
"train_tokens_per_second": 6795.978
},
{
"epoch": 1.9430238726790452,
"grad_norm": 0.8712664797281483,
"learning_rate": 1.6685654442836373e-05,
"loss": 0.6587,
"num_input_tokens_seen": 2071492864,
"step": 2290,
"train_runtime": 304825.7013,
"train_tokens_per_second": 6795.663
},
{
"epoch": 1.9472679045092838,
"grad_norm": 0.7732671706732043,
"learning_rate": 1.656940505495722e-05,
"loss": 0.6524,
"num_input_tokens_seen": 2075990976,
"step": 2295,
"train_runtime": 305476.6188,
"train_tokens_per_second": 6795.908
},
{
"epoch": 1.9515119363395226,
"grad_norm": 0.8870290841303411,
"learning_rate": 1.645336098356242e-05,
"loss": 0.6405,
"num_input_tokens_seen": 2080441856,
"step": 2300,
"train_runtime": 306111.3026,
"train_tokens_per_second": 6796.358
},
{
"epoch": 1.9557559681697612,
"grad_norm": 0.8458061170360918,
"learning_rate": 1.633752505475864e-05,
"loss": 0.6634,
"num_input_tokens_seen": 2085189888,
"step": 2305,
"train_runtime": 306781.713,
"train_tokens_per_second": 6796.982
},
{
"epoch": 1.96,
"grad_norm": 1.3279259324289545,
"learning_rate": 1.622190008958354e-05,
"loss": 0.6473,
"num_input_tokens_seen": 2089579008,
"step": 2310,
"train_runtime": 307390.5024,
"train_tokens_per_second": 6797.8
},
{
"epoch": 1.9642440318302388,
"grad_norm": 0.9118221582118621,
"learning_rate": 1.610648890393701e-05,
"loss": 0.6665,
"num_input_tokens_seen": 2094280256,
"step": 2315,
"train_runtime": 308079.9412,
"train_tokens_per_second": 6797.847
},
{
"epoch": 1.9684880636604776,
"grad_norm": 0.8711258127478657,
"learning_rate": 1.5991294308512595e-05,
"loss": 0.6587,
"num_input_tokens_seen": 2098990464,
"step": 2320,
"train_runtime": 308818.5174,
"train_tokens_per_second": 6796.841
},
{
"epoch": 1.9727320954907162,
"grad_norm": 0.7184830235566304,
"learning_rate": 1.5876319108729077e-05,
"loss": 0.6661,
"num_input_tokens_seen": 2103407872,
"step": 2325,
"train_runtime": 309463.3706,
"train_tokens_per_second": 6796.953
},
{
"epoch": 1.976976127320955,
"grad_norm": 0.7747189164106947,
"learning_rate": 1.5761566104662117e-05,
"loss": 0.6518,
"num_input_tokens_seen": 2107807168,
"step": 2330,
"train_runtime": 310069.0252,
"train_tokens_per_second": 6797.864
},
{
"epoch": 1.9812201591511935,
"grad_norm": 0.878008091110606,
"learning_rate": 1.5647038090976114e-05,
"loss": 0.6593,
"num_input_tokens_seen": 2112190016,
"step": 2335,
"train_runtime": 310683.7408,
"train_tokens_per_second": 6798.521
},
{
"epoch": 1.9854641909814323,
"grad_norm": 0.7138571628786999,
"learning_rate": 1.5532737856856062e-05,
"loss": 0.6507,
"num_input_tokens_seen": 2116487360,
"step": 2340,
"train_runtime": 311271.0869,
"train_tokens_per_second": 6799.499
},
{
"epoch": 1.9897082228116711,
"grad_norm": 0.873326228404246,
"learning_rate": 1.5418668185939715e-05,
"loss": 0.6422,
"num_input_tokens_seen": 2120920256,
"step": 2345,
"train_runtime": 311893.1375,
"train_tokens_per_second": 6800.15
},
{
"epoch": 1.99395225464191,
"grad_norm": 0.6793405806925589,
"learning_rate": 1.530483185624973e-05,
"loss": 0.6492,
"num_input_tokens_seen": 2125213056,
"step": 2350,
"train_runtime": 312486.8653,
"train_tokens_per_second": 6800.968
},
{
"epoch": 1.9981962864721485,
"grad_norm": 0.7840816343012773,
"learning_rate": 1.519123164012603e-05,
"loss": 0.6551,
"num_input_tokens_seen": 2129589248,
"step": 2355,
"train_runtime": 313085.9064,
"train_tokens_per_second": 6801.933
},
{
"epoch": 2.0016976127320953,
"grad_norm": 0.7056893265010777,
"learning_rate": 1.507787030415831e-05,
"loss": 0.4932,
"num_input_tokens_seen": 2133444224,
"step": 2360,
"train_runtime": 313640.7613,
"train_tokens_per_second": 6802.191
},
{
"epoch": 2.0016976127320953,
"eval_loss": 0.71119624376297,
"eval_runtime": 1056.465,
"eval_samples_per_second": 2.884,
"eval_steps_per_second": 0.091,
"num_input_tokens_seen": 2133444224,
"step": 2360
},
{
"epoch": 2.005941644562334,
"grad_norm": 0.8558740917942909,
"learning_rate": 1.4964750609118614e-05,
"loss": 0.5706,
"num_input_tokens_seen": 2138178304,
"step": 2365,
"train_runtime": 315463.138,
"train_tokens_per_second": 6777.902
},
{
"epoch": 2.010185676392573,
"grad_norm": 0.8240864942611333,
"learning_rate": 1.4851875309894159e-05,
"loss": 0.5672,
"num_input_tokens_seen": 2142597568,
"step": 2370,
"train_runtime": 316074.4747,
"train_tokens_per_second": 6778.774
},
{
"epoch": 2.0144297082228118,
"grad_norm": 0.9448165249761411,
"learning_rate": 1.4739247155420183e-05,
"loss": 0.5481,
"num_input_tokens_seen": 2147211968,
"step": 2375,
"train_runtime": 316715.0106,
"train_tokens_per_second": 6779.634
},
{
"epoch": 2.0186737400530506,
"grad_norm": 0.8633716944676363,
"learning_rate": 1.4626868888613027e-05,
"loss": 0.5397,
"num_input_tokens_seen": 2151752896,
"step": 2380,
"train_runtime": 317376.0878,
"train_tokens_per_second": 6779.82
},
{
"epoch": 2.022917771883289,
"grad_norm": 0.809745604875603,
"learning_rate": 1.4514743246303359e-05,
"loss": 0.5531,
"num_input_tokens_seen": 2156288704,
"step": 2385,
"train_runtime": 318045.818,
"train_tokens_per_second": 6779.805
},
{
"epoch": 2.0271618037135277,
"grad_norm": 0.8968293184086553,
"learning_rate": 1.4402872959169461e-05,
"loss": 0.5337,
"num_input_tokens_seen": 2160913088,
"step": 2390,
"train_runtime": 318702.9313,
"train_tokens_per_second": 6780.336
},
{
"epoch": 2.0314058355437665,
"grad_norm": 0.9550756849016048,
"learning_rate": 1.4291260751670816e-05,
"loss": 0.5366,
"num_input_tokens_seen": 2165574976,
"step": 2395,
"train_runtime": 319403.5503,
"train_tokens_per_second": 6780.059
},
{
"epoch": 2.0356498673740053,
"grad_norm": 1.207409378421572,
"learning_rate": 1.4179909341981625e-05,
"loss": 0.5345,
"num_input_tokens_seen": 2170092736,
"step": 2400,
"train_runtime": 320033.1824,
"train_tokens_per_second": 6780.837
},
{
"epoch": 2.039893899204244,
"grad_norm": 0.8757416767488818,
"learning_rate": 1.4068821441924779e-05,
"loss": 0.5715,
"num_input_tokens_seen": 2174494400,
"step": 2405,
"train_runtime": 320671.8092,
"train_tokens_per_second": 6781.059
},
{
"epoch": 2.044137931034483,
"grad_norm": 0.8349852718141944,
"learning_rate": 1.3957999756905643e-05,
"loss": 0.5607,
"num_input_tokens_seen": 2178937728,
"step": 2410,
"train_runtime": 321299.9151,
"train_tokens_per_second": 6781.632
},
{
"epoch": 2.0483819628647213,
"grad_norm": 0.8313527873747903,
"learning_rate": 1.3847446985846297e-05,
"loss": 0.5364,
"num_input_tokens_seen": 2183459520,
"step": 2415,
"train_runtime": 321952.4508,
"train_tokens_per_second": 6781.932
},
{
"epoch": 2.05262599469496,
"grad_norm": 1.5262840043879295,
"learning_rate": 1.3737165821119752e-05,
"loss": 0.5404,
"num_input_tokens_seen": 2187827712,
"step": 2420,
"train_runtime": 322540.9554,
"train_tokens_per_second": 6783.1
},
{
"epoch": 2.056870026525199,
"grad_norm": 2.1756261698457076,
"learning_rate": 1.3627158948484391e-05,
"loss": 0.5469,
"num_input_tokens_seen": 2192377216,
"step": 2425,
"train_runtime": 323190.2135,
"train_tokens_per_second": 6783.551
},
{
"epoch": 2.0611140583554377,
"grad_norm": 1.0913830108446643,
"learning_rate": 1.351742904701856e-05,
"loss": 0.5683,
"num_input_tokens_seen": 2196995328,
"step": 2430,
"train_runtime": 323877.7351,
"train_tokens_per_second": 6783.41
},
{
"epoch": 2.0653580901856765,
"grad_norm": 0.9641578470507003,
"learning_rate": 1.3407978789055311e-05,
"loss": 0.551,
"num_input_tokens_seen": 2201593728,
"step": 2435,
"train_runtime": 324546.3078,
"train_tokens_per_second": 6783.604
},
{
"epoch": 2.0696021220159153,
"grad_norm": 0.79070536279513,
"learning_rate": 1.3298810840117348e-05,
"loss": 0.5296,
"num_input_tokens_seen": 2206299712,
"step": 2440,
"train_runtime": 325212.3944,
"train_tokens_per_second": 6784.181
},
{
"epoch": 2.0738461538461537,
"grad_norm": 0.9950759732472904,
"learning_rate": 1.3189927858852092e-05,
"loss": 0.5623,
"num_input_tokens_seen": 2210768256,
"step": 2445,
"train_runtime": 325832.427,
"train_tokens_per_second": 6784.985
},
{
"epoch": 2.0780901856763925,
"grad_norm": 1.0145751132111058,
"learning_rate": 1.3081332496966923e-05,
"loss": 0.5454,
"num_input_tokens_seen": 2215064064,
"step": 2450,
"train_runtime": 326434.2676,
"train_tokens_per_second": 6785.636
},
{
"epoch": 2.0823342175066313,
"grad_norm": 1.2823230060644373,
"learning_rate": 1.297302739916463e-05,
"loss": 0.5435,
"num_input_tokens_seen": 2219600896,
"step": 2455,
"train_runtime": 327106.8846,
"train_tokens_per_second": 6785.552
},
{
"epoch": 2.08657824933687,
"grad_norm": 0.9419195367761739,
"learning_rate": 1.2865015203078996e-05,
"loss": 0.5445,
"num_input_tokens_seen": 2224140416,
"step": 2460,
"train_runtime": 327852.8283,
"train_tokens_per_second": 6783.96
},
{
"epoch": 2.090822281167109,
"grad_norm": 1.125875379008506,
"learning_rate": 1.27572985392105e-05,
"loss": 0.5443,
"num_input_tokens_seen": 2228717248,
"step": 2465,
"train_runtime": 328535.4625,
"train_tokens_per_second": 6783.795
},
{
"epoch": 2.0950663129973477,
"grad_norm": 1.1324409125579475,
"learning_rate": 1.2649880030862393e-05,
"loss": 0.5599,
"num_input_tokens_seen": 2233320128,
"step": 2470,
"train_runtime": 329151.185,
"train_tokens_per_second": 6785.089
},
{
"epoch": 2.099310344827586,
"grad_norm": 0.931872300199955,
"learning_rate": 1.2542762294076631e-05,
"loss": 0.5637,
"num_input_tokens_seen": 2237752384,
"step": 2475,
"train_runtime": 329773.7972,
"train_tokens_per_second": 6785.719
},
{
"epoch": 2.103554376657825,
"grad_norm": 0.8489286981286124,
"learning_rate": 1.2435947937570355e-05,
"loss": 0.5598,
"num_input_tokens_seen": 2242141568,
"step": 2480,
"train_runtime": 330402.9852,
"train_tokens_per_second": 6786.081
},
{
"epoch": 2.1077984084880637,
"grad_norm": 1.1131489144682933,
"learning_rate": 1.2329439562672178e-05,
"loss": 0.5418,
"num_input_tokens_seen": 2246654592,
"step": 2485,
"train_runtime": 331056.83,
"train_tokens_per_second": 6786.311
},
{
"epoch": 2.1120424403183025,
"grad_norm": 0.9322580519781613,
"learning_rate": 1.2223239763258965e-05,
"loss": 0.5505,
"num_input_tokens_seen": 2251247168,
"step": 2490,
"train_runtime": 331724.102,
"train_tokens_per_second": 6786.505
},
{
"epoch": 2.1162864721485413,
"grad_norm": 0.9283803102424425,
"learning_rate": 1.2117351125692603e-05,
"loss": 0.5568,
"num_input_tokens_seen": 2255680768,
"step": 2495,
"train_runtime": 332394.4276,
"train_tokens_per_second": 6786.157
},
{
"epoch": 2.12053050397878,
"grad_norm": 1.1047391998064584,
"learning_rate": 1.2011776228757024e-05,
"loss": 0.5505,
"num_input_tokens_seen": 2260087168,
"step": 2500,
"train_runtime": 333031.3463,
"train_tokens_per_second": 6786.41
},
{
"epoch": 2.1247745358090184,
"grad_norm": 0.975091099261222,
"learning_rate": 1.1906517643595408e-05,
"loss": 0.5573,
"num_input_tokens_seen": 2264578560,
"step": 2505,
"train_runtime": 333699.72,
"train_tokens_per_second": 6786.276
},
{
"epoch": 2.1290185676392572,
"grad_norm": 1.1511567847202058,
"learning_rate": 1.180157793364756e-05,
"loss": 0.5413,
"num_input_tokens_seen": 2269041472,
"step": 2510,
"train_runtime": 334339.6624,
"train_tokens_per_second": 6786.636
},
{
"epoch": 2.133262599469496,
"grad_norm": 1.0466460806645501,
"learning_rate": 1.1696959654587474e-05,
"loss": 0.5493,
"num_input_tokens_seen": 2273598720,
"step": 2515,
"train_runtime": 334997.2489,
"train_tokens_per_second": 6786.918
},
{
"epoch": 2.137506631299735,
"grad_norm": 1.0291472253443341,
"learning_rate": 1.1592665354261118e-05,
"loss": 0.5456,
"num_input_tokens_seen": 2278146944,
"step": 2520,
"train_runtime": 335632.7848,
"train_tokens_per_second": 6787.617
},
{
"epoch": 2.1417506631299736,
"grad_norm": 1.0091794838431885,
"learning_rate": 1.1488697572624351e-05,
"loss": 0.5668,
"num_input_tokens_seen": 2282573568,
"step": 2525,
"train_runtime": 336253.6573,
"train_tokens_per_second": 6788.249
},
{
"epoch": 2.145994694960212,
"grad_norm": 0.85992217862161,
"learning_rate": 1.138505884168109e-05,
"loss": 0.5308,
"num_input_tokens_seen": 2287001600,
"step": 2530,
"train_runtime": 336902.3826,
"train_tokens_per_second": 6788.321
},
{
"epoch": 2.150238726790451,
"grad_norm": 0.8608747847941026,
"learning_rate": 1.1281751685421646e-05,
"loss": 0.5605,
"num_input_tokens_seen": 2291406080,
"step": 2535,
"train_runtime": 337530.1254,
"train_tokens_per_second": 6788.745
},
{
"epoch": 2.1544827586206896,
"grad_norm": 1.0839910126914474,
"learning_rate": 1.1178778619761209e-05,
"loss": 0.5507,
"num_input_tokens_seen": 2295897472,
"step": 2540,
"train_runtime": 338146.0422,
"train_tokens_per_second": 6789.662
},
{
"epoch": 2.1587267904509284,
"grad_norm": 0.8806247078251732,
"learning_rate": 1.1076142152478686e-05,
"loss": 0.5449,
"num_input_tokens_seen": 2300505152,
"step": 2545,
"train_runtime": 338818.075,
"train_tokens_per_second": 6789.795
},
{
"epoch": 2.162970822281167,
"grad_norm": 1.1516459030706268,
"learning_rate": 1.0973844783155474e-05,
"loss": 0.5267,
"num_input_tokens_seen": 2304838976,
"step": 2550,
"train_runtime": 339432.6849,
"train_tokens_per_second": 6790.268
},
{
"epoch": 2.167214854111406,
"grad_norm": 0.8899542374126376,
"learning_rate": 1.0871889003114743e-05,
"loss": 0.5415,
"num_input_tokens_seen": 2309598144,
"step": 2555,
"train_runtime": 340127.2354,
"train_tokens_per_second": 6790.395
},
{
"epoch": 2.1714588859416444,
"grad_norm": 0.9275051879320549,
"learning_rate": 1.0770277295360629e-05,
"loss": 0.535,
"num_input_tokens_seen": 2314332800,
"step": 2560,
"train_runtime": 340820.4477,
"train_tokens_per_second": 6790.475
},
{
"epoch": 2.175702917771883,
"grad_norm": 1.0929509877136006,
"learning_rate": 1.066901213451785e-05,
"loss": 0.5407,
"num_input_tokens_seen": 2318735744,
"step": 2565,
"train_runtime": 341455.108,
"train_tokens_per_second": 6790.748
},
{
"epoch": 2.179946949602122,
"grad_norm": 1.2365680582016083,
"learning_rate": 1.0568095986771414e-05,
"loss": 0.5256,
"num_input_tokens_seen": 2323017216,
"step": 2570,
"train_runtime": 342078.7972,
"train_tokens_per_second": 6790.883
},
{
"epoch": 2.184190981432361,
"grad_norm": 0.9117201092121693,
"learning_rate": 1.0467531309806547e-05,
"loss": 0.5471,
"num_input_tokens_seen": 2327511360,
"step": 2575,
"train_runtime": 342733.0181,
"train_tokens_per_second": 6791.033
},
{
"epoch": 2.1884350132625996,
"grad_norm": 0.9389916789629429,
"learning_rate": 1.0367320552748849e-05,
"loss": 0.533,
"num_input_tokens_seen": 2332033792,
"step": 2580,
"train_runtime": 343352.1052,
"train_tokens_per_second": 6791.96
},
{
"epoch": 2.1926790450928384,
"grad_norm": 0.9268628668955959,
"learning_rate": 1.0267466156104655e-05,
"loss": 0.5493,
"num_input_tokens_seen": 2336623744,
"step": 2585,
"train_runtime": 344011.0131,
"train_tokens_per_second": 6792.293
},
{
"epoch": 2.1969230769230768,
"grad_norm": 0.9632958346283522,
"learning_rate": 1.0167970551701586e-05,
"loss": 0.5585,
"num_input_tokens_seen": 2341059904,
"step": 2590,
"train_runtime": 344653.4787,
"train_tokens_per_second": 6792.503
},
{
"epoch": 2.2011671087533156,
"grad_norm": 0.9858607973372651,
"learning_rate": 1.0068836162629333e-05,
"loss": 0.551,
"num_input_tokens_seen": 2345544192,
"step": 2595,
"train_runtime": 345280.1341,
"train_tokens_per_second": 6793.163
},
{
"epoch": 2.2054111405835544,
"grad_norm": 0.9219599125064627,
"learning_rate": 9.970065403180648e-06,
"loss": 0.5456,
"num_input_tokens_seen": 2350091328,
"step": 2600,
"train_runtime": 345944.8214,
"train_tokens_per_second": 6793.255
},
{
"epoch": 2.209655172413793,
"grad_norm": 1.0056863158147027,
"learning_rate": 9.871660678792532e-06,
"loss": 0.5573,
"num_input_tokens_seen": 2354507008,
"step": 2605,
"train_runtime": 346586.1214,
"train_tokens_per_second": 6793.426
},
{
"epoch": 2.213899204244032,
"grad_norm": 0.805746716680293,
"learning_rate": 9.77362438598769e-06,
"loss": 0.5316,
"num_input_tokens_seen": 2358953152,
"step": 2610,
"train_runtime": 347186.5132,
"train_tokens_per_second": 6794.484
},
{
"epoch": 2.2181432360742708,
"grad_norm": 1.0149168469719647,
"learning_rate": 9.675958912316091e-06,
"loss": 0.5582,
"num_input_tokens_seen": 2363513408,
"step": 2615,
"train_runtime": 347848.8662,
"train_tokens_per_second": 6794.656
},
{
"epoch": 2.222387267904509,
"grad_norm": 0.9339666893590254,
"learning_rate": 9.578666636296946e-06,
"loss": 0.5468,
"num_input_tokens_seen": 2368091328,
"step": 2620,
"train_runtime": 348511.7283,
"train_tokens_per_second": 6794.868
},
{
"epoch": 2.226631299734748,
"grad_norm": 0.8237811196852415,
"learning_rate": 9.481749927360627e-06,
"loss": 0.5219,
"num_input_tokens_seen": 2372630144,
"step": 2625,
"train_runtime": 349198.393,
"train_tokens_per_second": 6794.505
},
{
"epoch": 2.2308753315649867,
"grad_norm": 0.9312360048745897,
"learning_rate": 9.385211145791126e-06,
"loss": 0.5316,
"num_input_tokens_seen": 2377249792,
"step": 2630,
"train_runtime": 349861.9157,
"train_tokens_per_second": 6794.823
},
{
"epoch": 2.2351193633952255,
"grad_norm": 1.0941579207092957,
"learning_rate": 9.289052642668416e-06,
"loss": 0.5307,
"num_input_tokens_seen": 2381752576,
"step": 2635,
"train_runtime": 350477.7588,
"train_tokens_per_second": 6795.731
},
{
"epoch": 2.2393633952254643,
"grad_norm": 1.054157711767946,
"learning_rate": 9.193276759811339e-06,
"loss": 0.5608,
"num_input_tokens_seen": 2386147072,
"step": 2640,
"train_runtime": 351111.8395,
"train_tokens_per_second": 6795.974
},
{
"epoch": 2.2436074270557027,
"grad_norm": 0.8910120445576349,
"learning_rate": 9.097885829720443e-06,
"loss": 0.5412,
"num_input_tokens_seen": 2390898880,
"step": 2645,
"train_runtime": 351810.8417,
"train_tokens_per_second": 6795.978
},
{
"epoch": 2.2478514588859415,
"grad_norm": 1.2007055994992777,
"learning_rate": 9.002882175521272e-06,
"loss": 0.5399,
"num_input_tokens_seen": 2395503232,
"step": 2650,
"train_runtime": 352435.7792,
"train_tokens_per_second": 6796.992
},
{
"epoch": 2.2520954907161803,
"grad_norm": 1.1196939855484749,
"learning_rate": 8.90826811090775e-06,
"loss": 0.5167,
"num_input_tokens_seen": 2399908928,
"step": 2655,
"train_runtime": 353060.1062,
"train_tokens_per_second": 6797.451
},
{
"epoch": 2.2520954907161803,
"eval_loss": 0.7378480434417725,
"eval_runtime": 1056.8523,
"eval_samples_per_second": 2.883,
"eval_steps_per_second": 0.091,
"num_input_tokens_seen": 2399908928,
"step": 2655
},
{
"epoch": 2.256339522546419,
"grad_norm": 1.2536339396950835,
"learning_rate": 8.814045940085832e-06,
"loss": 0.5282,
"num_input_tokens_seen": 2404340864,
"step": 2660,
"train_runtime": 354734.293,
"train_tokens_per_second": 6777.864
},
{
"epoch": 2.260583554376658,
"grad_norm": 1.0188614881940494,
"learning_rate": 8.720217957717409e-06,
"loss": 0.5746,
"num_input_tokens_seen": 2408905408,
"step": 2665,
"train_runtime": 355379.4662,
"train_tokens_per_second": 6778.403
},
{
"epoch": 2.2648275862068967,
"grad_norm": 0.8497257096281086,
"learning_rate": 8.62678644886439e-06,
"loss": 0.5434,
"num_input_tokens_seen": 2413171200,
"step": 2670,
"train_runtime": 355989.2119,
"train_tokens_per_second": 6778.776
},
{
"epoch": 2.269071618037135,
"grad_norm": 0.9253941979058675,
"learning_rate": 8.533753688933093e-06,
"loss": 0.5716,
"num_input_tokens_seen": 2417806976,
"step": 2675,
"train_runtime": 356663.8352,
"train_tokens_per_second": 6778.952
},
{
"epoch": 2.273315649867374,
"grad_norm": 0.7919650133709085,
"learning_rate": 8.441121943618797e-06,
"loss": 0.5217,
"num_input_tokens_seen": 2422469504,
"step": 2680,
"train_runtime": 357398.8789,
"train_tokens_per_second": 6778.056
},
{
"epoch": 2.2775596816976127,
"grad_norm": 0.9826186100423545,
"learning_rate": 8.34889346885058e-06,
"loss": 0.537,
"num_input_tokens_seen": 2427001408,
"step": 2685,
"train_runtime": 358060.275,
"train_tokens_per_second": 6778.192
},
{
"epoch": 2.2818037135278515,
"grad_norm": 0.9270721696870025,
"learning_rate": 8.257070510736375e-06,
"loss": 0.5473,
"num_input_tokens_seen": 2431585984,
"step": 2690,
"train_runtime": 358720.1033,
"train_tokens_per_second": 6778.505
},
{
"epoch": 2.2860477453580903,
"grad_norm": 1.0463399115668766,
"learning_rate": 8.165655305508283e-06,
"loss": 0.5199,
"num_input_tokens_seen": 2436195008,
"step": 2695,
"train_runtime": 359371.4989,
"train_tokens_per_second": 6779.043
},
{
"epoch": 2.290291777188329,
"grad_norm": 0.8433980949747677,
"learning_rate": 8.074650079468061e-06,
"loss": 0.5406,
"num_input_tokens_seen": 2440643712,
"step": 2700,
"train_runtime": 360023.2487,
"train_tokens_per_second": 6779.128
},
{
"epoch": 2.2945358090185675,
"grad_norm": 0.8344761722848487,
"learning_rate": 7.984057048932994e-06,
"loss": 0.523,
"num_input_tokens_seen": 2445383360,
"step": 2705,
"train_runtime": 360768.2247,
"train_tokens_per_second": 6778.267
},
{
"epoch": 2.2987798408488063,
"grad_norm": 0.9238013027939782,
"learning_rate": 7.893878420181814e-06,
"loss": 0.5394,
"num_input_tokens_seen": 2449944832,
"step": 2710,
"train_runtime": 361432.627,
"train_tokens_per_second": 6778.427
},
{
"epoch": 2.303023872679045,
"grad_norm": 0.830003880013244,
"learning_rate": 7.80411638940107e-06,
"loss": 0.5329,
"num_input_tokens_seen": 2454313856,
"step": 2715,
"train_runtime": 362054.2575,
"train_tokens_per_second": 6778.856
},
{
"epoch": 2.307267904509284,
"grad_norm": 1.2592294301355798,
"learning_rate": 7.714773142631553e-06,
"loss": 0.5287,
"num_input_tokens_seen": 2458717888,
"step": 2720,
"train_runtime": 362669.4395,
"train_tokens_per_second": 6779.501
},
{
"epoch": 2.3115119363395227,
"grad_norm": 1.04619682795218,
"learning_rate": 7.625850855715125e-06,
"loss": 0.5494,
"num_input_tokens_seen": 2463239104,
"step": 2725,
"train_runtime": 363325.7794,
"train_tokens_per_second": 6779.698
},
{
"epoch": 2.3157559681697615,
"grad_norm": 0.9229168180828327,
"learning_rate": 7.53735169424169e-06,
"loss": 0.5247,
"num_input_tokens_seen": 2467903744,
"step": 2730,
"train_runtime": 364007.0501,
"train_tokens_per_second": 6779.824
},
{
"epoch": 2.32,
"grad_norm": 0.9124446948723857,
"learning_rate": 7.449277813496469e-06,
"loss": 0.5264,
"num_input_tokens_seen": 2472315328,
"step": 2735,
"train_runtime": 364630.5034,
"train_tokens_per_second": 6780.331
},
{
"epoch": 2.3242440318302386,
"grad_norm": 0.9029907706736199,
"learning_rate": 7.361631358407511e-06,
"loss": 0.5462,
"num_input_tokens_seen": 2476917568,
"step": 2740,
"train_runtime": 365293.8544,
"train_tokens_per_second": 6780.617
},
{
"epoch": 2.3284880636604774,
"grad_norm": 0.965769243054048,
"learning_rate": 7.274414463493457e-06,
"loss": 0.5276,
"num_input_tokens_seen": 2481395968,
"step": 2745,
"train_runtime": 365941.4705,
"train_tokens_per_second": 6780.855
},
{
"epoch": 2.3327320954907163,
"grad_norm": 1.0526371028734811,
"learning_rate": 7.1876292528115425e-06,
"loss": 0.524,
"num_input_tokens_seen": 2485823424,
"step": 2750,
"train_runtime": 366578.7434,
"train_tokens_per_second": 6781.144
},
{
"epoch": 2.336976127320955,
"grad_norm": 1.029915538876609,
"learning_rate": 7.101277839905887e-06,
"loss": 0.5337,
"num_input_tokens_seen": 2490461696,
"step": 2755,
"train_runtime": 367244.9697,
"train_tokens_per_second": 6781.473
},
{
"epoch": 2.3412201591511934,
"grad_norm": 0.8452644471628386,
"learning_rate": 7.015362327756009e-06,
"loss": 0.5565,
"num_input_tokens_seen": 2494895104,
"step": 2760,
"train_runtime": 367940.8003,
"train_tokens_per_second": 6780.697
},
{
"epoch": 2.345464190981432,
"grad_norm": 1.0284851305831375,
"learning_rate": 6.92988480872562e-06,
"loss": 0.5551,
"num_input_tokens_seen": 2499453376,
"step": 2765,
"train_runtime": 368624.734,
"train_tokens_per_second": 6780.482
},
{
"epoch": 2.349708222811671,
"grad_norm": 0.9746799945929299,
"learning_rate": 6.844847364511667e-06,
"loss": 0.5652,
"num_input_tokens_seen": 2503898176,
"step": 2770,
"train_runtime": 369245.4874,
"train_tokens_per_second": 6781.121
},
{
"epoch": 2.35395225464191,
"grad_norm": 1.0403632612371465,
"learning_rate": 6.760252066093598e-06,
"loss": 0.536,
"num_input_tokens_seen": 2508404032,
"step": 2775,
"train_runtime": 369898.6243,
"train_tokens_per_second": 6781.328
},
{
"epoch": 2.3581962864721486,
"grad_norm": 1.1324249124610484,
"learning_rate": 6.676100973683019e-06,
"loss": 0.5293,
"num_input_tokens_seen": 2512983360,
"step": 2780,
"train_runtime": 370586.7059,
"train_tokens_per_second": 6781.094
},
{
"epoch": 2.3624403183023874,
"grad_norm": 0.9136829751433924,
"learning_rate": 6.592396136673396e-06,
"loss": 0.5133,
"num_input_tokens_seen": 2517545792,
"step": 2785,
"train_runtime": 371255.3251,
"train_tokens_per_second": 6781.171
},
{
"epoch": 2.3666843501326262,
"grad_norm": 1.0337343562459835,
"learning_rate": 6.509139593590263e-06,
"loss": 0.5449,
"num_input_tokens_seen": 2521995456,
"step": 2790,
"train_runtime": 371858.1677,
"train_tokens_per_second": 6782.144
},
{
"epoch": 2.3709283819628646,
"grad_norm": 0.8876485451947176,
"learning_rate": 6.426333372041482e-06,
"loss": 0.5321,
"num_input_tokens_seen": 2526578304,
"step": 2795,
"train_runtime": 372512.2967,
"train_tokens_per_second": 6782.537
},
{
"epoch": 2.3751724137931034,
"grad_norm": 1.01633470136039,
"learning_rate": 6.343979488667923e-06,
"loss": 0.546,
"num_input_tokens_seen": 2531181120,
"step": 2800,
"train_runtime": 373190.8361,
"train_tokens_per_second": 6782.538
},
{
"epoch": 2.379416445623342,
"grad_norm": 1.17898509851511,
"learning_rate": 6.2620799490943296e-06,
"loss": 0.5339,
"num_input_tokens_seen": 2535604096,
"step": 2805,
"train_runtime": 373804.9218,
"train_tokens_per_second": 6783.228
},
{
"epoch": 2.383660477453581,
"grad_norm": 0.995446476535709,
"learning_rate": 6.18063674788047e-06,
"loss": 0.5294,
"num_input_tokens_seen": 2539962496,
"step": 2810,
"train_runtime": 374428.8855,
"train_tokens_per_second": 6783.565
},
{
"epoch": 2.38790450928382,
"grad_norm": 1.4781781798293818,
"learning_rate": 6.099651868472578e-06,
"loss": 0.5377,
"num_input_tokens_seen": 2544523264,
"step": 2815,
"train_runtime": 375082.1614,
"train_tokens_per_second": 6783.909
},
{
"epoch": 2.392148541114058,
"grad_norm": 0.9042911431359419,
"learning_rate": 6.0191272831550296e-06,
"loss": 0.5277,
"num_input_tokens_seen": 2549001728,
"step": 2820,
"train_runtime": 375739.4269,
"train_tokens_per_second": 6783.961
},
{
"epoch": 2.396392572944297,
"grad_norm": 0.9427728279286258,
"learning_rate": 5.939064953002324e-06,
"loss": 0.5286,
"num_input_tokens_seen": 2553568448,
"step": 2825,
"train_runtime": 376350.7934,
"train_tokens_per_second": 6785.075
},
{
"epoch": 2.4006366047745358,
"grad_norm": 1.140690055341589,
"learning_rate": 5.859466827831325e-06,
"loss": 0.5404,
"num_input_tokens_seen": 2557913088,
"step": 2830,
"train_runtime": 376975.3041,
"train_tokens_per_second": 6785.36
},
{
"epoch": 2.4048806366047746,
"grad_norm": 0.973992690753586,
"learning_rate": 5.780334846153762e-06,
"loss": 0.5361,
"num_input_tokens_seen": 2562388224,
"step": 2835,
"train_runtime": 377604.1066,
"train_tokens_per_second": 6785.912
},
{
"epoch": 2.4091246684350134,
"grad_norm": 0.8947292451598252,
"learning_rate": 5.701670935129033e-06,
"loss": 0.5458,
"num_input_tokens_seen": 2566790976,
"step": 2840,
"train_runtime": 378241.5611,
"train_tokens_per_second": 6786.116
},
{
"epoch": 2.413368700265252,
"grad_norm": 0.9334940824258273,
"learning_rate": 5.623477010517269e-06,
"loss": 0.5225,
"num_input_tokens_seen": 2571270592,
"step": 2845,
"train_runtime": 378873.6154,
"train_tokens_per_second": 6786.618
},
{
"epoch": 2.4176127320954905,
"grad_norm": 0.7746666166786925,
"learning_rate": 5.545754976632672e-06,
"loss": 0.534,
"num_input_tokens_seen": 2575889152,
"step": 2850,
"train_runtime": 379547.952,
"train_tokens_per_second": 6786.729
},
{
"epoch": 2.4218567639257294,
"grad_norm": 1.0182336381313688,
"learning_rate": 5.468506726297149e-06,
"loss": 0.5221,
"num_input_tokens_seen": 2580260608,
"step": 2855,
"train_runtime": 380137.2732,
"train_tokens_per_second": 6787.707
},
{
"epoch": 2.426100795755968,
"grad_norm": 0.9862563470365936,
"learning_rate": 5.391734140794183e-06,
"loss": 0.5398,
"num_input_tokens_seen": 2585142720,
"step": 2860,
"train_runtime": 380839.0154,
"train_tokens_per_second": 6788.02
},
{
"epoch": 2.430344827586207,
"grad_norm": 0.9041618526091733,
"learning_rate": 5.3154390898230846e-06,
"loss": 0.512,
"num_input_tokens_seen": 2589755136,
"step": 2865,
"train_runtime": 381496.5415,
"train_tokens_per_second": 6788.411
},
{
"epoch": 2.4345888594164458,
"grad_norm": 0.914359636860779,
"learning_rate": 5.2396234314533665e-06,
"loss": 0.5143,
"num_input_tokens_seen": 2594081792,
"step": 2870,
"train_runtime": 382111.5001,
"train_tokens_per_second": 6788.808
},
{
"epoch": 2.438832891246684,
"grad_norm": 1.033477002199706,
"learning_rate": 5.16428901207959e-06,
"loss": 0.5324,
"num_input_tokens_seen": 2598687872,
"step": 2875,
"train_runtime": 382756.5754,
"train_tokens_per_second": 6789.401
},
{
"epoch": 2.443076923076923,
"grad_norm": 0.9057174065952954,
"learning_rate": 5.089437666376304e-06,
"loss": 0.5263,
"num_input_tokens_seen": 2603366144,
"step": 2880,
"train_runtime": 383439.9985,
"train_tokens_per_second": 6789.501
},
{
"epoch": 2.4473209549071617,
"grad_norm": 0.8529850767813882,
"learning_rate": 5.015071217253428e-06,
"loss": 0.5113,
"num_input_tokens_seen": 2608005440,
"step": 2885,
"train_runtime": 384090.9187,
"train_tokens_per_second": 6790.073
},
{
"epoch": 2.4515649867374005,
"grad_norm": 1.3879570401538315,
"learning_rate": 4.941191475811843e-06,
"loss": 0.5222,
"num_input_tokens_seen": 2612660480,
"step": 2890,
"train_runtime": 384747.1837,
"train_tokens_per_second": 6790.59
},
{
"epoch": 2.4558090185676393,
"grad_norm": 1.2654753413045388,
"learning_rate": 4.867800241299275e-06,
"loss": 0.5128,
"num_input_tokens_seen": 2617068928,
"step": 2895,
"train_runtime": 385371.9974,
"train_tokens_per_second": 6791.02
},
{
"epoch": 2.460053050397878,
"grad_norm": 0.9035128336837777,
"learning_rate": 4.794899301066477e-06,
"loss": 0.529,
"num_input_tokens_seen": 2621648384,
"step": 2900,
"train_runtime": 386085.7096,
"train_tokens_per_second": 6790.327
},
{
"epoch": 2.464297082228117,
"grad_norm": 0.8567446173412482,
"learning_rate": 4.72249043052371e-06,
"loss": 0.5269,
"num_input_tokens_seen": 2626115776,
"step": 2905,
"train_runtime": 386725.7007,
"train_tokens_per_second": 6790.642
},
{
"epoch": 2.4685411140583553,
"grad_norm": 0.8912412665948739,
"learning_rate": 4.650575393097498e-06,
"loss": 0.5387,
"num_input_tokens_seen": 2630631040,
"step": 2910,
"train_runtime": 387368.1112,
"train_tokens_per_second": 6791.037
},
{
"epoch": 2.472785145888594,
"grad_norm": 1.1130459835714437,
"learning_rate": 4.57915594018768e-06,
"loss": 0.5337,
"num_input_tokens_seen": 2635143296,
"step": 2915,
"train_runtime": 388043.1985,
"train_tokens_per_second": 6790.85
},
{
"epoch": 2.477029177718833,
"grad_norm": 0.9430504436597011,
"learning_rate": 4.508233811124765e-06,
"loss": 0.5261,
"num_input_tokens_seen": 2639559808,
"step": 2920,
"train_runtime": 388662.674,
"train_tokens_per_second": 6791.39
},
{
"epoch": 2.4812732095490717,
"grad_norm": 0.9383789050215547,
"learning_rate": 4.437810733127571e-06,
"loss": 0.5464,
"num_input_tokens_seen": 2644172096,
"step": 2925,
"train_runtime": 389299.3342,
"train_tokens_per_second": 6792.131
},
{
"epoch": 2.4855172413793105,
"grad_norm": 0.9063379227339012,
"learning_rate": 4.367888421261154e-06,
"loss": 0.509,
"num_input_tokens_seen": 2648731712,
"step": 2930,
"train_runtime": 389972.3983,
"train_tokens_per_second": 6792.1
},
{
"epoch": 2.489761273209549,
"grad_norm": 0.8758741295078302,
"learning_rate": 4.298468578395029e-06,
"loss": 0.5072,
"num_input_tokens_seen": 2653268800,
"step": 2935,
"train_runtime": 390605.8102,
"train_tokens_per_second": 6792.702
},
{
"epoch": 2.4940053050397877,
"grad_norm": 1.0498774972803595,
"learning_rate": 4.229552895161754e-06,
"loss": 0.5399,
"num_input_tokens_seen": 2657767616,
"step": 2940,
"train_runtime": 391245.0688,
"train_tokens_per_second": 6793.102
},
{
"epoch": 2.4982493368700265,
"grad_norm": 0.9576568252690971,
"learning_rate": 4.161143049915661e-06,
"loss": 0.5167,
"num_input_tokens_seen": 2662204992,
"step": 2945,
"train_runtime": 391840.1072,
"train_tokens_per_second": 6794.111
},
{
"epoch": 2.5024933687002653,
"grad_norm": 1.297030102246004,
"learning_rate": 4.093240708692098e-06,
"loss": 0.5434,
"num_input_tokens_seen": 2666829440,
"step": 2950,
"train_runtime": 392486.1885,
"train_tokens_per_second": 6794.709
},
{
"epoch": 2.5024933687002653,
"eval_loss": 0.7277879118919373,
"eval_runtime": 1062.7442,
"eval_samples_per_second": 2.867,
"eval_steps_per_second": 0.09,
"num_input_tokens_seen": 2666829440,
"step": 2950
},
{
"epoch": 2.506737400530504,
"grad_norm": 0.9337370037141564,
"learning_rate": 4.025847525166737e-06,
"loss": 0.521,
"num_input_tokens_seen": 2671303040,
"step": 2955,
"train_runtime": 394192.7243,
"train_tokens_per_second": 6776.642
},
{
"epoch": 2.510981432360743,
"grad_norm": 1.0142479442870918,
"learning_rate": 3.958965140615395e-06,
"loss": 0.5013,
"num_input_tokens_seen": 2675854528,
"step": 2960,
"train_runtime": 394856.393,
"train_tokens_per_second": 6776.779
},
{
"epoch": 2.5152254641909817,
"grad_norm": 0.9673554697201667,
"learning_rate": 3.892595183874015e-06,
"loss": 0.5138,
"num_input_tokens_seen": 2680310976,
"step": 2965,
"train_runtime": 395478.0069,
"train_tokens_per_second": 6777.396
},
{
"epoch": 2.51946949602122,
"grad_norm": 0.9773791433571442,
"learning_rate": 3.826739271299004e-06,
"loss": 0.5374,
"num_input_tokens_seen": 2685063360,
"step": 2970,
"train_runtime": 396163.4888,
"train_tokens_per_second": 6777.665
},
{
"epoch": 2.523713527851459,
"grad_norm": 0.8762686166416458,
"learning_rate": 3.761399006727878e-06,
"loss": 0.518,
"num_input_tokens_seen": 2689742912,
"step": 2975,
"train_runtime": 396848.2908,
"train_tokens_per_second": 6777.761
},
{
"epoch": 2.5279575596816977,
"grad_norm": 0.9814246027282987,
"learning_rate": 3.696575981440198e-06,
"loss": 0.5333,
"num_input_tokens_seen": 2694003776,
"step": 2980,
"train_runtime": 397469.5249,
"train_tokens_per_second": 6777.888
},
{
"epoch": 2.5322015915119365,
"grad_norm": 1.0494687013783062,
"learning_rate": 3.632271774118812e-06,
"loss": 0.5249,
"num_input_tokens_seen": 2698482432,
"step": 2985,
"train_runtime": 398112.2116,
"train_tokens_per_second": 6778.196
},
{
"epoch": 2.536445623342175,
"grad_norm": 0.9167136316488117,
"learning_rate": 3.568487950811414e-06,
"loss": 0.5179,
"num_input_tokens_seen": 2703208384,
"step": 2990,
"train_runtime": 398770.7605,
"train_tokens_per_second": 6778.853
},
{
"epoch": 2.5406896551724136,
"grad_norm": 1.0631373570087592,
"learning_rate": 3.5052260648924056e-06,
"loss": 0.5258,
"num_input_tokens_seen": 2707736448,
"step": 2995,
"train_runtime": 399417.3719,
"train_tokens_per_second": 6779.216
},
{
"epoch": 2.5449336870026524,
"grad_norm": 0.8646476638396003,
"learning_rate": 3.442487657025059e-06,
"loss": 0.5148,
"num_input_tokens_seen": 2712052544,
"step": 3000,
"train_runtime": 400011.3913,
"train_tokens_per_second": 6779.938
},
{
"epoch": 2.5491777188328912,
"grad_norm": 1.3814464306803582,
"learning_rate": 3.380274255124008e-06,
"loss": 0.5328,
"num_input_tokens_seen": 2716590016,
"step": 3005,
"train_runtime": 400647.6096,
"train_tokens_per_second": 6780.497
},
{
"epoch": 2.55342175066313,
"grad_norm": 0.9608139693650856,
"learning_rate": 3.318587374318008e-06,
"loss": 0.5221,
"num_input_tokens_seen": 2721057600,
"step": 3010,
"train_runtime": 401293.542,
"train_tokens_per_second": 6780.716
},
{
"epoch": 2.557665782493369,
"grad_norm": 1.052375691071485,
"learning_rate": 3.257428516913094e-06,
"loss": 0.5356,
"num_input_tokens_seen": 2725470592,
"step": 3015,
"train_runtime": 401915.8909,
"train_tokens_per_second": 6781.196
},
{
"epoch": 2.5619098143236076,
"grad_norm": 1.0422726914250204,
"learning_rate": 3.1967991723559186e-06,
"loss": 0.5357,
"num_input_tokens_seen": 2729976320,
"step": 3020,
"train_runtime": 402550.4344,
"train_tokens_per_second": 6781.7
},
{
"epoch": 2.566153846153846,
"grad_norm": 1.0115755929526138,
"learning_rate": 3.1367008171975606e-06,
"loss": 0.5292,
"num_input_tokens_seen": 2734356096,
"step": 3025,
"train_runtime": 403147.2983,
"train_tokens_per_second": 6782.524
},
{
"epoch": 2.570397877984085,
"grad_norm": 0.9326348646064712,
"learning_rate": 3.0771349150574833e-06,
"loss": 0.5032,
"num_input_tokens_seen": 2738757568,
"step": 3030,
"train_runtime": 403750.7771,
"train_tokens_per_second": 6783.287
},
{
"epoch": 2.5746419098143236,
"grad_norm": 0.8962193098952964,
"learning_rate": 3.0181029165879505e-06,
"loss": 0.5235,
"num_input_tokens_seen": 2743195520,
"step": 3035,
"train_runtime": 404385.6448,
"train_tokens_per_second": 6783.613
},
{
"epoch": 2.5788859416445624,
"grad_norm": 0.9643001215470395,
"learning_rate": 2.959606259438677e-06,
"loss": 0.5371,
"num_input_tokens_seen": 2747720128,
"step": 3040,
"train_runtime": 405024.1548,
"train_tokens_per_second": 6784.09
},
{
"epoch": 2.583129973474801,
"grad_norm": 1.083989853159668,
"learning_rate": 2.9016463682218137e-06,
"loss": 0.5045,
"num_input_tokens_seen": 2752185344,
"step": 3045,
"train_runtime": 405663.7384,
"train_tokens_per_second": 6784.401
},
{
"epoch": 2.5873740053050396,
"grad_norm": 1.0350757402442816,
"learning_rate": 2.844224654477251e-06,
"loss": 0.535,
"num_input_tokens_seen": 2756955712,
"step": 3050,
"train_runtime": 406343.9367,
"train_tokens_per_second": 6784.784
},
{
"epoch": 2.5916180371352784,
"grad_norm": 1.0876688931917096,
"learning_rate": 2.787342516638253e-06,
"loss": 0.5241,
"num_input_tokens_seen": 2761469632,
"step": 3055,
"train_runtime": 406991.7195,
"train_tokens_per_second": 6785.076
},
{
"epoch": 2.595862068965517,
"grad_norm": 1.187734783928453,
"learning_rate": 2.7310013399973937e-06,
"loss": 0.5083,
"num_input_tokens_seen": 2766078848,
"step": 3060,
"train_runtime": 407630.5997,
"train_tokens_per_second": 6785.749
},
{
"epoch": 2.600106100795756,
"grad_norm": 0.8033965202229059,
"learning_rate": 2.6752024966728186e-06,
"loss": 0.5078,
"num_input_tokens_seen": 2770419968,
"step": 3065,
"train_runtime": 408227.7584,
"train_tokens_per_second": 6786.457
},
{
"epoch": 2.604350132625995,
"grad_norm": 1.0703320175428703,
"learning_rate": 2.6199473455748302e-06,
"loss": 0.523,
"num_input_tokens_seen": 2774944704,
"step": 3070,
"train_runtime": 408874.2335,
"train_tokens_per_second": 6786.793
},
{
"epoch": 2.6085941644562336,
"grad_norm": 0.8853137013274479,
"learning_rate": 2.5652372323727995e-06,
"loss": 0.5344,
"num_input_tokens_seen": 2779491968,
"step": 3075,
"train_runtime": 409530.6069,
"train_tokens_per_second": 6787.019
},
{
"epoch": 2.6128381962864724,
"grad_norm": 0.9523960118476597,
"learning_rate": 2.5110734894623845e-06,
"loss": 0.5238,
"num_input_tokens_seen": 2784101184,
"step": 3080,
"train_runtime": 410217.1802,
"train_tokens_per_second": 6786.896
},
{
"epoch": 2.6170822281167108,
"grad_norm": 0.9141894377610371,
"learning_rate": 2.457457435933083e-06,
"loss": 0.5072,
"num_input_tokens_seen": 2788760320,
"step": 3085,
"train_runtime": 410902.2723,
"train_tokens_per_second": 6786.919
},
{
"epoch": 2.6213262599469496,
"grad_norm": 0.9601107951690412,
"learning_rate": 2.404390377536117e-06,
"loss": 0.5461,
"num_input_tokens_seen": 2793482240,
"step": 3090,
"train_runtime": 411591.1251,
"train_tokens_per_second": 6787.032
},
{
"epoch": 2.6255702917771884,
"grad_norm": 1.092143809724588,
"learning_rate": 2.3518736066526106e-06,
"loss": 0.5355,
"num_input_tokens_seen": 2798052800,
"step": 3095,
"train_runtime": 412253.9967,
"train_tokens_per_second": 6787.206
},
{
"epoch": 2.629814323607427,
"grad_norm": 0.9775173060346775,
"learning_rate": 2.2999084022621575e-06,
"loss": 0.5187,
"num_input_tokens_seen": 2802623616,
"step": 3100,
"train_runtime": 412972.0833,
"train_tokens_per_second": 6786.472
},
{
"epoch": 2.6340583554376655,
"grad_norm": 0.8882149505754918,
"learning_rate": 2.2484960299116176e-06,
"loss": 0.5234,
"num_input_tokens_seen": 2807059072,
"step": 3105,
"train_runtime": 413616.3561,
"train_tokens_per_second": 6786.625
},
{
"epoch": 2.6383023872679043,
"grad_norm": 1.04737202246014,
"learning_rate": 2.1976377416843496e-06,
"loss": 0.5337,
"num_input_tokens_seen": 2811685952,
"step": 3110,
"train_runtime": 414302.3818,
"train_tokens_per_second": 6786.555
},
{
"epoch": 2.642546419098143,
"grad_norm": 0.8525650275605402,
"learning_rate": 2.1473347761696765e-06,
"loss": 0.5122,
"num_input_tokens_seen": 2816097472,
"step": 3115,
"train_runtime": 414912.2037,
"train_tokens_per_second": 6787.213
},
{
"epoch": 2.646790450928382,
"grad_norm": 1.4897884354340427,
"learning_rate": 2.097588358432745e-06,
"loss": 0.5344,
"num_input_tokens_seen": 2820541952,
"step": 3120,
"train_runtime": 415551.6146,
"train_tokens_per_second": 6787.465
},
{
"epoch": 2.6510344827586207,
"grad_norm": 0.9315543634592929,
"learning_rate": 2.048399699984685e-06,
"loss": 0.5204,
"num_input_tokens_seen": 2825211008,
"step": 3125,
"train_runtime": 416236.0998,
"train_tokens_per_second": 6787.52
},
{
"epoch": 2.6552785145888596,
"grad_norm": 0.994893210880857,
"learning_rate": 1.999769998753101e-06,
"loss": 0.5092,
"num_input_tokens_seen": 2829805440,
"step": 3130,
"train_runtime": 416937.1267,
"train_tokens_per_second": 6787.128
},
{
"epoch": 2.6595225464190984,
"grad_norm": 0.9211547478732683,
"learning_rate": 1.951700439052906e-06,
"loss": 0.509,
"num_input_tokens_seen": 2834319168,
"step": 3135,
"train_runtime": 417580.9275,
"train_tokens_per_second": 6787.473
},
{
"epoch": 2.6637665782493367,
"grad_norm": 1.1092045966595683,
"learning_rate": 1.9041921915574718e-06,
"loss": 0.5213,
"num_input_tokens_seen": 2838842944,
"step": 3140,
"train_runtime": 418214.8375,
"train_tokens_per_second": 6788.001
},
{
"epoch": 2.6680106100795755,
"grad_norm": 0.8680731962765761,
"learning_rate": 1.857246413270114e-06,
"loss": 0.5303,
"num_input_tokens_seen": 2843287936,
"step": 3145,
"train_runtime": 418837.8662,
"train_tokens_per_second": 6788.517
},
{
"epoch": 2.6722546419098143,
"grad_norm": 1.370277456535701,
"learning_rate": 1.810864247495933e-06,
"loss": 0.5351,
"num_input_tokens_seen": 2847823872,
"step": 3150,
"train_runtime": 419460.3925,
"train_tokens_per_second": 6789.256
},
{
"epoch": 2.676498673740053,
"grad_norm": 1.0792319656449618,
"learning_rate": 1.7650468238139484e-06,
"loss": 0.5114,
"num_input_tokens_seen": 2852148544,
"step": 3155,
"train_runtime": 420051.6697,
"train_tokens_per_second": 6789.995
},
{
"epoch": 2.680742705570292,
"grad_norm": 0.8781496188483874,
"learning_rate": 1.7197952580496086e-06,
"loss": 0.5126,
"num_input_tokens_seen": 2856524608,
"step": 3160,
"train_runtime": 420644.3052,
"train_tokens_per_second": 6790.832
},
{
"epoch": 2.6849867374005303,
"grad_norm": 0.9491071552727024,
"learning_rate": 1.6751106522476078e-06,
"loss": 0.5242,
"num_input_tokens_seen": 2861096576,
"step": 3165,
"train_runtime": 421277.8845,
"train_tokens_per_second": 6791.471
},
{
"epoch": 2.689230769230769,
"grad_norm": 0.9895886874283716,
"learning_rate": 1.6309940946450276e-06,
"loss": 0.5422,
"num_input_tokens_seen": 2865604864,
"step": 3170,
"train_runtime": 421930.8634,
"train_tokens_per_second": 6791.646
},
{
"epoch": 2.693474801061008,
"grad_norm": 0.9535636472034763,
"learning_rate": 1.5874466596448894e-06,
"loss": 0.5138,
"num_input_tokens_seen": 2870295232,
"step": 3175,
"train_runtime": 422607.9976,
"train_tokens_per_second": 6791.862
},
{
"epoch": 2.6977188328912467,
"grad_norm": 1.2371458441648222,
"learning_rate": 1.5444694077899112e-06,
"loss": 0.5143,
"num_input_tokens_seen": 2874723904,
"step": 3180,
"train_runtime": 423241.8919,
"train_tokens_per_second": 6792.154
},
{
"epoch": 2.7019628647214855,
"grad_norm": 1.0270055431553666,
"learning_rate": 1.5020633857367629e-06,
"loss": 0.5097,
"num_input_tokens_seen": 2879036160,
"step": 3185,
"train_runtime": 423847.9941,
"train_tokens_per_second": 6792.615
},
{
"epoch": 2.7062068965517243,
"grad_norm": 1.0047782227717184,
"learning_rate": 1.4602296262304998e-06,
"loss": 0.531,
"num_input_tokens_seen": 2883503104,
"step": 3190,
"train_runtime": 424512.874,
"train_tokens_per_second": 6792.499
},
{
"epoch": 2.710450928381963,
"grad_norm": 0.7991655241640006,
"learning_rate": 1.4189691480794659e-06,
"loss": 0.5031,
"num_input_tokens_seen": 2888068736,
"step": 3195,
"train_runtime": 425152.1322,
"train_tokens_per_second": 6793.024
},
{
"epoch": 2.7146949602122015,
"grad_norm": 0.9290447093133773,
"learning_rate": 1.3782829561304528e-06,
"loss": 0.5214,
"num_input_tokens_seen": 2892383104,
"step": 3200,
"train_runtime": 425736.2555,
"train_tokens_per_second": 6793.838
},
{
"epoch": 2.7189389920424403,
"grad_norm": 0.9336749097922185,
"learning_rate": 1.3381720412442484e-06,
"loss": 0.5046,
"num_input_tokens_seen": 2896798592,
"step": 3205,
"train_runtime": 426372.4313,
"train_tokens_per_second": 6794.057
},
{
"epoch": 2.723183023872679,
"grad_norm": 0.923464487407943,
"learning_rate": 1.2986373802714806e-06,
"loss": 0.519,
"num_input_tokens_seen": 2901355840,
"step": 3210,
"train_runtime": 427007.4668,
"train_tokens_per_second": 6794.626
},
{
"epoch": 2.727427055702918,
"grad_norm": 1.054195900551347,
"learning_rate": 1.259679936028857e-06,
"loss": 0.5278,
"num_input_tokens_seen": 2906018048,
"step": 3215,
"train_runtime": 427698.5588,
"train_tokens_per_second": 6794.547
},
{
"epoch": 2.7316710875331562,
"grad_norm": 1.053479012440482,
"learning_rate": 1.2213006572756868e-06,
"loss": 0.5052,
"num_input_tokens_seen": 2910317120,
"step": 3220,
"train_runtime": 428289.4292,
"train_tokens_per_second": 6795.211
},
{
"epoch": 2.735915119363395,
"grad_norm": 0.9664035844106731,
"learning_rate": 1.1835004786907994e-06,
"loss": 0.5285,
"num_input_tokens_seen": 2914766912,
"step": 3225,
"train_runtime": 428926.8091,
"train_tokens_per_second": 6795.488
},
{
"epoch": 2.740159151193634,
"grad_norm": 0.9743299232561019,
"learning_rate": 1.1462803208497658e-06,
"loss": 0.5349,
"num_input_tokens_seen": 2919116864,
"step": 3230,
"train_runtime": 429579.5988,
"train_tokens_per_second": 6795.287
},
{
"epoch": 2.7444031830238726,
"grad_norm": 0.9853416108002045,
"learning_rate": 1.1096410902024874e-06,
"loss": 0.5281,
"num_input_tokens_seen": 2923624768,
"step": 3235,
"train_runtime": 430191.282,
"train_tokens_per_second": 6796.104
},
{
"epoch": 2.7486472148541115,
"grad_norm": 0.9605567975250936,
"learning_rate": 1.073583679051124e-06,
"loss": 0.5301,
"num_input_tokens_seen": 2928050752,
"step": 3240,
"train_runtime": 430824.3873,
"train_tokens_per_second": 6796.39
},
{
"epoch": 2.7528912466843503,
"grad_norm": 1.1024226803397792,
"learning_rate": 1.0381089655283394e-06,
"loss": 0.5316,
"num_input_tokens_seen": 2932468416,
"step": 3245,
"train_runtime": 431459.9194,
"train_tokens_per_second": 6796.618
},
{
"epoch": 2.7528912466843503,
"eval_loss": 0.7258533835411072,
"eval_runtime": 1061.3977,
"eval_samples_per_second": 2.871,
"eval_steps_per_second": 0.09,
"num_input_tokens_seen": 2932468416,
"step": 3245
},
{
"epoch": 2.757135278514589,
"grad_norm": 1.0283547426763588,
"learning_rate": 1.0032178135759546e-06,
"loss": 0.5119,
"num_input_tokens_seen": 2936775872,
"step": 3250,
"train_runtime": 433099.9457,
"train_tokens_per_second": 6780.827
},
{
"epoch": 2.7613793103448274,
"grad_norm": 0.9455593371411998,
"learning_rate": 9.68911072923867e-07,
"loss": 0.5204,
"num_input_tokens_seen": 2941314240,
"step": 3255,
"train_runtime": 433763.0738,
"train_tokens_per_second": 6780.924
},
{
"epoch": 2.7656233421750662,
"grad_norm": 0.9779382933428667,
"learning_rate": 9.351895790693955e-07,
"loss": 0.5273,
"num_input_tokens_seen": 2945907136,
"step": 3260,
"train_runtime": 434418.6226,
"train_tokens_per_second": 6781.263
},
{
"epoch": 2.769867374005305,
"grad_norm": 0.9433322444237364,
"learning_rate": 9.020541532568899e-07,
"loss": 0.5289,
"num_input_tokens_seen": 2950538496,
"step": 3265,
"train_runtime": 435105.6078,
"train_tokens_per_second": 6781.201
},
{
"epoch": 2.774111405835544,
"grad_norm": 1.0359211392265961,
"learning_rate": 8.695056024577792e-07,
"loss": 0.5082,
"num_input_tokens_seen": 2955022208,
"step": 3270,
"train_runtime": 435708.6215,
"train_tokens_per_second": 6782.106
},
{
"epoch": 2.7783554376657826,
"grad_norm": 0.8927269780868947,
"learning_rate": 8.375447193508662e-07,
"loss": 0.5196,
"num_input_tokens_seen": 2959619008,
"step": 3275,
"train_runtime": 436404.7763,
"train_tokens_per_second": 6781.821
},
{
"epoch": 2.782599469496021,
"grad_norm": 0.994250431687146,
"learning_rate": 8.061722823030693e-07,
"loss": 0.5294,
"num_input_tokens_seen": 2964249856,
"step": 3280,
"train_runtime": 437073.9242,
"train_tokens_per_second": 6782.033
},
{
"epoch": 2.78684350132626,
"grad_norm": 0.8514551639091007,
"learning_rate": 7.753890553504422e-07,
"loss": 0.5073,
"num_input_tokens_seen": 2968889344,
"step": 3285,
"train_runtime": 437728.8681,
"train_tokens_per_second": 6782.485
},
{
"epoch": 2.7910875331564986,
"grad_norm": 1.0389333757881603,
"learning_rate": 7.451957881795673e-07,
"loss": 0.532,
"num_input_tokens_seen": 2973153152,
"step": 3290,
"train_runtime": 438317.3334,
"train_tokens_per_second": 6783.106
},
{
"epoch": 2.7953315649867374,
"grad_norm": 0.8808663623494958,
"learning_rate": 7.155932161093032e-07,
"loss": 0.5312,
"num_input_tokens_seen": 2977666176,
"step": 3295,
"train_runtime": 438968.9598,
"train_tokens_per_second": 6783.318
},
{
"epoch": 2.799575596816976,
"grad_norm": 1.0049671673931104,
"learning_rate": 6.865820600728823e-07,
"loss": 0.5352,
"num_input_tokens_seen": 2982187392,
"step": 3300,
"train_runtime": 439614.2835,
"train_tokens_per_second": 6783.645
},
{
"epoch": 2.803819628647215,
"grad_norm": 0.9855264400649804,
"learning_rate": 6.581630266003419e-07,
"loss": 0.5523,
"num_input_tokens_seen": 2986898304,
"step": 3305,
"train_runtime": 440321.2266,
"train_tokens_per_second": 6783.453
},
{
"epoch": 2.808063660477454,
"grad_norm": 1.0813234011741917,
"learning_rate": 6.303368078013183e-07,
"loss": 0.5116,
"num_input_tokens_seen": 2991295104,
"step": 3310,
"train_runtime": 440944.6485,
"train_tokens_per_second": 6783.834
},
{
"epoch": 2.812307692307692,
"grad_norm": 0.9341722231468776,
"learning_rate": 6.031040813482047e-07,
"loss": 0.5215,
"num_input_tokens_seen": 2995983808,
"step": 3315,
"train_runtime": 441625.9103,
"train_tokens_per_second": 6783.986
},
{
"epoch": 2.816551724137931,
"grad_norm": 0.8962570756580445,
"learning_rate": 5.764655104596311e-07,
"loss": 0.536,
"num_input_tokens_seen": 3000598080,
"step": 3320,
"train_runtime": 442297.9755,
"train_tokens_per_second": 6784.11
},
{
"epoch": 2.82079575596817,
"grad_norm": 0.884157880198946,
"learning_rate": 5.504217438843301e-07,
"loss": 0.5187,
"num_input_tokens_seen": 3005166144,
"step": 3325,
"train_runtime": 442917.9433,
"train_tokens_per_second": 6784.928
},
{
"epoch": 2.8250397877984086,
"grad_norm": 0.9880705840554078,
"learning_rate": 5.249734158853048e-07,
"loss": 0.5367,
"num_input_tokens_seen": 3009451904,
"step": 3330,
"train_runtime": 443509.8078,
"train_tokens_per_second": 6785.536
},
{
"epoch": 2.829283819628647,
"grad_norm": 0.9577753896346771,
"learning_rate": 5.001211462244359e-07,
"loss": 0.5214,
"num_input_tokens_seen": 3013832512,
"step": 3335,
"train_runtime": 444142.2629,
"train_tokens_per_second": 6785.737
},
{
"epoch": 2.8335278514588857,
"grad_norm": 0.9814158353022203,
"learning_rate": 4.758655401473272e-07,
"loss": 0.5255,
"num_input_tokens_seen": 3018354112,
"step": 3340,
"train_runtime": 444781.1211,
"train_tokens_per_second": 6786.156
},
{
"epoch": 2.8377718832891246,
"grad_norm": 1.1181096501664909,
"learning_rate": 4.522071883686141e-07,
"loss": 0.5053,
"num_input_tokens_seen": 3022790720,
"step": 3345,
"train_runtime": 445390.6168,
"train_tokens_per_second": 6786.831
},
{
"epoch": 2.8420159151193634,
"grad_norm": 0.9098207875260499,
"learning_rate": 4.291466670575506e-07,
"loss": 0.5205,
"num_input_tokens_seen": 3027432384,
"step": 3350,
"train_runtime": 446049.4125,
"train_tokens_per_second": 6787.213
},
{
"epoch": 2.846259946949602,
"grad_norm": 0.8045024632334659,
"learning_rate": 4.0668453782398696e-07,
"loss": 0.5009,
"num_input_tokens_seen": 3032032768,
"step": 3355,
"train_runtime": 446687.1742,
"train_tokens_per_second": 6787.821
},
{
"epoch": 2.850503978779841,
"grad_norm": 0.9168040586174651,
"learning_rate": 3.848213477046919e-07,
"loss": 0.5227,
"num_input_tokens_seen": 3036567680,
"step": 3360,
"train_runtime": 447330.9014,
"train_tokens_per_second": 6788.191
},
{
"epoch": 2.8547480106100798,
"grad_norm": 1.1113058183093447,
"learning_rate": 3.6355762915002143e-07,
"loss": 0.5462,
"num_input_tokens_seen": 3041310976,
"step": 3365,
"train_runtime": 448024.7841,
"train_tokens_per_second": 6788.265
},
{
"epoch": 2.8589920424403186,
"grad_norm": 1.1615276846957676,
"learning_rate": 3.4289390001097377e-07,
"loss": 0.5032,
"num_input_tokens_seen": 3045742528,
"step": 3370,
"train_runtime": 448635.5559,
"train_tokens_per_second": 6788.901
},
{
"epoch": 2.863236074270557,
"grad_norm": 1.022077622432263,
"learning_rate": 3.2283066352654936e-07,
"loss": 0.5328,
"num_input_tokens_seen": 3050306496,
"step": 3375,
"train_runtime": 449289.2558,
"train_tokens_per_second": 6789.182
},
{
"epoch": 2.8674801061007957,
"grad_norm": 0.9272311129173908,
"learning_rate": 3.0336840831151626e-07,
"loss": 0.5273,
"num_input_tokens_seen": 3054867520,
"step": 3380,
"train_runtime": 449945.3409,
"train_tokens_per_second": 6789.419
},
{
"epoch": 2.8717241379310345,
"grad_norm": 0.9436512733023248,
"learning_rate": 2.8450760834450307e-07,
"loss": 0.5194,
"num_input_tokens_seen": 3059487552,
"step": 3385,
"train_runtime": 450608.7914,
"train_tokens_per_second": 6789.676
},
{
"epoch": 2.8759681697612733,
"grad_norm": 0.9504790766629004,
"learning_rate": 2.662487229564525e-07,
"loss": 0.5342,
"num_input_tokens_seen": 3064172992,
"step": 3390,
"train_runtime": 451284.6962,
"train_tokens_per_second": 6789.889
},
{
"epoch": 2.8802122015915117,
"grad_norm": 0.9713790898947682,
"learning_rate": 2.485921968194416e-07,
"loss": 0.534,
"num_input_tokens_seen": 3068583616,
"step": 3395,
"train_runtime": 451906.6499,
"train_tokens_per_second": 6790.304
},
{
"epoch": 2.8844562334217505,
"grad_norm": 1.0394097264342836,
"learning_rate": 2.3153845993584834e-07,
"loss": 0.516,
"num_input_tokens_seen": 3073191360,
"step": 3400,
"train_runtime": 452591.3839,
"train_tokens_per_second": 6790.212
},
{
"epoch": 2.8887002652519893,
"grad_norm": 0.9630302970032499,
"learning_rate": 2.1508792762787723e-07,
"loss": 0.5191,
"num_input_tokens_seen": 3077554560,
"step": 3405,
"train_runtime": 453214.0954,
"train_tokens_per_second": 6790.509
},
{
"epoch": 2.892944297082228,
"grad_norm": 1.0072599246388074,
"learning_rate": 1.9924100052745586e-07,
"loss": 0.5081,
"num_input_tokens_seen": 3082016768,
"step": 3410,
"train_runtime": 453848.0797,
"train_tokens_per_second": 6790.856
},
{
"epoch": 2.897188328912467,
"grad_norm": 0.8435644898142695,
"learning_rate": 1.8399806456645963e-07,
"loss": 0.5184,
"num_input_tokens_seen": 3086770176,
"step": 3415,
"train_runtime": 454559.6162,
"train_tokens_per_second": 6790.683
},
{
"epoch": 2.9014323607427057,
"grad_norm": 1.0235162502387214,
"learning_rate": 1.6935949096733016e-07,
"loss": 0.534,
"num_input_tokens_seen": 3091125184,
"step": 3420,
"train_runtime": 455183.2572,
"train_tokens_per_second": 6790.947
},
{
"epoch": 2.9056763925729445,
"grad_norm": 0.9662120266545192,
"learning_rate": 1.5532563623402718e-07,
"loss": 0.526,
"num_input_tokens_seen": 3095550336,
"step": 3425,
"train_runtime": 455803.201,
"train_tokens_per_second": 6791.419
},
{
"epoch": 2.909920424403183,
"grad_norm": 0.8700940267517274,
"learning_rate": 1.4189684214334087e-07,
"loss": 0.5214,
"num_input_tokens_seen": 3100072256,
"step": 3430,
"train_runtime": 456479.466,
"train_tokens_per_second": 6791.263
},
{
"epoch": 2.9141644562334217,
"grad_norm": 1.0663457112375756,
"learning_rate": 1.2907343573658194e-07,
"loss": 0.5457,
"num_input_tokens_seen": 3104605376,
"step": 3435,
"train_runtime": 457154.9293,
"train_tokens_per_second": 6791.145
},
{
"epoch": 2.9184084880636605,
"grad_norm": 0.8678572276073582,
"learning_rate": 1.1685572931160737e-07,
"loss": 0.5259,
"num_input_tokens_seen": 3109137920,
"step": 3440,
"train_runtime": 457837.7574,
"train_tokens_per_second": 6790.916
},
{
"epoch": 2.9226525198938993,
"grad_norm": 1.015827014706817,
"learning_rate": 1.0524402041520997e-07,
"loss": 0.5011,
"num_input_tokens_seen": 3113606784,
"step": 3445,
"train_runtime": 458466.3165,
"train_tokens_per_second": 6791.353
},
{
"epoch": 2.926896551724138,
"grad_norm": 1.006376986197294,
"learning_rate": 9.42385918358879e-08,
"loss": 0.5323,
"num_input_tokens_seen": 3118176512,
"step": 3450,
"train_runtime": 459172.3619,
"train_tokens_per_second": 6790.863
},
{
"epoch": 2.9311405835543765,
"grad_norm": 0.9168280974190645,
"learning_rate": 8.383971159694193e-08,
"loss": 0.5284,
"num_input_tokens_seen": 3122635072,
"step": 3455,
"train_runtime": 459796.4487,
"train_tokens_per_second": 6791.342
},
{
"epoch": 2.9353846153846153,
"grad_norm": 0.8691560318885593,
"learning_rate": 7.404763294995565e-08,
"loss": 0.5378,
"num_input_tokens_seen": 3127211136,
"step": 3460,
"train_runtime": 460443.0378,
"train_tokens_per_second": 6791.744
},
{
"epoch": 2.939628647214854,
"grad_norm": 0.8391826158196111,
"learning_rate": 6.486259436863373e-08,
"loss": 0.5057,
"num_input_tokens_seen": 3131755648,
"step": 3465,
"train_runtime": 461119.9271,
"train_tokens_per_second": 6791.629
},
{
"epoch": 2.943872679045093,
"grad_norm": 0.9266785431840717,
"learning_rate": 5.628481954297604e-08,
"loss": 0.5041,
"num_input_tokens_seen": 3136402176,
"step": 3470,
"train_runtime": 461787.0454,
"train_tokens_per_second": 6791.88
},
{
"epoch": 2.9481167108753317,
"grad_norm": 0.9181454594188112,
"learning_rate": 4.83145173738514e-08,
"loss": 0.5146,
"num_input_tokens_seen": 3140924544,
"step": 3475,
"train_runtime": 462405.4235,
"train_tokens_per_second": 6792.577
},
{
"epoch": 2.9523607427055705,
"grad_norm": 0.8349784460725406,
"learning_rate": 4.095188196789057e-08,
"loss": 0.5158,
"num_input_tokens_seen": 3145415872,
"step": 3480,
"train_runtime": 463062.3059,
"train_tokens_per_second": 6792.641
},
{
"epoch": 2.9566047745358093,
"grad_norm": 0.9323692378640641,
"learning_rate": 3.419709263277893e-08,
"loss": 0.5251,
"num_input_tokens_seen": 3149924224,
"step": 3485,
"train_runtime": 463701.0926,
"train_tokens_per_second": 6793.006
},
{
"epoch": 2.9608488063660476,
"grad_norm": 0.8451251639267238,
"learning_rate": 2.8050313872868273e-08,
"loss": 0.5253,
"num_input_tokens_seen": 3154558400,
"step": 3490,
"train_runtime": 464407.8316,
"train_tokens_per_second": 6792.647
},
{
"epoch": 2.9650928381962864,
"grad_norm": 1.0163495151903155,
"learning_rate": 2.251169538518838e-08,
"loss": 0.5481,
"num_input_tokens_seen": 3159207360,
"step": 3495,
"train_runtime": 465073.4609,
"train_tokens_per_second": 6792.921
},
{
"epoch": 2.9693368700265252,
"grad_norm": 1.0129744457478043,
"learning_rate": 1.758137205579158e-08,
"loss": 0.5135,
"num_input_tokens_seen": 3163694016,
"step": 3500,
"train_runtime": 465699.5834,
"train_tokens_per_second": 6793.422
},
{
"epoch": 2.973580901856764,
"grad_norm": 1.000739655978131,
"learning_rate": 1.3259463956469265e-08,
"loss": 0.5001,
"num_input_tokens_seen": 3168337088,
"step": 3505,
"train_runtime": 466401.7551,
"train_tokens_per_second": 6793.15
},
{
"epoch": 2.9778249336870024,
"grad_norm": 0.9167977963170866,
"learning_rate": 9.546076341834798e-09,
"loss": 0.5125,
"num_input_tokens_seen": 3172760384,
"step": 3510,
"train_runtime": 467024.7423,
"train_tokens_per_second": 6793.56
},
{
"epoch": 2.982068965517241,
"grad_norm": 0.9752251889997392,
"learning_rate": 6.441299646750554e-09,
"loss": 0.5214,
"num_input_tokens_seen": 3177405824,
"step": 3515,
"train_runtime": 467725.4913,
"train_tokens_per_second": 6793.313
},
{
"epoch": 2.98631299734748,
"grad_norm": 0.9307610509549541,
"learning_rate": 3.945209484124135e-09,
"loss": 0.5254,
"num_input_tokens_seen": 3182049024,
"step": 3520,
"train_runtime": 468383.2029,
"train_tokens_per_second": 6793.687
},
{
"epoch": 2.990557029177719,
"grad_norm": 0.9957718574851812,
"learning_rate": 2.0578666430765e-09,
"loss": 0.5124,
"num_input_tokens_seen": 3186550272,
"step": 3525,
"train_runtime": 469031.712,
"train_tokens_per_second": 6793.891
},
{
"epoch": 2.9948010610079576,
"grad_norm": 1.1631587185707446,
"learning_rate": 7.793170874625943e-10,
"loss": 0.5197,
"num_input_tokens_seen": 3191082304,
"step": 3530,
"train_runtime": 469677.0947,
"train_tokens_per_second": 6794.205
},
{
"epoch": 2.9990450928381964,
"grad_norm": 0.9116914677984228,
"learning_rate": 1.0959195473614348e-10,
"loss": 0.5392,
"num_input_tokens_seen": 3195636736,
"step": 3535,
"train_runtime": 470373.7704,
"train_tokens_per_second": 6793.824
}
],
"logging_steps": 5,
"max_steps": 3537,
"num_input_tokens_seen": 3196694976,
"num_train_epochs": 3,
"save_steps": 1180,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.62813488136192e+16,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}