{ "best_global_step": 2360, "best_metric": 0.71119624376297, "best_model_checkpoint": "/mnt/shared-storage-user/zhangchenhao/work/LLaMA-Factory-own/LLaMA-Factory/saves/SFT_StepCount_all_with_plus_without_point_reasoning_optimized_no_prompt_answer80_point2p5/checkpoint-2360", "epoch": 3.0, "eval_steps": 295, "global_step": 3537, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.004244031830238726, "grad_norm": 297.75304987581995, "learning_rate": 5.649717514124295e-07, "loss": 5.8629, "num_input_tokens_seen": 4530624, "step": 5, "train_runtime": 653.1761, "train_tokens_per_second": 6936.298 }, { "epoch": 0.008488063660477453, "grad_norm": 131.78345996041816, "learning_rate": 1.2711864406779662e-06, "loss": 4.6976, "num_input_tokens_seen": 9034496, "step": 10, "train_runtime": 1273.69, "train_tokens_per_second": 7093.167 }, { "epoch": 0.01273209549071618, "grad_norm": 41.38402483800692, "learning_rate": 1.977401129943503e-06, "loss": 3.047, "num_input_tokens_seen": 13435712, "step": 15, "train_runtime": 1887.0646, "train_tokens_per_second": 7119.9 }, { "epoch": 0.016976127320954906, "grad_norm": 30.090022612502594, "learning_rate": 2.6836158192090396e-06, "loss": 2.1792, "num_input_tokens_seen": 17831104, "step": 20, "train_runtime": 2539.5416, "train_tokens_per_second": 7021.387 }, { "epoch": 0.021220159151193633, "grad_norm": 10.507378327830093, "learning_rate": 3.3898305084745763e-06, "loss": 1.806, "num_input_tokens_seen": 22489280, "step": 25, "train_runtime": 3223.2496, "train_tokens_per_second": 6977.207 }, { "epoch": 0.02546419098143236, "grad_norm": 7.849105701004419, "learning_rate": 4.096045197740113e-06, "loss": 1.5334, "num_input_tokens_seen": 26938368, "step": 30, "train_runtime": 3838.47, "train_tokens_per_second": 7017.996 }, { "epoch": 0.029708222811671087, "grad_norm": 7.864574931572695, "learning_rate": 4.80225988700565e-06, "loss": 1.377, "num_input_tokens_seen": 31539968, "step": 35, "train_runtime": 4503.6915, "train_tokens_per_second": 7003.137 }, { "epoch": 0.03395225464190981, "grad_norm": 7.3747833148860025, "learning_rate": 5.508474576271187e-06, "loss": 1.3089, "num_input_tokens_seen": 35979392, "step": 40, "train_runtime": 5147.1318, "train_tokens_per_second": 6990.183 }, { "epoch": 0.03819628647214854, "grad_norm": 8.361663447491122, "learning_rate": 6.214689265536724e-06, "loss": 1.2385, "num_input_tokens_seen": 40533440, "step": 45, "train_runtime": 5799.5796, "train_tokens_per_second": 6989.031 }, { "epoch": 0.042440318302387266, "grad_norm": 7.700383821460495, "learning_rate": 6.92090395480226e-06, "loss": 1.1772, "num_input_tokens_seen": 45172288, "step": 50, "train_runtime": 6470.1331, "train_tokens_per_second": 6981.663 }, { "epoch": 0.04668435013262599, "grad_norm": 7.153074169274688, "learning_rate": 7.627118644067798e-06, "loss": 1.1281, "num_input_tokens_seen": 49855040, "step": 55, "train_runtime": 7138.9239, "train_tokens_per_second": 6983.551 }, { "epoch": 0.05092838196286472, "grad_norm": 5.863677860447595, "learning_rate": 8.333333333333334e-06, "loss": 1.0659, "num_input_tokens_seen": 54260032, "step": 60, "train_runtime": 7733.863, "train_tokens_per_second": 7015.903 }, { "epoch": 0.05517241379310345, "grad_norm": 5.211482731988629, "learning_rate": 9.039548022598871e-06, "loss": 0.992, "num_input_tokens_seen": 58804672, "step": 65, "train_runtime": 8385.1953, "train_tokens_per_second": 7012.916 }, { "epoch": 0.059416445623342175, "grad_norm": 3.0025030899721137, "learning_rate": 9.745762711864407e-06, "loss": 0.9647, "num_input_tokens_seen": 63162880, "step": 70, "train_runtime": 9001.3827, "train_tokens_per_second": 7017.02 }, { "epoch": 0.0636604774535809, "grad_norm": 1.999806623127411, "learning_rate": 1.0451977401129943e-05, "loss": 0.9392, "num_input_tokens_seen": 67642560, "step": 75, "train_runtime": 9659.9049, "train_tokens_per_second": 7002.404 }, { "epoch": 0.06790450928381962, "grad_norm": 2.1507559480146172, "learning_rate": 1.115819209039548e-05, "loss": 0.9183, "num_input_tokens_seen": 72091264, "step": 80, "train_runtime": 10244.6396, "train_tokens_per_second": 7036.974 }, { "epoch": 0.07214854111405836, "grad_norm": 2.2495505770555297, "learning_rate": 1.1864406779661018e-05, "loss": 0.9072, "num_input_tokens_seen": 76507136, "step": 85, "train_runtime": 10853.5827, "train_tokens_per_second": 7049.021 }, { "epoch": 0.07639257294429708, "grad_norm": 2.121224741489407, "learning_rate": 1.2570621468926556e-05, "loss": 0.9108, "num_input_tokens_seen": 80870400, "step": 90, "train_runtime": 11472.3492, "train_tokens_per_second": 7049.158 }, { "epoch": 0.08063660477453581, "grad_norm": 2.588502611123473, "learning_rate": 1.3276836158192092e-05, "loss": 0.8924, "num_input_tokens_seen": 85329024, "step": 95, "train_runtime": 12124.6375, "train_tokens_per_second": 7037.656 }, { "epoch": 0.08488063660477453, "grad_norm": 2.168204787290024, "learning_rate": 1.3983050847457627e-05, "loss": 0.9035, "num_input_tokens_seen": 89666944, "step": 100, "train_runtime": 12721.9573, "train_tokens_per_second": 7048.203 }, { "epoch": 0.08912466843501327, "grad_norm": 2.039908089915845, "learning_rate": 1.4689265536723165e-05, "loss": 0.8925, "num_input_tokens_seen": 94089920, "step": 105, "train_runtime": 13381.4037, "train_tokens_per_second": 7031.394 }, { "epoch": 0.09336870026525199, "grad_norm": 120.4758045680371, "learning_rate": 1.5395480225988703e-05, "loss": 0.909, "num_input_tokens_seen": 98437760, "step": 110, "train_runtime": 14013.3414, "train_tokens_per_second": 7024.574 }, { "epoch": 0.09761273209549072, "grad_norm": 1.953705157702643, "learning_rate": 1.6101694915254237e-05, "loss": 0.9098, "num_input_tokens_seen": 103115456, "step": 115, "train_runtime": 14689.3018, "train_tokens_per_second": 7019.766 }, { "epoch": 0.10185676392572944, "grad_norm": 1.548682015852324, "learning_rate": 1.6807909604519774e-05, "loss": 0.8787, "num_input_tokens_seen": 107696768, "step": 120, "train_runtime": 15307.4003, "train_tokens_per_second": 7035.601 }, { "epoch": 0.10610079575596817, "grad_norm": 1.4155432758105517, "learning_rate": 1.7514124293785312e-05, "loss": 0.8971, "num_input_tokens_seen": 112245632, "step": 125, "train_runtime": 15954.3044, "train_tokens_per_second": 7035.445 }, { "epoch": 0.1103448275862069, "grad_norm": 1.5268658891716769, "learning_rate": 1.8220338983050846e-05, "loss": 0.8735, "num_input_tokens_seen": 116619392, "step": 130, "train_runtime": 16586.124, "train_tokens_per_second": 7031.142 }, { "epoch": 0.11458885941644563, "grad_norm": 1.4550371590475242, "learning_rate": 1.8926553672316387e-05, "loss": 0.8871, "num_input_tokens_seen": 121198656, "step": 135, "train_runtime": 17239.014, "train_tokens_per_second": 7030.487 }, { "epoch": 0.11883289124668435, "grad_norm": 1.4816395671279814, "learning_rate": 1.963276836158192e-05, "loss": 0.8735, "num_input_tokens_seen": 125766592, "step": 140, "train_runtime": 17856.4497, "train_tokens_per_second": 7043.203 }, { "epoch": 0.12307692307692308, "grad_norm": 1.8562529667931933, "learning_rate": 2.033898305084746e-05, "loss": 0.8841, "num_input_tokens_seen": 130417216, "step": 145, "train_runtime": 18488.9056, "train_tokens_per_second": 7053.809 }, { "epoch": 0.1273209549071618, "grad_norm": 1.3288711166284626, "learning_rate": 2.1045197740112996e-05, "loss": 0.8847, "num_input_tokens_seen": 134684288, "step": 150, "train_runtime": 19070.7007, "train_tokens_per_second": 7062.367 }, { "epoch": 0.13156498673740052, "grad_norm": 1.716843857648477, "learning_rate": 2.175141242937853e-05, "loss": 0.8699, "num_input_tokens_seen": 139319872, "step": 155, "train_runtime": 19757.791, "train_tokens_per_second": 7051.389 }, { "epoch": 0.13580901856763924, "grad_norm": 2.015691688957, "learning_rate": 2.245762711864407e-05, "loss": 0.8707, "num_input_tokens_seen": 143717952, "step": 160, "train_runtime": 20371.2146, "train_tokens_per_second": 7054.953 }, { "epoch": 0.140053050397878, "grad_norm": 1.787168466261934, "learning_rate": 2.3163841807909606e-05, "loss": 0.8796, "num_input_tokens_seen": 148245632, "step": 165, "train_runtime": 21001.0169, "train_tokens_per_second": 7058.974 }, { "epoch": 0.1442970822281167, "grad_norm": 4.713178832164353, "learning_rate": 2.3870056497175143e-05, "loss": 0.8977, "num_input_tokens_seen": 152687296, "step": 170, "train_runtime": 21617.8243, "train_tokens_per_second": 7063.028 }, { "epoch": 0.14854111405835543, "grad_norm": 1.4590858301256864, "learning_rate": 2.457627118644068e-05, "loss": 0.8959, "num_input_tokens_seen": 157314368, "step": 175, "train_runtime": 22299.4419, "train_tokens_per_second": 7054.633 }, { "epoch": 0.15278514588859415, "grad_norm": 1.4882611183761851, "learning_rate": 2.5282485875706215e-05, "loss": 0.8729, "num_input_tokens_seen": 161815808, "step": 180, "train_runtime": 22965.2954, "train_tokens_per_second": 7046.102 }, { "epoch": 0.1570291777188329, "grad_norm": 1.2937684487451329, "learning_rate": 2.5988700564971752e-05, "loss": 0.8801, "num_input_tokens_seen": 166361792, "step": 185, "train_runtime": 23622.1066, "train_tokens_per_second": 7042.631 }, { "epoch": 0.16127320954907162, "grad_norm": 1.326371860515381, "learning_rate": 2.669491525423729e-05, "loss": 0.8837, "num_input_tokens_seen": 170963072, "step": 190, "train_runtime": 24288.8241, "train_tokens_per_second": 7038.755 }, { "epoch": 0.16551724137931034, "grad_norm": 1.3881050133378776, "learning_rate": 2.7401129943502824e-05, "loss": 0.8848, "num_input_tokens_seen": 175401600, "step": 195, "train_runtime": 24906.6505, "train_tokens_per_second": 7042.36 }, { "epoch": 0.16976127320954906, "grad_norm": 1.5698900690401176, "learning_rate": 2.8107344632768362e-05, "loss": 0.8687, "num_input_tokens_seen": 179779200, "step": 200, "train_runtime": 25515.4439, "train_tokens_per_second": 7045.897 }, { "epoch": 0.1740053050397878, "grad_norm": 1.4897036851687533, "learning_rate": 2.88135593220339e-05, "loss": 0.8782, "num_input_tokens_seen": 184224512, "step": 205, "train_runtime": 26153.0151, "train_tokens_per_second": 7044.102 }, { "epoch": 0.17824933687002653, "grad_norm": 1.82238111934615, "learning_rate": 2.951977401129944e-05, "loss": 0.8838, "num_input_tokens_seen": 188802432, "step": 210, "train_runtime": 26835.6565, "train_tokens_per_second": 7035.506 }, { "epoch": 0.18249336870026525, "grad_norm": 1.653089989083378, "learning_rate": 3.022598870056497e-05, "loss": 0.8724, "num_input_tokens_seen": 193510336, "step": 215, "train_runtime": 27522.5514, "train_tokens_per_second": 7030.974 }, { "epoch": 0.18673740053050397, "grad_norm": 1.4627059373154376, "learning_rate": 3.093220338983051e-05, "loss": 0.8649, "num_input_tokens_seen": 197993536, "step": 220, "train_runtime": 28129.6066, "train_tokens_per_second": 7038.617 }, { "epoch": 0.1909814323607427, "grad_norm": 1.247186929390262, "learning_rate": 3.1638418079096046e-05, "loss": 0.8707, "num_input_tokens_seen": 202834048, "step": 225, "train_runtime": 28804.0632, "train_tokens_per_second": 7041.855 }, { "epoch": 0.19522546419098144, "grad_norm": 1.5240142785161233, "learning_rate": 3.234463276836158e-05, "loss": 0.8686, "num_input_tokens_seen": 207360320, "step": 230, "train_runtime": 29445.3467, "train_tokens_per_second": 7042.21 }, { "epoch": 0.19946949602122016, "grad_norm": 1.186249792586507, "learning_rate": 3.305084745762712e-05, "loss": 0.8591, "num_input_tokens_seen": 211972800, "step": 235, "train_runtime": 30125.9512, "train_tokens_per_second": 7036.219 }, { "epoch": 0.20371352785145888, "grad_norm": 1.5311530250979444, "learning_rate": 3.375706214689266e-05, "loss": 0.88, "num_input_tokens_seen": 216741248, "step": 240, "train_runtime": 30816.4459, "train_tokens_per_second": 7033.298 }, { "epoch": 0.2079575596816976, "grad_norm": 1.0601153174356541, "learning_rate": 3.446327683615819e-05, "loss": 0.8828, "num_input_tokens_seen": 221190080, "step": 245, "train_runtime": 31439.204, "train_tokens_per_second": 7035.486 }, { "epoch": 0.21220159151193635, "grad_norm": 1.75722517567588, "learning_rate": 3.516949152542373e-05, "loss": 0.8921, "num_input_tokens_seen": 225847168, "step": 250, "train_runtime": 32147.0921, "train_tokens_per_second": 7025.431 }, { "epoch": 0.21644562334217507, "grad_norm": 2.0274303550036263, "learning_rate": 3.587570621468927e-05, "loss": 0.9012, "num_input_tokens_seen": 230582208, "step": 255, "train_runtime": 32842.8641, "train_tokens_per_second": 7020.77 }, { "epoch": 0.2206896551724138, "grad_norm": 1.9253331362655286, "learning_rate": 3.6581920903954806e-05, "loss": 0.8999, "num_input_tokens_seen": 235274688, "step": 260, "train_runtime": 33539.0669, "train_tokens_per_second": 7014.944 }, { "epoch": 0.2249336870026525, "grad_norm": 2.2082649940934975, "learning_rate": 3.728813559322034e-05, "loss": 0.9029, "num_input_tokens_seen": 239864000, "step": 265, "train_runtime": 34221.492, "train_tokens_per_second": 7009.163 }, { "epoch": 0.22917771883289126, "grad_norm": 1.452691571968028, "learning_rate": 3.799435028248588e-05, "loss": 0.9046, "num_input_tokens_seen": 244225344, "step": 270, "train_runtime": 34836.4135, "train_tokens_per_second": 7010.634 }, { "epoch": 0.23342175066312998, "grad_norm": 2.730577739346072, "learning_rate": 3.8700564971751415e-05, "loss": 0.8874, "num_input_tokens_seen": 248726272, "step": 275, "train_runtime": 35499.0341, "train_tokens_per_second": 7006.565 }, { "epoch": 0.2376657824933687, "grad_norm": 1.093231645373866, "learning_rate": 3.940677966101695e-05, "loss": 0.8809, "num_input_tokens_seen": 253245952, "step": 280, "train_runtime": 36135.7808, "train_tokens_per_second": 7008.177 }, { "epoch": 0.24190981432360742, "grad_norm": 1.307461678503626, "learning_rate": 4.011299435028249e-05, "loss": 0.8598, "num_input_tokens_seen": 257684480, "step": 285, "train_runtime": 36728.042, "train_tokens_per_second": 7016.015 }, { "epoch": 0.24615384615384617, "grad_norm": 1.2991784389959953, "learning_rate": 4.0819209039548024e-05, "loss": 0.8981, "num_input_tokens_seen": 262108992, "step": 290, "train_runtime": 37341.918, "train_tokens_per_second": 7019.163 }, { "epoch": 0.25039787798408486, "grad_norm": 1.4781716766460902, "learning_rate": 4.152542372881356e-05, "loss": 0.9007, "num_input_tokens_seen": 266677504, "step": 295, "train_runtime": 38006.4439, "train_tokens_per_second": 7016.639 }, { "epoch": 0.25039787798408486, "eval_loss": 0.8760802745819092, "eval_runtime": 1055.1289, "eval_samples_per_second": 2.888, "eval_steps_per_second": 0.091, "num_input_tokens_seen": 266677504, "step": 295 }, { "epoch": 0.2546419098143236, "grad_norm": 1.3728608285035016, "learning_rate": 4.22316384180791e-05, "loss": 0.8857, "num_input_tokens_seen": 271486592, "step": 300, "train_runtime": 39774.4771, "train_tokens_per_second": 6825.648 }, { "epoch": 0.25888594164456236, "grad_norm": 1.3128466580163847, "learning_rate": 4.2937853107344634e-05, "loss": 1.0976, "num_input_tokens_seen": 276035072, "step": 305, "train_runtime": 40413.7335, "train_tokens_per_second": 6830.229 }, { "epoch": 0.26312997347480105, "grad_norm": 1.9380613988266078, "learning_rate": 4.3644067796610175e-05, "loss": 0.8879, "num_input_tokens_seen": 280424512, "step": 310, "train_runtime": 41016.3815, "train_tokens_per_second": 6836.891 }, { "epoch": 0.2673740053050398, "grad_norm": 1.7436380468280226, "learning_rate": 4.435028248587571e-05, "loss": 0.894, "num_input_tokens_seen": 284818432, "step": 315, "train_runtime": 41649.8882, "train_tokens_per_second": 6838.396 }, { "epoch": 0.2716180371352785, "grad_norm": 1.81753757832679, "learning_rate": 4.505649717514124e-05, "loss": 0.8883, "num_input_tokens_seen": 289399424, "step": 320, "train_runtime": 42320.1609, "train_tokens_per_second": 6838.335 }, { "epoch": 0.27586206896551724, "grad_norm": 0.9404537896048348, "learning_rate": 4.5762711864406784e-05, "loss": 0.9124, "num_input_tokens_seen": 293917312, "step": 325, "train_runtime": 42970.7229, "train_tokens_per_second": 6839.943 }, { "epoch": 0.280106100795756, "grad_norm": 2.947005859311638, "learning_rate": 4.646892655367232e-05, "loss": 0.8892, "num_input_tokens_seen": 298366272, "step": 330, "train_runtime": 43553.8539, "train_tokens_per_second": 6850.514 }, { "epoch": 0.2843501326259947, "grad_norm": 1.6174994735745432, "learning_rate": 4.717514124293785e-05, "loss": 0.8881, "num_input_tokens_seen": 302869952, "step": 335, "train_runtime": 44211.0204, "train_tokens_per_second": 6850.553 }, { "epoch": 0.2885941644562334, "grad_norm": 1.7792646072660459, "learning_rate": 4.788135593220339e-05, "loss": 0.8803, "num_input_tokens_seen": 307426560, "step": 340, "train_runtime": 44880.4175, "train_tokens_per_second": 6849.904 }, { "epoch": 0.2928381962864722, "grad_norm": 1.0669701860722622, "learning_rate": 4.8587570621468934e-05, "loss": 0.897, "num_input_tokens_seen": 312164928, "step": 345, "train_runtime": 45578.8582, "train_tokens_per_second": 6848.897 }, { "epoch": 0.29708222811671087, "grad_norm": 1.2083166811125388, "learning_rate": 4.929378531073446e-05, "loss": 0.9033, "num_input_tokens_seen": 316741824, "step": 350, "train_runtime": 46190.9574, "train_tokens_per_second": 6857.226 }, { "epoch": 0.3013262599469496, "grad_norm": 2.667811908170323, "learning_rate": 5e-05, "loss": 0.8758, "num_input_tokens_seen": 321123008, "step": 355, "train_runtime": 46809.8332, "train_tokens_per_second": 6860.161 }, { "epoch": 0.3055702917771883, "grad_norm": 1.5315598777999704, "learning_rate": 4.999969557829892e-05, "loss": 0.8975, "num_input_tokens_seen": 325636416, "step": 360, "train_runtime": 47466.7463, "train_tokens_per_second": 6860.306 }, { "epoch": 0.30981432360742706, "grad_norm": 1.0061504802501977, "learning_rate": 4.999878232060946e-05, "loss": 0.8919, "num_input_tokens_seen": 330217472, "step": 365, "train_runtime": 48143.739, "train_tokens_per_second": 6858.991 }, { "epoch": 0.3140583554376658, "grad_norm": 1.0691191893512106, "learning_rate": 4.999726024917288e-05, "loss": 0.8775, "num_input_tokens_seen": 334605888, "step": 370, "train_runtime": 48800.4766, "train_tokens_per_second": 6856.611 }, { "epoch": 0.3183023872679045, "grad_norm": 1.1646960467870506, "learning_rate": 4.99951294010573e-05, "loss": 0.8944, "num_input_tokens_seen": 339190016, "step": 375, "train_runtime": 49427.2381, "train_tokens_per_second": 6862.411 }, { "epoch": 0.32254641909814324, "grad_norm": 13.073420853243217, "learning_rate": 4.999238982815683e-05, "loss": 0.908, "num_input_tokens_seen": 343751808, "step": 380, "train_runtime": 50079.7525, "train_tokens_per_second": 6864.088 }, { "epoch": 0.32679045092838194, "grad_norm": 1.030515508528764, "learning_rate": 4.99890415971903e-05, "loss": 0.9207, "num_input_tokens_seen": 348170752, "step": 385, "train_runtime": 50717.2414, "train_tokens_per_second": 6864.939 }, { "epoch": 0.3310344827586207, "grad_norm": 2.462002678202629, "learning_rate": 4.9985084789699645e-05, "loss": 0.8857, "num_input_tokens_seen": 352658368, "step": 390, "train_runtime": 51341.856, "train_tokens_per_second": 6868.828 }, { "epoch": 0.33527851458885943, "grad_norm": 1.1178056520913622, "learning_rate": 4.998051950204792e-05, "loss": 0.8942, "num_input_tokens_seen": 357241472, "step": 395, "train_runtime": 51991.5243, "train_tokens_per_second": 6871.148 }, { "epoch": 0.3395225464190981, "grad_norm": 12.174478000037658, "learning_rate": 4.997534584541692e-05, "loss": 0.9033, "num_input_tokens_seen": 361706368, "step": 400, "train_runtime": 52622.4401, "train_tokens_per_second": 6873.615 }, { "epoch": 0.3437665782493369, "grad_norm": 1.3129250294672423, "learning_rate": 4.996956394580453e-05, "loss": 0.9534, "num_input_tokens_seen": 366301824, "step": 405, "train_runtime": 53287.8409, "train_tokens_per_second": 6874.023 }, { "epoch": 0.3480106100795756, "grad_norm": 1.2922663914137134, "learning_rate": 4.9963173944021604e-05, "loss": 0.907, "num_input_tokens_seen": 370813440, "step": 410, "train_runtime": 53932.276, "train_tokens_per_second": 6875.539 }, { "epoch": 0.3522546419098143, "grad_norm": 3.97037390684764, "learning_rate": 4.995617599568855e-05, "loss": 0.8908, "num_input_tokens_seen": 375343232, "step": 415, "train_runtime": 54587.718, "train_tokens_per_second": 6875.965 }, { "epoch": 0.35649867374005306, "grad_norm": 1.198328142782024, "learning_rate": 4.9948570271231553e-05, "loss": 0.8871, "num_input_tokens_seen": 379904000, "step": 420, "train_runtime": 55281.7215, "train_tokens_per_second": 6872.145 }, { "epoch": 0.36074270557029176, "grad_norm": 0.971040029995864, "learning_rate": 4.9940356955878436e-05, "loss": 0.883, "num_input_tokens_seen": 384479488, "step": 425, "train_runtime": 55946.6877, "train_tokens_per_second": 6872.248 }, { "epoch": 0.3649867374005305, "grad_norm": 1.1690338997234486, "learning_rate": 4.99315362496541e-05, "loss": 0.8915, "num_input_tokens_seen": 389250176, "step": 430, "train_runtime": 56652.1149, "train_tokens_per_second": 6870.885 }, { "epoch": 0.36923076923076925, "grad_norm": 1.33505203332503, "learning_rate": 4.9922108367375695e-05, "loss": 0.8924, "num_input_tokens_seen": 393810688, "step": 435, "train_runtime": 57300.4821, "train_tokens_per_second": 6872.729 }, { "epoch": 0.37347480106100794, "grad_norm": 1.049216620404627, "learning_rate": 4.991207353864739e-05, "loss": 0.8777, "num_input_tokens_seen": 398511168, "step": 440, "train_runtime": 57988.9682, "train_tokens_per_second": 6872.189 }, { "epoch": 0.3777188328912467, "grad_norm": 1.1473154938029155, "learning_rate": 4.9901432007854744e-05, "loss": 0.8633, "num_input_tokens_seen": 403089152, "step": 445, "train_runtime": 58633.6036, "train_tokens_per_second": 6874.712 }, { "epoch": 0.3819628647214854, "grad_norm": 1.4204892310138295, "learning_rate": 4.9890184034158794e-05, "loss": 0.8873, "num_input_tokens_seen": 407921792, "step": 450, "train_runtime": 59298.1247, "train_tokens_per_second": 6879.169 }, { "epoch": 0.38620689655172413, "grad_norm": 1.1936514251690153, "learning_rate": 4.987832989148973e-05, "loss": 0.8795, "num_input_tokens_seen": 412324096, "step": 455, "train_runtime": 59926.4345, "train_tokens_per_second": 6880.504 }, { "epoch": 0.3904509283819629, "grad_norm": 1.10287952418463, "learning_rate": 4.986586986854019e-05, "loss": 0.8606, "num_input_tokens_seen": 416742912, "step": 460, "train_runtime": 60568.403, "train_tokens_per_second": 6880.533 }, { "epoch": 0.3946949602122016, "grad_norm": 1.072213338502524, "learning_rate": 4.985280426875831e-05, "loss": 0.872, "num_input_tokens_seen": 421138880, "step": 465, "train_runtime": 61260.5252, "train_tokens_per_second": 6874.555 }, { "epoch": 0.3989389920424403, "grad_norm": 1.0219296197838135, "learning_rate": 4.983913341034026e-05, "loss": 0.8775, "num_input_tokens_seen": 425727936, "step": 470, "train_runtime": 61904.3467, "train_tokens_per_second": 6877.19 }, { "epoch": 0.40318302387267907, "grad_norm": 1.3843761799310907, "learning_rate": 4.98248576262225e-05, "loss": 0.8775, "num_input_tokens_seen": 430157696, "step": 475, "train_runtime": 62556.696, "train_tokens_per_second": 6876.285 }, { "epoch": 0.40742705570291776, "grad_norm": 1.1025487338096294, "learning_rate": 4.980997726407371e-05, "loss": 0.8504, "num_input_tokens_seen": 434654208, "step": 480, "train_runtime": 63190.0546, "train_tokens_per_second": 6878.522 }, { "epoch": 0.4116710875331565, "grad_norm": 1.2747087605024068, "learning_rate": 4.979449268628632e-05, "loss": 0.8666, "num_input_tokens_seen": 439274752, "step": 485, "train_runtime": 63846.4067, "train_tokens_per_second": 6880.18 }, { "epoch": 0.4159151193633952, "grad_norm": 1.1710609815467128, "learning_rate": 4.977840426996763e-05, "loss": 0.8805, "num_input_tokens_seen": 443719872, "step": 490, "train_runtime": 64497.93, "train_tokens_per_second": 6879.599 }, { "epoch": 0.42015915119363395, "grad_norm": 1.0097086187416695, "learning_rate": 4.97617124069307e-05, "loss": 0.8903, "num_input_tokens_seen": 448255296, "step": 495, "train_runtime": 65132.19, "train_tokens_per_second": 6882.239 }, { "epoch": 0.4244031830238727, "grad_norm": 37.867408485972554, "learning_rate": 4.974441750368476e-05, "loss": 0.8397, "num_input_tokens_seen": 452923520, "step": 500, "train_runtime": 65815.3494, "train_tokens_per_second": 6881.731 }, { "epoch": 0.4286472148541114, "grad_norm": 1.0330730902667171, "learning_rate": 4.97265199814253e-05, "loss": 0.8865, "num_input_tokens_seen": 457377280, "step": 505, "train_runtime": 66451.0204, "train_tokens_per_second": 6882.923 }, { "epoch": 0.43289124668435014, "grad_norm": 1.3051062489077976, "learning_rate": 4.9708020276023874e-05, "loss": 0.86, "num_input_tokens_seen": 461956224, "step": 510, "train_runtime": 67114.9924, "train_tokens_per_second": 6883.056 }, { "epoch": 0.43713527851458883, "grad_norm": 2.1187078081806012, "learning_rate": 4.968891883801742e-05, "loss": 0.8749, "num_input_tokens_seen": 466374976, "step": 515, "train_runtime": 67739.4567, "train_tokens_per_second": 6884.835 }, { "epoch": 0.4413793103448276, "grad_norm": 1.4438973622990432, "learning_rate": 4.966921613259731e-05, "loss": 0.871, "num_input_tokens_seen": 470742528, "step": 520, "train_runtime": 68365.462, "train_tokens_per_second": 6885.678 }, { "epoch": 0.44562334217506633, "grad_norm": 1.53355639196128, "learning_rate": 4.964891263959803e-05, "loss": 0.8369, "num_input_tokens_seen": 475324480, "step": 525, "train_runtime": 69025.3358, "train_tokens_per_second": 6886.232 }, { "epoch": 0.449867374005305, "grad_norm": 1.128289481595987, "learning_rate": 4.962800885348551e-05, "loss": 0.863, "num_input_tokens_seen": 479877312, "step": 530, "train_runtime": 69684.2916, "train_tokens_per_second": 6886.449 }, { "epoch": 0.45411140583554377, "grad_norm": 1.0503072430304274, "learning_rate": 4.960650528334502e-05, "loss": 0.8667, "num_input_tokens_seen": 484343232, "step": 535, "train_runtime": 70344.4784, "train_tokens_per_second": 6885.306 }, { "epoch": 0.4583554376657825, "grad_norm": 0.9545521304763791, "learning_rate": 4.958440245286884e-05, "loss": 0.8696, "num_input_tokens_seen": 488876416, "step": 540, "train_runtime": 70972.8212, "train_tokens_per_second": 6888.22 }, { "epoch": 0.4625994694960212, "grad_norm": 1.3295608584891012, "learning_rate": 4.956170090034346e-05, "loss": 0.8349, "num_input_tokens_seen": 493485888, "step": 545, "train_runtime": 71650.7674, "train_tokens_per_second": 6887.378 }, { "epoch": 0.46684350132625996, "grad_norm": 1.1735342027871698, "learning_rate": 4.953840117863652e-05, "loss": 0.8458, "num_input_tokens_seen": 498090432, "step": 550, "train_runtime": 72292.0675, "train_tokens_per_second": 6889.974 }, { "epoch": 0.47108753315649865, "grad_norm": 1.2695672366224662, "learning_rate": 4.951450385518328e-05, "loss": 0.8423, "num_input_tokens_seen": 502546368, "step": 555, "train_runtime": 72919.7187, "train_tokens_per_second": 6891.776 }, { "epoch": 0.4753315649867374, "grad_norm": 1.0194113412118773, "learning_rate": 4.9490009511972856e-05, "loss": 0.8536, "num_input_tokens_seen": 507353920, "step": 560, "train_runtime": 73610.9277, "train_tokens_per_second": 6892.372 }, { "epoch": 0.47957559681697615, "grad_norm": 1.0743184753428263, "learning_rate": 4.9464918745534e-05, "loss": 0.8325, "num_input_tokens_seen": 511882560, "step": 565, "train_runtime": 74223.3431, "train_tokens_per_second": 6896.517 }, { "epoch": 0.48381962864721484, "grad_norm": 1.1038161960566173, "learning_rate": 4.943923216692064e-05, "loss": 0.834, "num_input_tokens_seen": 516353792, "step": 570, "train_runtime": 74834.7805, "train_tokens_per_second": 6899.917 }, { "epoch": 0.4880636604774536, "grad_norm": 1.0619822713768314, "learning_rate": 4.941295040169692e-05, "loss": 0.8388, "num_input_tokens_seen": 520893376, "step": 575, "train_runtime": 75515.5945, "train_tokens_per_second": 6897.825 }, { "epoch": 0.49230769230769234, "grad_norm": 0.9968217355531681, "learning_rate": 4.938607408992201e-05, "loss": 0.8393, "num_input_tokens_seen": 525369600, "step": 580, "train_runtime": 76150.219, "train_tokens_per_second": 6899.121 }, { "epoch": 0.496551724137931, "grad_norm": 1.081156576705322, "learning_rate": 4.9358603886134516e-05, "loss": 0.8227, "num_input_tokens_seen": 529878080, "step": 585, "train_runtime": 76814.7653, "train_tokens_per_second": 6898.128 }, { "epoch": 0.5007957559681697, "grad_norm": 0.9811791489788025, "learning_rate": 4.9330540459336536e-05, "loss": 0.8409, "num_input_tokens_seen": 534499648, "step": 590, "train_runtime": 77463.1501, "train_tokens_per_second": 6900.051 }, { "epoch": 0.5007957559681697, "eval_loss": 0.8492689728736877, "eval_runtime": 1055.1977, "eval_samples_per_second": 2.888, "eval_steps_per_second": 0.091, "num_input_tokens_seen": 534499648, "step": 590 }, { "epoch": 0.5050397877984085, "grad_norm": 1.0876099733444793, "learning_rate": 4.930188449297737e-05, "loss": 0.8384, "num_input_tokens_seen": 538899968, "step": 595, "train_runtime": 79158.0223, "train_tokens_per_second": 6807.901 }, { "epoch": 0.5092838196286472, "grad_norm": 0.999155054979559, "learning_rate": 4.927263668493683e-05, "loss": 0.8359, "num_input_tokens_seen": 543296704, "step": 600, "train_runtime": 79818.1806, "train_tokens_per_second": 6806.679 }, { "epoch": 0.5135278514588859, "grad_norm": 1.3228294516057693, "learning_rate": 4.924279774750835e-05, "loss": 0.8315, "num_input_tokens_seen": 548007296, "step": 605, "train_runtime": 80478.8962, "train_tokens_per_second": 6809.329 }, { "epoch": 0.5177718832891247, "grad_norm": 1.1178865175204313, "learning_rate": 4.9212368407381515e-05, "loss": 0.8577, "num_input_tokens_seen": 552534656, "step": 610, "train_runtime": 81169.77, "train_tokens_per_second": 6807.148 }, { "epoch": 0.5220159151193634, "grad_norm": 1.1717734488513787, "learning_rate": 4.9181349405624444e-05, "loss": 0.8449, "num_input_tokens_seen": 557040512, "step": 615, "train_runtime": 81818.0476, "train_tokens_per_second": 6808.284 }, { "epoch": 0.5262599469496021, "grad_norm": 1.281506485794031, "learning_rate": 4.9149741497665724e-05, "loss": 0.8236, "num_input_tokens_seen": 561632640, "step": 620, "train_runtime": 82448.1767, "train_tokens_per_second": 6811.947 }, { "epoch": 0.5305039787798409, "grad_norm": 1.2084706718767035, "learning_rate": 4.9117545453276016e-05, "loss": 0.8396, "num_input_tokens_seen": 566108032, "step": 625, "train_runtime": 83077.9364, "train_tokens_per_second": 6814.18 }, { "epoch": 0.5347480106100796, "grad_norm": 0.9983384620282137, "learning_rate": 4.908476205654926e-05, "loss": 0.8534, "num_input_tokens_seen": 570380992, "step": 630, "train_runtime": 83704.2336, "train_tokens_per_second": 6814.243 }, { "epoch": 0.5389920424403183, "grad_norm": 0.9073500421909143, "learning_rate": 4.905139210588367e-05, "loss": 0.8345, "num_input_tokens_seen": 574750656, "step": 635, "train_runtime": 84320.0888, "train_tokens_per_second": 6816.296 }, { "epoch": 0.543236074270557, "grad_norm": 0.8991742233452803, "learning_rate": 4.9017436413962214e-05, "loss": 0.8238, "num_input_tokens_seen": 579340672, "step": 640, "train_runtime": 84974.8594, "train_tokens_per_second": 6817.789 }, { "epoch": 0.5474801061007958, "grad_norm": 4.653712662762566, "learning_rate": 4.898289580773284e-05, "loss": 0.8246, "num_input_tokens_seen": 583953984, "step": 645, "train_runtime": 85620.5078, "train_tokens_per_second": 6820.258 }, { "epoch": 0.5517241379310345, "grad_norm": 1.2360522636755376, "learning_rate": 4.8947771128388375e-05, "loss": 0.8556, "num_input_tokens_seen": 588613760, "step": 650, "train_runtime": 86304.8212, "train_tokens_per_second": 6820.172 }, { "epoch": 0.5559681697612732, "grad_norm": 1.012520096736992, "learning_rate": 4.891206323134598e-05, "loss": 0.8536, "num_input_tokens_seen": 593383872, "step": 655, "train_runtime": 87061.5623, "train_tokens_per_second": 6815.681 }, { "epoch": 0.560212201591512, "grad_norm": 0.8267761442768032, "learning_rate": 4.887577298622635e-05, "loss": 0.8353, "num_input_tokens_seen": 597805376, "step": 660, "train_runtime": 87694.9854, "train_tokens_per_second": 6816.871 }, { "epoch": 0.5644562334217507, "grad_norm": 0.7725220551193656, "learning_rate": 4.883890127683255e-05, "loss": 0.8328, "num_input_tokens_seen": 602285312, "step": 665, "train_runtime": 88327.7078, "train_tokens_per_second": 6818.759 }, { "epoch": 0.5687002652519894, "grad_norm": 1.0161234053244246, "learning_rate": 4.8801449001128455e-05, "loss": 0.8292, "num_input_tokens_seen": 606832384, "step": 670, "train_runtime": 88954.121, "train_tokens_per_second": 6821.858 }, { "epoch": 0.5729442970822282, "grad_norm": 0.9710515463242312, "learning_rate": 4.87634170712169e-05, "loss": 0.8299, "num_input_tokens_seen": 611343936, "step": 675, "train_runtime": 89596.2203, "train_tokens_per_second": 6823.323 }, { "epoch": 0.5771883289124669, "grad_norm": 0.877400582973328, "learning_rate": 4.872480641331747e-05, "loss": 0.8233, "num_input_tokens_seen": 615967936, "step": 680, "train_runtime": 90249.7743, "train_tokens_per_second": 6825.147 }, { "epoch": 0.5814323607427055, "grad_norm": 1.088037474445821, "learning_rate": 4.868561796774394e-05, "loss": 0.834, "num_input_tokens_seen": 620611584, "step": 685, "train_runtime": 90936.5491, "train_tokens_per_second": 6824.666 }, { "epoch": 0.5856763925729443, "grad_norm": 1.1907419217852628, "learning_rate": 4.8645852688881355e-05, "loss": 0.8409, "num_input_tokens_seen": 624875392, "step": 690, "train_runtime": 91496.14, "train_tokens_per_second": 6829.527 }, { "epoch": 0.589920424403183, "grad_norm": 1.050185541897206, "learning_rate": 4.860551154516285e-05, "loss": 0.8312, "num_input_tokens_seen": 629393280, "step": 695, "train_runtime": 92108.2917, "train_tokens_per_second": 6833.188 }, { "epoch": 0.5941644562334217, "grad_norm": 1.010783386950393, "learning_rate": 4.856459551904597e-05, "loss": 0.8435, "num_input_tokens_seen": 633982080, "step": 700, "train_runtime": 92773.246, "train_tokens_per_second": 6833.674 }, { "epoch": 0.5984084880636604, "grad_norm": 0.8655815203324599, "learning_rate": 4.8523105606988835e-05, "loss": 0.8341, "num_input_tokens_seen": 638403328, "step": 705, "train_runtime": 93409.0451, "train_tokens_per_second": 6834.492 }, { "epoch": 0.6026525198938992, "grad_norm": 1.0223867573550975, "learning_rate": 4.84810428194258e-05, "loss": 0.8298, "num_input_tokens_seen": 643227008, "step": 710, "train_runtime": 94093.6295, "train_tokens_per_second": 6836.031 }, { "epoch": 0.6068965517241379, "grad_norm": 0.9038681896334841, "learning_rate": 4.8438408180742894e-05, "loss": 0.8236, "num_input_tokens_seen": 647670528, "step": 715, "train_runtime": 94754.1626, "train_tokens_per_second": 6835.273 }, { "epoch": 0.6111405835543766, "grad_norm": 0.8915747409475175, "learning_rate": 4.839520272925286e-05, "loss": 0.8321, "num_input_tokens_seen": 652249152, "step": 720, "train_runtime": 95432.8253, "train_tokens_per_second": 6834.642 }, { "epoch": 0.6153846153846154, "grad_norm": 0.97865694250563, "learning_rate": 4.835142751716986e-05, "loss": 0.8209, "num_input_tokens_seen": 656865472, "step": 725, "train_runtime": 96098.174, "train_tokens_per_second": 6835.359 }, { "epoch": 0.6196286472148541, "grad_norm": 1.059303689963251, "learning_rate": 4.8307083610583846e-05, "loss": 0.8402, "num_input_tokens_seen": 661394048, "step": 730, "train_runtime": 96740.313, "train_tokens_per_second": 6836.799 }, { "epoch": 0.6238726790450928, "grad_norm": 1.0997482317899427, "learning_rate": 4.8262172089434635e-05, "loss": 0.8172, "num_input_tokens_seen": 666086336, "step": 735, "train_runtime": 97449.6172, "train_tokens_per_second": 6835.187 }, { "epoch": 0.6281167108753316, "grad_norm": 1.0533663604386134, "learning_rate": 4.8216694047485554e-05, "loss": 0.8044, "num_input_tokens_seen": 670654912, "step": 740, "train_runtime": 98101.755, "train_tokens_per_second": 6836.319 }, { "epoch": 0.6323607427055703, "grad_norm": 0.9308590706085377, "learning_rate": 4.817065059229682e-05, "loss": 0.8285, "num_input_tokens_seen": 675148480, "step": 745, "train_runtime": 98698.5544, "train_tokens_per_second": 6840.51 }, { "epoch": 0.636604774535809, "grad_norm": 1.3308817219838736, "learning_rate": 4.812404284519861e-05, "loss": 0.8158, "num_input_tokens_seen": 679716288, "step": 750, "train_runtime": 99373.7407, "train_tokens_per_second": 6839.999 }, { "epoch": 0.6408488063660478, "grad_norm": 0.9405621214215828, "learning_rate": 4.8076871941263676e-05, "loss": 0.8149, "num_input_tokens_seen": 684227584, "step": 755, "train_runtime": 100016.4536, "train_tokens_per_second": 6841.15 }, { "epoch": 0.6450928381962865, "grad_norm": 0.8872116598330962, "learning_rate": 4.8029139029279785e-05, "loss": 0.826, "num_input_tokens_seen": 688566720, "step": 760, "train_runtime": 100660.3191, "train_tokens_per_second": 6840.498 }, { "epoch": 0.6493368700265252, "grad_norm": 0.9148948434386133, "learning_rate": 4.798084527172167e-05, "loss": 0.8166, "num_input_tokens_seen": 693048704, "step": 765, "train_runtime": 101289.7336, "train_tokens_per_second": 6842.24 }, { "epoch": 0.6535809018567639, "grad_norm": 1.0036481967491233, "learning_rate": 4.793199184472274e-05, "loss": 0.7923, "num_input_tokens_seen": 697787008, "step": 770, "train_runtime": 101976.2461, "train_tokens_per_second": 6842.643 }, { "epoch": 0.6578249336870027, "grad_norm": 1.2742858954042269, "learning_rate": 4.7882579938046485e-05, "loss": 0.8215, "num_input_tokens_seen": 702239936, "step": 775, "train_runtime": 102610.041, "train_tokens_per_second": 6843.774 }, { "epoch": 0.6620689655172414, "grad_norm": 0.8006816309417074, "learning_rate": 4.783261075505743e-05, "loss": 0.8246, "num_input_tokens_seen": 706860352, "step": 780, "train_runtime": 103308.8254, "train_tokens_per_second": 6842.207 }, { "epoch": 0.6663129973474801, "grad_norm": 0.965961260253803, "learning_rate": 4.7782085512691875e-05, "loss": 0.8038, "num_input_tokens_seen": 711270720, "step": 785, "train_runtime": 103950.3781, "train_tokens_per_second": 6842.406 }, { "epoch": 0.6705570291777189, "grad_norm": 1.0034852088994206, "learning_rate": 4.7731005441428233e-05, "loss": 0.8138, "num_input_tokens_seen": 715836544, "step": 790, "train_runtime": 104619.9901, "train_tokens_per_second": 6842.254 }, { "epoch": 0.6748010610079576, "grad_norm": 1.0147223046467029, "learning_rate": 4.767937178525709e-05, "loss": 0.8025, "num_input_tokens_seen": 720186176, "step": 795, "train_runtime": 105243.2657, "train_tokens_per_second": 6843.062 }, { "epoch": 0.6790450928381963, "grad_norm": 0.926457311442124, "learning_rate": 4.7627185801650856e-05, "loss": 0.7936, "num_input_tokens_seen": 724606016, "step": 800, "train_runtime": 105887.5277, "train_tokens_per_second": 6843.167 }, { "epoch": 0.683289124668435, "grad_norm": 0.8015225346610259, "learning_rate": 4.757444876153323e-05, "loss": 0.8111, "num_input_tokens_seen": 729128640, "step": 805, "train_runtime": 106518.7609, "train_tokens_per_second": 6845.072 }, { "epoch": 0.6875331564986737, "grad_norm": 0.9590554887765099, "learning_rate": 4.752116194924816e-05, "loss": 0.817, "num_input_tokens_seen": 733513856, "step": 810, "train_runtime": 107154.5457, "train_tokens_per_second": 6845.383 }, { "epoch": 0.6917771883289124, "grad_norm": 1.252406369714219, "learning_rate": 4.746732666252861e-05, "loss": 0.8036, "num_input_tokens_seen": 737837376, "step": 815, "train_runtime": 107786.6851, "train_tokens_per_second": 6845.348 }, { "epoch": 0.6960212201591512, "grad_norm": 1.1913694647387847, "learning_rate": 4.7412944212464935e-05, "loss": 0.8184, "num_input_tokens_seen": 742147072, "step": 820, "train_runtime": 108401.4826, "train_tokens_per_second": 6846.282 }, { "epoch": 0.7002652519893899, "grad_norm": 0.9763850021800689, "learning_rate": 4.7358015923472986e-05, "loss": 0.8044, "num_input_tokens_seen": 747045952, "step": 825, "train_runtime": 109094.785, "train_tokens_per_second": 6847.678 }, { "epoch": 0.7045092838196286, "grad_norm": 0.9052886472757133, "learning_rate": 4.730254313326181e-05, "loss": 0.8081, "num_input_tokens_seen": 751393984, "step": 830, "train_runtime": 109732.5528, "train_tokens_per_second": 6847.503 }, { "epoch": 0.7087533156498673, "grad_norm": 0.8281556672805458, "learning_rate": 4.724652719280111e-05, "loss": 0.7982, "num_input_tokens_seen": 756075328, "step": 835, "train_runtime": 110411.1601, "train_tokens_per_second": 6847.816 }, { "epoch": 0.7129973474801061, "grad_norm": 0.971171145956504, "learning_rate": 4.718996946628829e-05, "loss": 0.7825, "num_input_tokens_seen": 760528320, "step": 840, "train_runtime": 111055.3652, "train_tokens_per_second": 6848.191 }, { "epoch": 0.7172413793103448, "grad_norm": 0.9931373968227002, "learning_rate": 4.713287133111533e-05, "loss": 0.8096, "num_input_tokens_seen": 765244928, "step": 845, "train_runtime": 111727.8513, "train_tokens_per_second": 6849.187 }, { "epoch": 0.7214854111405835, "grad_norm": 1.2834237882216515, "learning_rate": 4.707523417783511e-05, "loss": 0.7948, "num_input_tokens_seen": 769642624, "step": 850, "train_runtime": 112389.6421, "train_tokens_per_second": 6847.985 }, { "epoch": 0.7257294429708223, "grad_norm": 0.8486465229852926, "learning_rate": 4.701705941012767e-05, "loss": 0.8044, "num_input_tokens_seen": 774147136, "step": 855, "train_runtime": 113005.1043, "train_tokens_per_second": 6850.55 }, { "epoch": 0.729973474801061, "grad_norm": 0.7791940973514704, "learning_rate": 4.6958348444765954e-05, "loss": 0.7998, "num_input_tokens_seen": 778752064, "step": 860, "train_runtime": 113685.5974, "train_tokens_per_second": 6850.05 }, { "epoch": 0.7342175066312997, "grad_norm": 1.1164402590095137, "learning_rate": 4.689910271158131e-05, "loss": 0.8177, "num_input_tokens_seen": 783091968, "step": 865, "train_runtime": 114295.4264, "train_tokens_per_second": 6851.472 }, { "epoch": 0.7384615384615385, "grad_norm": 0.9094005817671243, "learning_rate": 4.6839323653428693e-05, "loss": 0.8154, "num_input_tokens_seen": 787572544, "step": 870, "train_runtime": 114937.7188, "train_tokens_per_second": 6852.168 }, { "epoch": 0.7427055702917772, "grad_norm": 1.0703160970060077, "learning_rate": 4.677901272615149e-05, "loss": 0.8013, "num_input_tokens_seen": 791977152, "step": 875, "train_runtime": 115587.8811, "train_tokens_per_second": 6851.732 }, { "epoch": 0.7469496021220159, "grad_norm": 0.749724050960587, "learning_rate": 4.6718171398546136e-05, "loss": 0.7849, "num_input_tokens_seen": 796372864, "step": 880, "train_runtime": 116239.6888, "train_tokens_per_second": 6851.127 }, { "epoch": 0.7511936339522547, "grad_norm": 0.8931070149936695, "learning_rate": 4.6656801152326244e-05, "loss": 0.7947, "num_input_tokens_seen": 800903424, "step": 885, "train_runtime": 116882.2446, "train_tokens_per_second": 6852.225 }, { "epoch": 0.7511936339522547, "eval_loss": 0.8023512363433838, "eval_runtime": 1055.9576, "eval_samples_per_second": 2.886, "eval_steps_per_second": 0.091, "num_input_tokens_seen": 800903424, "step": 885 }, { "epoch": 0.7554376657824934, "grad_norm": 0.8326667759990252, "learning_rate": 4.6594903482086605e-05, "loss": 0.7865, "num_input_tokens_seen": 805358784, "step": 890, "train_runtime": 118552.9366, "train_tokens_per_second": 6793.242 }, { "epoch": 0.7596816976127321, "grad_norm": 0.8171872300696933, "learning_rate": 4.653247989526675e-05, "loss": 0.811, "num_input_tokens_seen": 809917248, "step": 895, "train_runtime": 119241.0553, "train_tokens_per_second": 6792.268 }, { "epoch": 0.7639257294429708, "grad_norm": 0.9066917659881738, "learning_rate": 4.646953191211422e-05, "loss": 0.7936, "num_input_tokens_seen": 814265600, "step": 900, "train_runtime": 119851.6659, "train_tokens_per_second": 6793.945 }, { "epoch": 0.7681697612732096, "grad_norm": 0.9338261305365108, "learning_rate": 4.640606106564759e-05, "loss": 0.8104, "num_input_tokens_seen": 818678144, "step": 905, "train_runtime": 120471.0976, "train_tokens_per_second": 6795.639 }, { "epoch": 0.7724137931034483, "grad_norm": 0.8719603501589874, "learning_rate": 4.6342068901619115e-05, "loss": 0.7913, "num_input_tokens_seen": 823195328, "step": 910, "train_runtime": 121127.4303, "train_tokens_per_second": 6796.11 }, { "epoch": 0.776657824933687, "grad_norm": 1.0213034444078577, "learning_rate": 4.6277556978477063e-05, "loss": 0.8081, "num_input_tokens_seen": 827899840, "step": 915, "train_runtime": 121853.2081, "train_tokens_per_second": 6794.239 }, { "epoch": 0.7809018567639258, "grad_norm": 1.0336867967280456, "learning_rate": 4.6212526867327785e-05, "loss": 0.8118, "num_input_tokens_seen": 832394688, "step": 920, "train_runtime": 122480.5431, "train_tokens_per_second": 6796.138 }, { "epoch": 0.7851458885941645, "grad_norm": 1.0694313116089975, "learning_rate": 4.614698015189744e-05, "loss": 0.8067, "num_input_tokens_seen": 837091072, "step": 925, "train_runtime": 123144.408, "train_tokens_per_second": 6797.638 }, { "epoch": 0.7893899204244031, "grad_norm": 0.9268002608848666, "learning_rate": 4.6080918428493447e-05, "loss": 0.7948, "num_input_tokens_seen": 841442112, "step": 930, "train_runtime": 123780.7152, "train_tokens_per_second": 6797.845 }, { "epoch": 0.793633952254642, "grad_norm": 0.9823980696238214, "learning_rate": 4.601434330596557e-05, "loss": 0.7926, "num_input_tokens_seen": 845885504, "step": 935, "train_runtime": 124463.32, "train_tokens_per_second": 6796.263 }, { "epoch": 0.7978779840848806, "grad_norm": 1.0766192496117053, "learning_rate": 4.594725640566679e-05, "loss": 0.8019, "num_input_tokens_seen": 850419456, "step": 940, "train_runtime": 125130.694, "train_tokens_per_second": 6796.25 }, { "epoch": 0.8021220159151193, "grad_norm": 1.0293248834632993, "learning_rate": 4.5879659361413754e-05, "loss": 0.8065, "num_input_tokens_seen": 854885120, "step": 945, "train_runtime": 125769.8721, "train_tokens_per_second": 6797.217 }, { "epoch": 0.8063660477453581, "grad_norm": 1.0089948918046479, "learning_rate": 4.581155381944705e-05, "loss": 0.8084, "num_input_tokens_seen": 859518400, "step": 950, "train_runtime": 126468.1103, "train_tokens_per_second": 6796.325 }, { "epoch": 0.8106100795755968, "grad_norm": 0.9051441031845476, "learning_rate": 4.574294143839107e-05, "loss": 0.7832, "num_input_tokens_seen": 863890816, "step": 955, "train_runtime": 127085.4612, "train_tokens_per_second": 6797.716 }, { "epoch": 0.8148541114058355, "grad_norm": 0.8826069009760195, "learning_rate": 4.567382388921363e-05, "loss": 0.8055, "num_input_tokens_seen": 868430208, "step": 960, "train_runtime": 127753.0201, "train_tokens_per_second": 6797.727 }, { "epoch": 0.8190981432360742, "grad_norm": 0.9189235994594633, "learning_rate": 4.560420285518529e-05, "loss": 0.8076, "num_input_tokens_seen": 873261376, "step": 965, "train_runtime": 128445.48, "train_tokens_per_second": 6798.693 }, { "epoch": 0.823342175066313, "grad_norm": 0.6649154972034895, "learning_rate": 4.5534080031838336e-05, "loss": 0.8748, "num_input_tokens_seen": 877848320, "step": 970, "train_runtime": 129117.949, "train_tokens_per_second": 6798.809 }, { "epoch": 0.8275862068965517, "grad_norm": 0.7645668184363056, "learning_rate": 4.5463457126925493e-05, "loss": 0.7949, "num_input_tokens_seen": 882236288, "step": 975, "train_runtime": 129732.9954, "train_tokens_per_second": 6800.4 }, { "epoch": 0.8318302387267904, "grad_norm": 0.9335431624509639, "learning_rate": 4.539233586037836e-05, "loss": 0.7904, "num_input_tokens_seen": 886710592, "step": 980, "train_runtime": 130375.8011, "train_tokens_per_second": 6801.19 }, { "epoch": 0.8360742705570292, "grad_norm": 0.8454411533111347, "learning_rate": 4.532071796426549e-05, "loss": 0.7944, "num_input_tokens_seen": 891233088, "step": 985, "train_runtime": 131026.9147, "train_tokens_per_second": 6801.909 }, { "epoch": 0.8403183023872679, "grad_norm": 0.9294276165792987, "learning_rate": 4.5248605182750224e-05, "loss": 0.8052, "num_input_tokens_seen": 895797056, "step": 990, "train_runtime": 131628.0839, "train_tokens_per_second": 6805.516 }, { "epoch": 0.8445623342175066, "grad_norm": 1.0688757710466947, "learning_rate": 4.5175999272048205e-05, "loss": 0.7871, "num_input_tokens_seen": 900477248, "step": 995, "train_runtime": 132307.0252, "train_tokens_per_second": 6805.967 }, { "epoch": 0.8488063660477454, "grad_norm": 0.8274182681903438, "learning_rate": 4.510290200038463e-05, "loss": 0.8022, "num_input_tokens_seen": 905019392, "step": 1000, "train_runtime": 132955.1592, "train_tokens_per_second": 6806.952 }, { "epoch": 0.8530503978779841, "grad_norm": 1.0546997932680735, "learning_rate": 4.502931514795116e-05, "loss": 0.7817, "num_input_tokens_seen": 909356352, "step": 1005, "train_runtime": 133539.5693, "train_tokens_per_second": 6809.64 }, { "epoch": 0.8572944297082228, "grad_norm": 1.2588102790900502, "learning_rate": 4.495524050686257e-05, "loss": 0.788, "num_input_tokens_seen": 913771904, "step": 1010, "train_runtime": 134129.5941, "train_tokens_per_second": 6812.605 }, { "epoch": 0.8615384615384616, "grad_norm": 1.1408803201963393, "learning_rate": 4.488067988111313e-05, "loss": 0.8001, "num_input_tokens_seen": 918194944, "step": 1015, "train_runtime": 134742.2212, "train_tokens_per_second": 6814.456 }, { "epoch": 0.8657824933687003, "grad_norm": 1.1899291730699249, "learning_rate": 4.480563508653264e-05, "loss": 0.7955, "num_input_tokens_seen": 922666688, "step": 1020, "train_runtime": 135362.6791, "train_tokens_per_second": 6816.256 }, { "epoch": 0.870026525198939, "grad_norm": 0.8130281973510006, "learning_rate": 4.473010795074221e-05, "loss": 0.7979, "num_input_tokens_seen": 927156672, "step": 1025, "train_runtime": 136033.28, "train_tokens_per_second": 6815.661 }, { "epoch": 0.8742705570291777, "grad_norm": 0.7570121360764169, "learning_rate": 4.465410031310979e-05, "loss": 0.8073, "num_input_tokens_seen": 931890368, "step": 1030, "train_runtime": 136755.15, "train_tokens_per_second": 6814.298 }, { "epoch": 0.8785145888594165, "grad_norm": 0.9159403793732956, "learning_rate": 4.457761402470532e-05, "loss": 0.8, "num_input_tokens_seen": 936323264, "step": 1035, "train_runtime": 137383.018, "train_tokens_per_second": 6815.422 }, { "epoch": 0.8827586206896552, "grad_norm": 1.0377351854287133, "learning_rate": 4.450065094825567e-05, "loss": 0.801, "num_input_tokens_seen": 940907840, "step": 1040, "train_runtime": 138076.6951, "train_tokens_per_second": 6814.386 }, { "epoch": 0.8870026525198939, "grad_norm": 0.8502067435449593, "learning_rate": 4.442321295809932e-05, "loss": 0.7884, "num_input_tokens_seen": 945377920, "step": 1045, "train_runtime": 138693.9405, "train_tokens_per_second": 6816.289 }, { "epoch": 0.8912466843501327, "grad_norm": 1.0008412143003285, "learning_rate": 4.4345301940140625e-05, "loss": 0.794, "num_input_tokens_seen": 949653760, "step": 1050, "train_runtime": 139252.8781, "train_tokens_per_second": 6819.635 }, { "epoch": 0.8954907161803713, "grad_norm": 0.7701062680223436, "learning_rate": 4.426691979180395e-05, "loss": 0.7879, "num_input_tokens_seen": 953995840, "step": 1055, "train_runtime": 139848.6279, "train_tokens_per_second": 6821.632 }, { "epoch": 0.89973474801061, "grad_norm": 0.7657676098694198, "learning_rate": 4.4188068421987475e-05, "loss": 0.78, "num_input_tokens_seen": 958380160, "step": 1060, "train_runtime": 140471.8443, "train_tokens_per_second": 6822.578 }, { "epoch": 0.9039787798408488, "grad_norm": 0.9443174140002766, "learning_rate": 4.410874975101662e-05, "loss": 0.7975, "num_input_tokens_seen": 962938624, "step": 1065, "train_runtime": 141147.4618, "train_tokens_per_second": 6822.217 }, { "epoch": 0.9082228116710875, "grad_norm": 1.0109837282388179, "learning_rate": 4.402896571059738e-05, "loss": 0.7979, "num_input_tokens_seen": 967324608, "step": 1070, "train_runtime": 141768.6116, "train_tokens_per_second": 6823.264 }, { "epoch": 0.9124668435013262, "grad_norm": 0.8120979404556385, "learning_rate": 4.394871824376923e-05, "loss": 0.7889, "num_input_tokens_seen": 971853824, "step": 1075, "train_runtime": 142397.8849, "train_tokens_per_second": 6824.918 }, { "epoch": 0.916710875331565, "grad_norm": 0.7674620546551851, "learning_rate": 4.386800930485777e-05, "loss": 0.7872, "num_input_tokens_seen": 976342336, "step": 1080, "train_runtime": 143033.2803, "train_tokens_per_second": 6825.98 }, { "epoch": 0.9209549071618037, "grad_norm": 0.8642492390279295, "learning_rate": 4.378684085942722e-05, "loss": 0.7968, "num_input_tokens_seen": 980950016, "step": 1085, "train_runtime": 143727.3251, "train_tokens_per_second": 6825.077 }, { "epoch": 0.9251989389920424, "grad_norm": 0.7868533581684258, "learning_rate": 4.370521488423248e-05, "loss": 0.7723, "num_input_tokens_seen": 985579968, "step": 1090, "train_runtime": 144414.0117, "train_tokens_per_second": 6824.684 }, { "epoch": 0.9294429708222812, "grad_norm": 0.9784190030448764, "learning_rate": 4.3623133367171e-05, "loss": 0.7657, "num_input_tokens_seen": 990242240, "step": 1095, "train_runtime": 145104.4504, "train_tokens_per_second": 6824.341 }, { "epoch": 0.9336870026525199, "grad_norm": 0.8416491217730794, "learning_rate": 4.354059830723439e-05, "loss": 0.7762, "num_input_tokens_seen": 994700352, "step": 1100, "train_runtime": 145724.9371, "train_tokens_per_second": 6825.876 }, { "epoch": 0.9379310344827586, "grad_norm": 0.8128690749017204, "learning_rate": 4.34576117144597e-05, "loss": 0.7872, "num_input_tokens_seen": 999373568, "step": 1105, "train_runtime": 146429.9411, "train_tokens_per_second": 6824.926 }, { "epoch": 0.9421750663129973, "grad_norm": 0.8795752491352558, "learning_rate": 4.337417560988053e-05, "loss": 0.7907, "num_input_tokens_seen": 1003937216, "step": 1110, "train_runtime": 147068.6613, "train_tokens_per_second": 6826.316 }, { "epoch": 0.9464190981432361, "grad_norm": 1.0803855481863844, "learning_rate": 4.329029202547774e-05, "loss": 0.7802, "num_input_tokens_seen": 1008544768, "step": 1115, "train_runtime": 147756.5973, "train_tokens_per_second": 6825.717 }, { "epoch": 0.9506631299734748, "grad_norm": 0.9249709902148427, "learning_rate": 4.3205963004130016e-05, "loss": 0.7835, "num_input_tokens_seen": 1013050048, "step": 1120, "train_runtime": 148402.3493, "train_tokens_per_second": 6826.375 }, { "epoch": 0.9549071618037135, "grad_norm": 0.8556253260160931, "learning_rate": 4.3121190599564075e-05, "loss": 0.7797, "num_input_tokens_seen": 1017459840, "step": 1125, "train_runtime": 149027.5856, "train_tokens_per_second": 6827.326 }, { "epoch": 0.9591511936339523, "grad_norm": 1.0166848790525587, "learning_rate": 4.30359768763047e-05, "loss": 0.7676, "num_input_tokens_seen": 1022257088, "step": 1130, "train_runtime": 149712.8732, "train_tokens_per_second": 6828.117 }, { "epoch": 0.963395225464191, "grad_norm": 0.9097894747416901, "learning_rate": 4.2950323909624404e-05, "loss": 0.7736, "num_input_tokens_seen": 1026797248, "step": 1135, "train_runtime": 150374.5156, "train_tokens_per_second": 6828.266 }, { "epoch": 0.9676392572944297, "grad_norm": 0.7704218910218441, "learning_rate": 4.286423378549294e-05, "loss": 0.7899, "num_input_tokens_seen": 1031489344, "step": 1140, "train_runtime": 151058.329, "train_tokens_per_second": 6828.418 }, { "epoch": 0.9718832891246685, "grad_norm": 0.7852167446402514, "learning_rate": 4.2777708600526475e-05, "loss": 0.7825, "num_input_tokens_seen": 1035924096, "step": 1145, "train_runtime": 151677.6959, "train_tokens_per_second": 6829.772 }, { "epoch": 0.9761273209549072, "grad_norm": 0.8709541370781978, "learning_rate": 4.269075046193651e-05, "loss": 0.7853, "num_input_tokens_seen": 1040557120, "step": 1150, "train_runtime": 152331.7174, "train_tokens_per_second": 6830.863 }, { "epoch": 0.9803713527851459, "grad_norm": 0.7354772314266763, "learning_rate": 4.2603361487478635e-05, "loss": 0.7796, "num_input_tokens_seen": 1045138240, "step": 1155, "train_runtime": 153014.033, "train_tokens_per_second": 6830.342 }, { "epoch": 0.9846153846153847, "grad_norm": 0.8929842185397294, "learning_rate": 4.2515543805400845e-05, "loss": 0.7931, "num_input_tokens_seen": 1049637440, "step": 1160, "train_runtime": 153673.9541, "train_tokens_per_second": 6830.288 }, { "epoch": 0.9888594164456234, "grad_norm": 0.8843306924811901, "learning_rate": 4.2427299554391795e-05, "loss": 0.7818, "num_input_tokens_seen": 1054084800, "step": 1165, "train_runtime": 154301.843, "train_tokens_per_second": 6831.317 }, { "epoch": 0.993103448275862, "grad_norm": 0.7260294542114571, "learning_rate": 4.2338630883528694e-05, "loss": 0.7868, "num_input_tokens_seen": 1058576128, "step": 1170, "train_runtime": 154951.0655, "train_tokens_per_second": 6831.68 }, { "epoch": 0.9973474801061007, "grad_norm": 1.0310167390412928, "learning_rate": 4.224953995222495e-05, "loss": 0.7913, "num_input_tokens_seen": 1063234944, "step": 1175, "train_runtime": 155625.4902, "train_tokens_per_second": 6832.01 }, { "epoch": 1.0008488063660477, "grad_norm": 0.9546542980993455, "learning_rate": 4.2160028930177586e-05, "loss": 0.6435, "num_input_tokens_seen": 1066978304, "step": 1180, "train_runtime": 156195.3002, "train_tokens_per_second": 6831.053 }, { "epoch": 1.0008488063660477, "eval_loss": 0.7810727953910828, "eval_runtime": 1060.3774, "eval_samples_per_second": 2.874, "eval_steps_per_second": 0.091, "num_input_tokens_seen": 1066978304, "step": 1180 }, { "epoch": 1.0050928381962865, "grad_norm": 1.0559718016156323, "learning_rate": 4.207009999731441e-05, "loss": 0.724, "num_input_tokens_seen": 1071397376, "step": 1185, "train_runtime": 157954.8381, "train_tokens_per_second": 6782.935 }, { "epoch": 1.0093368700265253, "grad_norm": 0.8356946223535584, "learning_rate": 4.1979755343740936e-05, "loss": 0.7198, "num_input_tokens_seen": 1075925056, "step": 1190, "train_runtime": 158621.7481, "train_tokens_per_second": 6782.961 }, { "epoch": 1.0135809018567639, "grad_norm": 0.9532551565257256, "learning_rate": 4.188899716968699e-05, "loss": 0.7137, "num_input_tokens_seen": 1080308416, "step": 1195, "train_runtime": 159243.7038, "train_tokens_per_second": 6783.995 }, { "epoch": 1.0178249336870027, "grad_norm": 0.886163908743495, "learning_rate": 4.179782768545321e-05, "loss": 0.6985, "num_input_tokens_seen": 1084861632, "step": 1200, "train_runtime": 159917.1941, "train_tokens_per_second": 6783.896 }, { "epoch": 1.0220689655172415, "grad_norm": 0.8306981011679802, "learning_rate": 4.170624911135713e-05, "loss": 0.7246, "num_input_tokens_seen": 1089273600, "step": 1205, "train_runtime": 160556.71, "train_tokens_per_second": 6784.354 }, { "epoch": 1.02631299734748, "grad_norm": 0.9829055718597421, "learning_rate": 4.161426367767921e-05, "loss": 0.7219, "num_input_tokens_seen": 1093625472, "step": 1210, "train_runtime": 161182.2827, "train_tokens_per_second": 6785.023 }, { "epoch": 1.0305570291777189, "grad_norm": 0.9263411832733288, "learning_rate": 4.1521873624608396e-05, "loss": 0.7293, "num_input_tokens_seen": 1098036992, "step": 1215, "train_runtime": 161813.3681, "train_tokens_per_second": 6785.824 }, { "epoch": 1.0348010610079577, "grad_norm": 1.3238936418341176, "learning_rate": 4.1429081202187667e-05, "loss": 0.7214, "num_input_tokens_seen": 1102560128, "step": 1220, "train_runtime": 162471.5936, "train_tokens_per_second": 6786.172 }, { "epoch": 1.0390450928381962, "grad_norm": 0.822682923114858, "learning_rate": 4.1335888670259196e-05, "loss": 0.704, "num_input_tokens_seen": 1107129920, "step": 1225, "train_runtime": 163123.1296, "train_tokens_per_second": 6787.081 }, { "epoch": 1.043289124668435, "grad_norm": 0.9140492461674421, "learning_rate": 4.12422982984093e-05, "loss": 0.7199, "num_input_tokens_seen": 1111822080, "step": 1230, "train_runtime": 163825.6741, "train_tokens_per_second": 6786.617 }, { "epoch": 1.0475331564986738, "grad_norm": 0.8760621488535013, "learning_rate": 4.11483123659132e-05, "loss": 0.7055, "num_input_tokens_seen": 1116287808, "step": 1235, "train_runtime": 164439.6584, "train_tokens_per_second": 6788.434 }, { "epoch": 1.0517771883289124, "grad_norm": 0.8280642864840718, "learning_rate": 4.1053933161679494e-05, "loss": 0.7235, "num_input_tokens_seen": 1120773120, "step": 1240, "train_runtime": 165072.1322, "train_tokens_per_second": 6789.596 }, { "epoch": 1.0560212201591512, "grad_norm": 0.9389411578202482, "learning_rate": 4.095916298419441e-05, "loss": 0.7058, "num_input_tokens_seen": 1125450432, "step": 1245, "train_runtime": 165746.363, "train_tokens_per_second": 6790.197 }, { "epoch": 1.06026525198939, "grad_norm": 0.8674975290576171, "learning_rate": 4.0864004141465844e-05, "loss": 0.7144, "num_input_tokens_seen": 1129770880, "step": 1250, "train_runtime": 166381.7549, "train_tokens_per_second": 6790.233 }, { "epoch": 1.0645092838196286, "grad_norm": 0.8423637185173262, "learning_rate": 4.0768458950967135e-05, "loss": 0.6924, "num_input_tokens_seen": 1134325824, "step": 1255, "train_runtime": 167041.841, "train_tokens_per_second": 6790.669 }, { "epoch": 1.0687533156498674, "grad_norm": 0.8463731702794449, "learning_rate": 4.067252973958064e-05, "loss": 0.7022, "num_input_tokens_seen": 1138890880, "step": 1260, "train_runtime": 167698.4028, "train_tokens_per_second": 6791.304 }, { "epoch": 1.072997347480106, "grad_norm": 1.2814601777565326, "learning_rate": 4.0576218843541046e-05, "loss": 0.7244, "num_input_tokens_seen": 1143446080, "step": 1265, "train_runtime": 168389.1765, "train_tokens_per_second": 6790.496 }, { "epoch": 1.0772413793103448, "grad_norm": 0.7804384260432371, "learning_rate": 4.0479528608378515e-05, "loss": 0.7118, "num_input_tokens_seen": 1148022848, "step": 1270, "train_runtime": 169068.4692, "train_tokens_per_second": 6790.284 }, { "epoch": 1.0814854111405836, "grad_norm": 0.8440032711255191, "learning_rate": 4.0382461388861505e-05, "loss": 0.7069, "num_input_tokens_seen": 1152678080, "step": 1275, "train_runtime": 169749.9181, "train_tokens_per_second": 6790.449 }, { "epoch": 1.0857294429708222, "grad_norm": 1.0462601786941845, "learning_rate": 4.0285019548939464e-05, "loss": 0.7009, "num_input_tokens_seen": 1157385088, "step": 1280, "train_runtime": 170412.1829, "train_tokens_per_second": 6791.68 }, { "epoch": 1.089973474801061, "grad_norm": 0.7949896906145127, "learning_rate": 4.018720546168524e-05, "loss": 0.714, "num_input_tokens_seen": 1161843200, "step": 1285, "train_runtime": 171052.7364, "train_tokens_per_second": 6792.31 }, { "epoch": 1.0942175066312998, "grad_norm": 0.8723394657733861, "learning_rate": 4.008902150923731e-05, "loss": 0.7173, "num_input_tokens_seen": 1166275008, "step": 1290, "train_runtime": 171633.6694, "train_tokens_per_second": 6795.141 }, { "epoch": 1.0984615384615384, "grad_norm": 1.271199939246554, "learning_rate": 3.999047008274173e-05, "loss": 0.718, "num_input_tokens_seen": 1170805952, "step": 1295, "train_runtime": 172293.1957, "train_tokens_per_second": 6795.428 }, { "epoch": 1.1027055702917772, "grad_norm": 0.7123662125520419, "learning_rate": 3.989155358229394e-05, "loss": 0.7326, "num_input_tokens_seen": 1175398720, "step": 1300, "train_runtime": 172948.1235, "train_tokens_per_second": 6796.25 }, { "epoch": 1.106949602122016, "grad_norm": 0.838546216114602, "learning_rate": 3.979227441688028e-05, "loss": 0.7096, "num_input_tokens_seen": 1179790336, "step": 1305, "train_runtime": 173532.3842, "train_tokens_per_second": 6798.675 }, { "epoch": 1.1111936339522546, "grad_norm": 1.0578594741413954, "learning_rate": 3.969263500431935e-05, "loss": 0.736, "num_input_tokens_seen": 1184330304, "step": 1310, "train_runtime": 174200.2849, "train_tokens_per_second": 6798.67 }, { "epoch": 1.1154376657824934, "grad_norm": 1.0727364156307728, "learning_rate": 3.9592637771203114e-05, "loss": 0.7149, "num_input_tokens_seen": 1188880384, "step": 1315, "train_runtime": 174872.9271, "train_tokens_per_second": 6798.539 }, { "epoch": 1.1196816976127322, "grad_norm": 1.4213471650077618, "learning_rate": 3.949228515283777e-05, "loss": 0.7044, "num_input_tokens_seen": 1193170816, "step": 1320, "train_runtime": 175486.1393, "train_tokens_per_second": 6799.231 }, { "epoch": 1.1239257294429708, "grad_norm": 0.9528308642512087, "learning_rate": 3.9391579593184525e-05, "loss": 0.7046, "num_input_tokens_seen": 1197641344, "step": 1325, "train_runtime": 176147.7116, "train_tokens_per_second": 6799.074 }, { "epoch": 1.1281697612732096, "grad_norm": 0.8240250587692688, "learning_rate": 3.929052354479999e-05, "loss": 0.7073, "num_input_tokens_seen": 1202042432, "step": 1330, "train_runtime": 176809.4403, "train_tokens_per_second": 6798.52 }, { "epoch": 1.1324137931034484, "grad_norm": 0.9575820680101333, "learning_rate": 3.918911946877651e-05, "loss": 0.7123, "num_input_tokens_seen": 1206438080, "step": 1335, "train_runtime": 177399.569, "train_tokens_per_second": 6800.682 }, { "epoch": 1.136657824933687, "grad_norm": 1.0494568271450921, "learning_rate": 3.908736983468219e-05, "loss": 0.7037, "num_input_tokens_seen": 1211039616, "step": 1340, "train_runtime": 178060.7312, "train_tokens_per_second": 6801.273 }, { "epoch": 1.1409018567639257, "grad_norm": 1.3953016773438447, "learning_rate": 3.898527712050074e-05, "loss": 0.6992, "num_input_tokens_seen": 1215405568, "step": 1345, "train_runtime": 178700.2333, "train_tokens_per_second": 6801.365 }, { "epoch": 1.1451458885941646, "grad_norm": 0.9502009725403173, "learning_rate": 3.88828438125712e-05, "loss": 0.7273, "num_input_tokens_seen": 1220023168, "step": 1350, "train_runtime": 179354.7367, "train_tokens_per_second": 6802.291 }, { "epoch": 1.1493899204244031, "grad_norm": 1.1556987622554753, "learning_rate": 3.878007240552732e-05, "loss": 0.6946, "num_input_tokens_seen": 1224574464, "step": 1355, "train_runtime": 180001.4614, "train_tokens_per_second": 6803.136 }, { "epoch": 1.153633952254642, "grad_norm": 0.792466643427966, "learning_rate": 3.867696540223681e-05, "loss": 0.708, "num_input_tokens_seen": 1229115520, "step": 1360, "train_runtime": 180638.8559, "train_tokens_per_second": 6804.27 }, { "epoch": 1.1578779840848807, "grad_norm": 0.9099009827745455, "learning_rate": 3.8573525313740435e-05, "loss": 0.7198, "num_input_tokens_seen": 1233652160, "step": 1365, "train_runtime": 181281.2594, "train_tokens_per_second": 6805.183 }, { "epoch": 1.1621220159151193, "grad_norm": 0.8873710969043223, "learning_rate": 3.846975465919079e-05, "loss": 0.7047, "num_input_tokens_seen": 1238186112, "step": 1370, "train_runtime": 181914.6873, "train_tokens_per_second": 6806.411 }, { "epoch": 1.1663660477453581, "grad_norm": 1.0769227041949128, "learning_rate": 3.836565596579103e-05, "loss": 0.7363, "num_input_tokens_seen": 1242761728, "step": 1375, "train_runtime": 182587.2454, "train_tokens_per_second": 6806.399 }, { "epoch": 1.1706100795755967, "grad_norm": 0.9446612826256684, "learning_rate": 3.826123176873324e-05, "loss": 0.7001, "num_input_tokens_seen": 1247182656, "step": 1380, "train_runtime": 183248.8336, "train_tokens_per_second": 6805.951 }, { "epoch": 1.1748541114058355, "grad_norm": 0.7930410617767145, "learning_rate": 3.8156484611136774e-05, "loss": 0.7121, "num_input_tokens_seen": 1251653056, "step": 1385, "train_runtime": 183867.2318, "train_tokens_per_second": 6807.374 }, { "epoch": 1.1790981432360743, "grad_norm": 1.2141311456918997, "learning_rate": 3.805141704398626e-05, "loss": 0.7085, "num_input_tokens_seen": 1256043584, "step": 1390, "train_runtime": 184507.3826, "train_tokens_per_second": 6807.552 }, { "epoch": 1.1833421750663131, "grad_norm": 0.8727742652201835, "learning_rate": 3.794603162606949e-05, "loss": 0.7021, "num_input_tokens_seen": 1260434688, "step": 1395, "train_runtime": 185139.6011, "train_tokens_per_second": 6808.023 }, { "epoch": 1.1875862068965517, "grad_norm": 0.7037237894630392, "learning_rate": 3.784033092391513e-05, "loss": 0.732, "num_input_tokens_seen": 1264932736, "step": 1400, "train_runtime": 185754.0941, "train_tokens_per_second": 6809.717 }, { "epoch": 1.1918302387267905, "grad_norm": 0.8398911891859556, "learning_rate": 3.773431751173018e-05, "loss": 0.7254, "num_input_tokens_seen": 1269425664, "step": 1405, "train_runtime": 186452.0318, "train_tokens_per_second": 6808.323 }, { "epoch": 1.196074270557029, "grad_norm": 1.046439278032978, "learning_rate": 3.76279939713373e-05, "loss": 0.7034, "num_input_tokens_seen": 1273725056, "step": 1410, "train_runtime": 187095.6204, "train_tokens_per_second": 6807.883 }, { "epoch": 1.2003183023872679, "grad_norm": 0.7473279729847268, "learning_rate": 3.7521362892111945e-05, "loss": 0.7002, "num_input_tokens_seen": 1278142592, "step": 1415, "train_runtime": 187703.911, "train_tokens_per_second": 6809.355 }, { "epoch": 1.2045623342175067, "grad_norm": 1.0374275407130875, "learning_rate": 3.741442687091926e-05, "loss": 0.7204, "num_input_tokens_seen": 1282692032, "step": 1420, "train_runtime": 188345.3351, "train_tokens_per_second": 6810.32 }, { "epoch": 1.2088063660477453, "grad_norm": 0.7440268721036309, "learning_rate": 3.730718851205089e-05, "loss": 0.7114, "num_input_tokens_seen": 1287034560, "step": 1425, "train_runtime": 188944.1141, "train_tokens_per_second": 6811.721 }, { "epoch": 1.213050397877984, "grad_norm": 0.776479873397123, "learning_rate": 3.719965042716154e-05, "loss": 0.7081, "num_input_tokens_seen": 1291460416, "step": 1430, "train_runtime": 189556.4755, "train_tokens_per_second": 6813.064 }, { "epoch": 1.2172944297082229, "grad_norm": 0.7404149511150007, "learning_rate": 3.709181523520532e-05, "loss": 0.7022, "num_input_tokens_seen": 1296144576, "step": 1435, "train_runtime": 190275.4375, "train_tokens_per_second": 6811.938 }, { "epoch": 1.2215384615384615, "grad_norm": 0.7950074266734066, "learning_rate": 3.698368556237206e-05, "loss": 0.7245, "num_input_tokens_seen": 1300612352, "step": 1440, "train_runtime": 190917.0602, "train_tokens_per_second": 6812.447 }, { "epoch": 1.2257824933687003, "grad_norm": 0.8330406088802553, "learning_rate": 3.687526404202326e-05, "loss": 0.6876, "num_input_tokens_seen": 1305227776, "step": 1445, "train_runtime": 191618.6772, "train_tokens_per_second": 6811.59 }, { "epoch": 1.230026525198939, "grad_norm": 0.7986919467015979, "learning_rate": 3.6766553314628016e-05, "loss": 0.6882, "num_input_tokens_seen": 1309815616, "step": 1450, "train_runtime": 192268.2878, "train_tokens_per_second": 6812.437 }, { "epoch": 1.2342705570291777, "grad_norm": 0.8194811456779011, "learning_rate": 3.66575560276987e-05, "loss": 0.6992, "num_input_tokens_seen": 1314293440, "step": 1455, "train_runtime": 192911.152, "train_tokens_per_second": 6812.947 }, { "epoch": 1.2385145888594165, "grad_norm": 0.8920784814801206, "learning_rate": 3.654827483572647e-05, "loss": 0.7034, "num_input_tokens_seen": 1318678784, "step": 1460, "train_runtime": 193534.093, "train_tokens_per_second": 6813.677 }, { "epoch": 1.2427586206896553, "grad_norm": 0.7947592511270455, "learning_rate": 3.6438712400116626e-05, "loss": 0.7277, "num_input_tokens_seen": 1323485248, "step": 1465, "train_runtime": 194275.8066, "train_tokens_per_second": 6812.404 }, { "epoch": 1.2470026525198938, "grad_norm": 0.9177270797271538, "learning_rate": 3.6328871389123817e-05, "loss": 0.7177, "num_input_tokens_seen": 1327989184, "step": 1470, "train_runtime": 194920.646, "train_tokens_per_second": 6812.973 }, { "epoch": 1.2512466843501326, "grad_norm": 0.9095775155270419, "learning_rate": 3.6218754477787034e-05, "loss": 0.69, "num_input_tokens_seen": 1332484288, "step": 1475, "train_runtime": 195594.2663, "train_tokens_per_second": 6812.492 }, { "epoch": 1.2512466843501326, "eval_loss": 0.7701402306556702, "eval_runtime": 1058.7996, "eval_samples_per_second": 2.878, "eval_steps_per_second": 0.091, "num_input_tokens_seen": 1332484288, "step": 1475 }, { "epoch": 1.2554907161803714, "grad_norm": 1.0025427441776165, "learning_rate": 3.610836434786448e-05, "loss": 0.7226, "num_input_tokens_seen": 1336834944, "step": 1480, "train_runtime": 197298.3845, "train_tokens_per_second": 6775.701 }, { "epoch": 1.25973474801061, "grad_norm": 1.1051405618486907, "learning_rate": 3.599770368776824e-05, "loss": 0.717, "num_input_tokens_seen": 1341307904, "step": 1485, "train_runtime": 197946.5126, "train_tokens_per_second": 6776.113 }, { "epoch": 1.2639787798408488, "grad_norm": 0.7277747101019799, "learning_rate": 3.588677519249883e-05, "loss": 0.7129, "num_input_tokens_seen": 1345945600, "step": 1490, "train_runtime": 198583.5461, "train_tokens_per_second": 6777.73 }, { "epoch": 1.2682228116710874, "grad_norm": 1.0644077159168257, "learning_rate": 3.577558156357954e-05, "loss": 0.6964, "num_input_tokens_seen": 1350721856, "step": 1495, "train_runtime": 199281.1546, "train_tokens_per_second": 6777.971 }, { "epoch": 1.2724668435013262, "grad_norm": 0.7336344093486956, "learning_rate": 3.566412550899067e-05, "loss": 0.7085, "num_input_tokens_seen": 1355297856, "step": 1500, "train_runtime": 199944.5213, "train_tokens_per_second": 6778.37 }, { "epoch": 1.276710875331565, "grad_norm": 0.8133225375902521, "learning_rate": 3.5552409743103556e-05, "loss": 0.6867, "num_input_tokens_seen": 1359679104, "step": 1505, "train_runtime": 200577.4067, "train_tokens_per_second": 6778.825 }, { "epoch": 1.2809549071618038, "grad_norm": 0.697351763291778, "learning_rate": 3.5440436986614475e-05, "loss": 0.6982, "num_input_tokens_seen": 1364232960, "step": 1510, "train_runtime": 201233.0814, "train_tokens_per_second": 6779.367 }, { "epoch": 1.2851989389920424, "grad_norm": 0.7278903758794186, "learning_rate": 3.53282099664784e-05, "loss": 0.6908, "num_input_tokens_seen": 1368631360, "step": 1515, "train_runtime": 201880.3686, "train_tokens_per_second": 6779.418 }, { "epoch": 1.2894429708222812, "grad_norm": 0.8347309141959418, "learning_rate": 3.521573141584254e-05, "loss": 0.6901, "num_input_tokens_seen": 1373082240, "step": 1520, "train_runtime": 202488.6455, "train_tokens_per_second": 6781.033 }, { "epoch": 1.2936870026525198, "grad_norm": 0.8283488230820615, "learning_rate": 3.5103004073979854e-05, "loss": 0.6807, "num_input_tokens_seen": 1377433792, "step": 1525, "train_runtime": 203113.9684, "train_tokens_per_second": 6781.581 }, { "epoch": 1.2979310344827586, "grad_norm": 0.7419000730982289, "learning_rate": 3.499003068622226e-05, "loss": 0.6995, "num_input_tokens_seen": 1382143808, "step": 1530, "train_runtime": 203848.007, "train_tokens_per_second": 6780.266 }, { "epoch": 1.3021750663129974, "grad_norm": 0.8544362640648647, "learning_rate": 3.487681400389384e-05, "loss": 0.6932, "num_input_tokens_seen": 1386581504, "step": 1535, "train_runtime": 204454.9055, "train_tokens_per_second": 6781.845 }, { "epoch": 1.3064190981432362, "grad_norm": 0.7409589690489501, "learning_rate": 3.4763356784243784e-05, "loss": 0.6943, "num_input_tokens_seen": 1391187968, "step": 1540, "train_runtime": 205112.5105, "train_tokens_per_second": 6782.56 }, { "epoch": 1.3106631299734748, "grad_norm": 0.8626742457912966, "learning_rate": 3.4649661790379285e-05, "loss": 0.6894, "num_input_tokens_seen": 1395756992, "step": 1545, "train_runtime": 205751.9562, "train_tokens_per_second": 6783.688 }, { "epoch": 1.3149071618037136, "grad_norm": 0.9751075736447785, "learning_rate": 3.453573179119821e-05, "loss": 0.6919, "num_input_tokens_seen": 1400210880, "step": 1550, "train_runtime": 206360.6402, "train_tokens_per_second": 6785.261 }, { "epoch": 1.3191511936339522, "grad_norm": 0.839491541858112, "learning_rate": 3.4421569561321705e-05, "loss": 0.7069, "num_input_tokens_seen": 1404544896, "step": 1555, "train_runtime": 206988.7608, "train_tokens_per_second": 6785.609 }, { "epoch": 1.323395225464191, "grad_norm": 0.7686049267619298, "learning_rate": 3.4307177881026574e-05, "loss": 0.7102, "num_input_tokens_seen": 1409135360, "step": 1560, "train_runtime": 207645.0724, "train_tokens_per_second": 6786.269 }, { "epoch": 1.3276392572944298, "grad_norm": 0.7897081067547944, "learning_rate": 3.419255953617762e-05, "loss": 0.7095, "num_input_tokens_seen": 1413494272, "step": 1565, "train_runtime": 208306.3313, "train_tokens_per_second": 6785.652 }, { "epoch": 1.3318832891246684, "grad_norm": 0.9461779222467549, "learning_rate": 3.407771731815975e-05, "loss": 0.685, "num_input_tokens_seen": 1417846272, "step": 1570, "train_runtime": 208904.186, "train_tokens_per_second": 6787.065 }, { "epoch": 1.3361273209549072, "grad_norm": 0.8934996824542671, "learning_rate": 3.3962654023810056e-05, "loss": 0.6814, "num_input_tokens_seen": 1422340672, "step": 1575, "train_runtime": 209567.9077, "train_tokens_per_second": 6787.016 }, { "epoch": 1.340371352785146, "grad_norm": 0.8661495630419888, "learning_rate": 3.384737245534962e-05, "loss": 0.7181, "num_input_tokens_seen": 1426955904, "step": 1580, "train_runtime": 210237.9687, "train_tokens_per_second": 6787.337 }, { "epoch": 1.3446153846153845, "grad_norm": 0.8719974075076845, "learning_rate": 3.373187542031534e-05, "loss": 0.6959, "num_input_tokens_seen": 1431489088, "step": 1585, "train_runtime": 210873.0161, "train_tokens_per_second": 6788.394 }, { "epoch": 1.3488594164456233, "grad_norm": 1.1626368748489013, "learning_rate": 3.361616573149153e-05, "loss": 0.6832, "num_input_tokens_seen": 1435997504, "step": 1590, "train_runtime": 211532.587, "train_tokens_per_second": 6788.54 }, { "epoch": 1.3531034482758622, "grad_norm": 0.8538479857974732, "learning_rate": 3.350024620684142e-05, "loss": 0.7099, "num_input_tokens_seen": 1440263232, "step": 1595, "train_runtime": 212121.5231, "train_tokens_per_second": 6789.802 }, { "epoch": 1.3573474801061007, "grad_norm": 0.8056069140705914, "learning_rate": 3.338411966943852e-05, "loss": 0.69, "num_input_tokens_seen": 1444712192, "step": 1600, "train_runtime": 212749.667, "train_tokens_per_second": 6790.667 }, { "epoch": 1.3615915119363395, "grad_norm": 0.7748917218818391, "learning_rate": 3.326778894739787e-05, "loss": 0.7012, "num_input_tokens_seen": 1449143872, "step": 1605, "train_runtime": 213375.0723, "train_tokens_per_second": 6791.533 }, { "epoch": 1.3658355437665781, "grad_norm": 0.7420765913634999, "learning_rate": 3.3151256873807166e-05, "loss": 0.7044, "num_input_tokens_seen": 1453720384, "step": 1610, "train_runtime": 214016.224, "train_tokens_per_second": 6792.571 }, { "epoch": 1.370079575596817, "grad_norm": 0.8236976518371382, "learning_rate": 3.3034526286657784e-05, "loss": 0.6767, "num_input_tokens_seen": 1458310144, "step": 1615, "train_runtime": 214691.52, "train_tokens_per_second": 6792.584 }, { "epoch": 1.3743236074270557, "grad_norm": 1.234048869991443, "learning_rate": 3.291760002877563e-05, "loss": 0.694, "num_input_tokens_seen": 1462804672, "step": 1620, "train_runtime": 215323.1611, "train_tokens_per_second": 6793.531 }, { "epoch": 1.3785676392572945, "grad_norm": 1.1190238309471685, "learning_rate": 3.280048094775194e-05, "loss": 0.6912, "num_input_tokens_seen": 1467354688, "step": 1625, "train_runtime": 215957.7438, "train_tokens_per_second": 6794.638 }, { "epoch": 1.3828116710875331, "grad_norm": 0.7424932571208389, "learning_rate": 3.268317189587389e-05, "loss": 0.6772, "num_input_tokens_seen": 1471765312, "step": 1630, "train_runtime": 216588.9755, "train_tokens_per_second": 6795.2 }, { "epoch": 1.387055702917772, "grad_norm": 0.8756485744320969, "learning_rate": 3.256567573005519e-05, "loss": 0.7056, "num_input_tokens_seen": 1476461312, "step": 1635, "train_runtime": 217239.6368, "train_tokens_per_second": 6796.464 }, { "epoch": 1.3912997347480105, "grad_norm": 0.8539432184293334, "learning_rate": 3.2447995311766426e-05, "loss": 0.6921, "num_input_tokens_seen": 1481070080, "step": 1640, "train_runtime": 217902.9575, "train_tokens_per_second": 6796.925 }, { "epoch": 1.3955437665782493, "grad_norm": 0.9984531979139321, "learning_rate": 3.233013350696547e-05, "loss": 0.6788, "num_input_tokens_seen": 1485426304, "step": 1645, "train_runtime": 218524.3978, "train_tokens_per_second": 6797.531 }, { "epoch": 1.399787798408488, "grad_norm": 0.8306599513500448, "learning_rate": 3.22120931860276e-05, "loss": 0.7068, "num_input_tokens_seen": 1489976064, "step": 1650, "train_runtime": 219164.0872, "train_tokens_per_second": 6798.45 }, { "epoch": 1.404031830238727, "grad_norm": 0.8165110526157475, "learning_rate": 3.2093877223675657e-05, "loss": 0.7055, "num_input_tokens_seen": 1494425408, "step": 1655, "train_runtime": 219817.6974, "train_tokens_per_second": 6798.476 }, { "epoch": 1.4082758620689655, "grad_norm": 0.9181922831479999, "learning_rate": 3.197548849890997e-05, "loss": 0.6919, "num_input_tokens_seen": 1499022912, "step": 1660, "train_runtime": 220540.8581, "train_tokens_per_second": 6797.03 }, { "epoch": 1.4125198938992043, "grad_norm": 0.8451299000501948, "learning_rate": 3.1856929894938294e-05, "loss": 0.6851, "num_input_tokens_seen": 1503165184, "step": 1665, "train_runtime": 221107.0695, "train_tokens_per_second": 6798.359 }, { "epoch": 1.4167639257294429, "grad_norm": 0.9309089115614846, "learning_rate": 3.17382042991056e-05, "loss": 0.6779, "num_input_tokens_seen": 1507912704, "step": 1670, "train_runtime": 221826.3413, "train_tokens_per_second": 6797.717 }, { "epoch": 1.4210079575596817, "grad_norm": 0.897730430026796, "learning_rate": 3.16193146028237e-05, "loss": 0.6916, "num_input_tokens_seen": 1512406912, "step": 1675, "train_runtime": 222474.5449, "train_tokens_per_second": 6798.112 }, { "epoch": 1.4252519893899205, "grad_norm": 1.0914616744302021, "learning_rate": 3.1500263701500896e-05, "loss": 0.7087, "num_input_tokens_seen": 1516995328, "step": 1680, "train_runtime": 223170.4553, "train_tokens_per_second": 6797.474 }, { "epoch": 1.4294960212201593, "grad_norm": 0.7658163870719906, "learning_rate": 3.1381054494471405e-05, "loss": 0.703, "num_input_tokens_seen": 1521406976, "step": 1685, "train_runtime": 223818.5244, "train_tokens_per_second": 6797.502 }, { "epoch": 1.4337400530503979, "grad_norm": 0.7295564226365354, "learning_rate": 3.12616898849248e-05, "loss": 0.7035, "num_input_tokens_seen": 1526055168, "step": 1690, "train_runtime": 224524.94, "train_tokens_per_second": 6796.818 }, { "epoch": 1.4379840848806367, "grad_norm": 0.8439956680094854, "learning_rate": 3.1142172779835274e-05, "loss": 0.6746, "num_input_tokens_seen": 1530635200, "step": 1695, "train_runtime": 225153.441, "train_tokens_per_second": 6798.187 }, { "epoch": 1.4422281167108753, "grad_norm": 0.8397468802586634, "learning_rate": 3.1022506089890876e-05, "loss": 0.7068, "num_input_tokens_seen": 1535012288, "step": 1700, "train_runtime": 225781.6539, "train_tokens_per_second": 6798.658 }, { "epoch": 1.446472148541114, "grad_norm": 0.7474048463993876, "learning_rate": 3.0902692729422575e-05, "loss": 0.6865, "num_input_tokens_seen": 1539284736, "step": 1705, "train_runtime": 226344.0602, "train_tokens_per_second": 6800.641 }, { "epoch": 1.4507161803713529, "grad_norm": 0.9736770454301451, "learning_rate": 3.078273561633335e-05, "loss": 0.6763, "num_input_tokens_seen": 1543699904, "step": 1710, "train_runtime": 226957.3024, "train_tokens_per_second": 6801.719 }, { "epoch": 1.4549602122015914, "grad_norm": 1.1147200850975938, "learning_rate": 3.066263767202706e-05, "loss": 0.6914, "num_input_tokens_seen": 1548275328, "step": 1715, "train_runtime": 227614.0923, "train_tokens_per_second": 6802.195 }, { "epoch": 1.4592042440318302, "grad_norm": 0.9261552445682865, "learning_rate": 3.0542401821337346e-05, "loss": 0.6895, "num_input_tokens_seen": 1552716864, "step": 1720, "train_runtime": 228231.3018, "train_tokens_per_second": 6803.26 }, { "epoch": 1.463448275862069, "grad_norm": 0.7494164761692941, "learning_rate": 3.042203099245639e-05, "loss": 0.6871, "num_input_tokens_seen": 1557269760, "step": 1725, "train_runtime": 228914.9513, "train_tokens_per_second": 6802.831 }, { "epoch": 1.4676923076923076, "grad_norm": 0.7383133192878851, "learning_rate": 3.0301528116863592e-05, "loss": 0.6914, "num_input_tokens_seen": 1561556608, "step": 1730, "train_runtime": 229541.9414, "train_tokens_per_second": 6802.925 }, { "epoch": 1.4719363395225464, "grad_norm": 0.877542891400688, "learning_rate": 3.0180896129254182e-05, "loss": 0.6962, "num_input_tokens_seen": 1565974592, "step": 1735, "train_runtime": 230156.1279, "train_tokens_per_second": 6803.967 }, { "epoch": 1.4761803713527852, "grad_norm": 0.7394328578918072, "learning_rate": 3.006013796746774e-05, "loss": 0.6763, "num_input_tokens_seen": 1570370368, "step": 1740, "train_runtime": 230776.2675, "train_tokens_per_second": 6804.731 }, { "epoch": 1.4804244031830238, "grad_norm": 0.8032649294789167, "learning_rate": 2.993925657241668e-05, "loss": 0.6904, "num_input_tokens_seen": 1574874432, "step": 1745, "train_runtime": 231438.1989, "train_tokens_per_second": 6804.73 }, { "epoch": 1.4846684350132626, "grad_norm": 0.9191103442108757, "learning_rate": 2.9818254888014586e-05, "loss": 0.6809, "num_input_tokens_seen": 1579401664, "step": 1750, "train_runtime": 232077.6612, "train_tokens_per_second": 6805.488 }, { "epoch": 1.4889124668435012, "grad_norm": 0.86131262301876, "learning_rate": 2.9697135861104546e-05, "loss": 0.6976, "num_input_tokens_seen": 1584000064, "step": 1755, "train_runtime": 232725.1511, "train_tokens_per_second": 6806.312 }, { "epoch": 1.49315649867374, "grad_norm": 0.7493354049181269, "learning_rate": 2.9575902441387393e-05, "loss": 0.693, "num_input_tokens_seen": 1588529152, "step": 1760, "train_runtime": 233378.3571, "train_tokens_per_second": 6806.669 }, { "epoch": 1.4974005305039788, "grad_norm": 0.790057237962092, "learning_rate": 2.9454557581349818e-05, "loss": 0.6793, "num_input_tokens_seen": 1593390656, "step": 1765, "train_runtime": 234055.6771, "train_tokens_per_second": 6807.742 }, { "epoch": 1.5016445623342176, "grad_norm": 0.8848325086982859, "learning_rate": 2.933310423619252e-05, "loss": 0.6963, "num_input_tokens_seen": 1597966720, "step": 1770, "train_runtime": 234761.2586, "train_tokens_per_second": 6806.774 }, { "epoch": 1.5016445623342176, "eval_loss": 0.7488037943840027, "eval_runtime": 1056.893, "eval_samples_per_second": 2.883, "eval_steps_per_second": 0.091, "num_input_tokens_seen": 1597966720, "step": 1770 }, { "epoch": 1.5058885941644562, "grad_norm": 0.8733912053492838, "learning_rate": 2.9211545363758214e-05, "loss": 0.6861, "num_input_tokens_seen": 1602346944, "step": 1775, "train_runtime": 236413.8812, "train_tokens_per_second": 6777.719 }, { "epoch": 1.510132625994695, "grad_norm": 0.9213477294170037, "learning_rate": 2.9089883924459603e-05, "loss": 0.6802, "num_input_tokens_seen": 1606861888, "step": 1780, "train_runtime": 237090.594, "train_tokens_per_second": 6777.417 }, { "epoch": 1.5143766578249336, "grad_norm": 0.8533906385009806, "learning_rate": 2.8968122881207272e-05, "loss": 0.6926, "num_input_tokens_seen": 1611490176, "step": 1785, "train_runtime": 237783.8661, "train_tokens_per_second": 6777.122 }, { "epoch": 1.5186206896551724, "grad_norm": 0.700075097629672, "learning_rate": 2.884626519933753e-05, "loss": 0.6809, "num_input_tokens_seen": 1616104256, "step": 1790, "train_runtime": 238446.2141, "train_tokens_per_second": 6777.647 }, { "epoch": 1.5228647214854112, "grad_norm": 0.7529188790563152, "learning_rate": 2.872431384654021e-05, "loss": 0.6744, "num_input_tokens_seen": 1620585216, "step": 1795, "train_runtime": 239099.2559, "train_tokens_per_second": 6777.876 }, { "epoch": 1.52710875331565, "grad_norm": 0.9180471978156958, "learning_rate": 2.8602271792786355e-05, "loss": 0.6979, "num_input_tokens_seen": 1625263744, "step": 1800, "train_runtime": 239770.0872, "train_tokens_per_second": 6778.426 }, { "epoch": 1.5313527851458886, "grad_norm": 0.936376426158768, "learning_rate": 2.8480142010255956e-05, "loss": 0.6701, "num_input_tokens_seen": 1629558400, "step": 1805, "train_runtime": 240351.8577, "train_tokens_per_second": 6779.887 }, { "epoch": 1.5355968169761272, "grad_norm": 0.6813164585163979, "learning_rate": 2.835792747326549e-05, "loss": 0.6846, "num_input_tokens_seen": 1633885760, "step": 1810, "train_runtime": 240960.0435, "train_tokens_per_second": 6780.733 }, { "epoch": 1.539840848806366, "grad_norm": 0.9012352213500389, "learning_rate": 2.8235631158195542e-05, "loss": 0.6752, "num_input_tokens_seen": 1638485184, "step": 1815, "train_runtime": 241606.7877, "train_tokens_per_second": 6781.619 }, { "epoch": 1.5440848806366048, "grad_norm": 1.0501209838121786, "learning_rate": 2.8113256043418296e-05, "loss": 0.6786, "num_input_tokens_seen": 1643127424, "step": 1820, "train_runtime": 242328.0234, "train_tokens_per_second": 6780.592 }, { "epoch": 1.5483289124668436, "grad_norm": 0.7301282294890542, "learning_rate": 2.7990805109224994e-05, "loss": 0.7052, "num_input_tokens_seen": 1647584256, "step": 1825, "train_runtime": 242957.2381, "train_tokens_per_second": 6781.375 }, { "epoch": 1.5525729442970824, "grad_norm": 0.7180423134088882, "learning_rate": 2.786828133775337e-05, "loss": 0.6862, "num_input_tokens_seen": 1651979520, "step": 1830, "train_runtime": 243571.1176, "train_tokens_per_second": 6782.329 }, { "epoch": 1.556816976127321, "grad_norm": 1.0492539609549594, "learning_rate": 2.774568771291503e-05, "loss": 0.6832, "num_input_tokens_seen": 1656516672, "step": 1835, "train_runtime": 244227.2168, "train_tokens_per_second": 6782.687 }, { "epoch": 1.5610610079575595, "grad_norm": 1.1159044633198913, "learning_rate": 2.7623027220322757e-05, "loss": 0.6695, "num_input_tokens_seen": 1661151360, "step": 1840, "train_runtime": 244907.8876, "train_tokens_per_second": 6782.76 }, { "epoch": 1.5653050397877983, "grad_norm": 0.7897737642307381, "learning_rate": 2.75003028472178e-05, "loss": 0.6781, "num_input_tokens_seen": 1665702272, "step": 1845, "train_runtime": 245544.1767, "train_tokens_per_second": 6783.717 }, { "epoch": 1.5695490716180371, "grad_norm": 0.8021337355935967, "learning_rate": 2.737751758239717e-05, "loss": 0.6872, "num_input_tokens_seen": 1670142848, "step": 1850, "train_runtime": 246145.0482, "train_tokens_per_second": 6785.198 }, { "epoch": 1.573793103448276, "grad_norm": 0.9146806729788793, "learning_rate": 2.7254674416140796e-05, "loss": 0.6674, "num_input_tokens_seen": 1674686336, "step": 1855, "train_runtime": 246790.7905, "train_tokens_per_second": 6785.854 }, { "epoch": 1.5780371352785147, "grad_norm": 0.7370948567058712, "learning_rate": 2.7131776340138732e-05, "loss": 0.6835, "num_input_tokens_seen": 1679386880, "step": 1860, "train_runtime": 247484.4332, "train_tokens_per_second": 6785.828 }, { "epoch": 1.5822811671087533, "grad_norm": 0.8275204157097062, "learning_rate": 2.700882634741828e-05, "loss": 0.6633, "num_input_tokens_seen": 1683943488, "step": 1865, "train_runtime": 248132.5186, "train_tokens_per_second": 6786.468 }, { "epoch": 1.586525198938992, "grad_norm": 0.9040723037619556, "learning_rate": 2.688582743227112e-05, "loss": 0.6687, "num_input_tokens_seen": 1688602624, "step": 1870, "train_runtime": 248805.7127, "train_tokens_per_second": 6786.832 }, { "epoch": 1.5907692307692307, "grad_norm": 0.760985868201272, "learning_rate": 2.676278259018037e-05, "loss": 0.6978, "num_input_tokens_seen": 1693144960, "step": 1875, "train_runtime": 249443.0497, "train_tokens_per_second": 6787.701 }, { "epoch": 1.5950132625994695, "grad_norm": 0.6622361089518702, "learning_rate": 2.663969481774764e-05, "loss": 0.6809, "num_input_tokens_seen": 1697705216, "step": 1880, "train_runtime": 250088.7537, "train_tokens_per_second": 6788.411 }, { "epoch": 1.5992572944297083, "grad_norm": 0.9742697876211484, "learning_rate": 2.6516567112620057e-05, "loss": 0.6955, "num_input_tokens_seen": 1702328000, "step": 1885, "train_runtime": 250811.4006, "train_tokens_per_second": 6787.283 }, { "epoch": 1.603501326259947, "grad_norm": 0.8432142437312786, "learning_rate": 2.6393402473417257e-05, "loss": 0.6891, "num_input_tokens_seen": 1706848704, "step": 1890, "train_runtime": 251465.4682, "train_tokens_per_second": 6787.607 }, { "epoch": 1.6077453580901857, "grad_norm": 1.1461032434751868, "learning_rate": 2.627020389965835e-05, "loss": 0.6813, "num_input_tokens_seen": 1711334336, "step": 1895, "train_runtime": 252107.1822, "train_tokens_per_second": 6788.122 }, { "epoch": 1.6119893899204243, "grad_norm": 0.7667390099087915, "learning_rate": 2.61469743916889e-05, "loss": 0.6809, "num_input_tokens_seen": 1715695488, "step": 1900, "train_runtime": 252733.9504, "train_tokens_per_second": 6788.544 }, { "epoch": 1.616233421750663, "grad_norm": 1.319936113059114, "learning_rate": 2.6023716950607814e-05, "loss": 0.6773, "num_input_tokens_seen": 1720293184, "step": 1905, "train_runtime": 253411.1635, "train_tokens_per_second": 6788.545 }, { "epoch": 1.620477453580902, "grad_norm": 0.8696242756135805, "learning_rate": 2.590043457819428e-05, "loss": 0.6858, "num_input_tokens_seen": 1724818304, "step": 1910, "train_runtime": 254091.2449, "train_tokens_per_second": 6788.185 }, { "epoch": 1.6247214854111407, "grad_norm": 0.6984051732842884, "learning_rate": 2.5777130276834677e-05, "loss": 0.6558, "num_input_tokens_seen": 1729429824, "step": 1915, "train_runtime": 254736.195, "train_tokens_per_second": 6789.101 }, { "epoch": 1.6289655172413793, "grad_norm": 0.7277968832920597, "learning_rate": 2.56538070494494e-05, "loss": 0.6816, "num_input_tokens_seen": 1733884032, "step": 1920, "train_runtime": 255336.8051, "train_tokens_per_second": 6790.576 }, { "epoch": 1.633209549071618, "grad_norm": 0.9589871027299995, "learning_rate": 2.5530467899419792e-05, "loss": 0.6529, "num_input_tokens_seen": 1738507328, "step": 1925, "train_runtime": 256022.2293, "train_tokens_per_second": 6790.455 }, { "epoch": 1.6374535809018567, "grad_norm": 0.8788636504147925, "learning_rate": 2.5407115830514955e-05, "loss": 0.6613, "num_input_tokens_seen": 1743139584, "step": 1930, "train_runtime": 256649.7946, "train_tokens_per_second": 6791.899 }, { "epoch": 1.6416976127320955, "grad_norm": 0.751730530930981, "learning_rate": 2.5283753846818626e-05, "loss": 0.6688, "num_input_tokens_seen": 1747799104, "step": 1935, "train_runtime": 257356.5973, "train_tokens_per_second": 6791.351 }, { "epoch": 1.6459416445623343, "grad_norm": 1.0663719817746726, "learning_rate": 2.516038495265599e-05, "loss": 0.6806, "num_input_tokens_seen": 1752473536, "step": 1940, "train_runtime": 258018.0708, "train_tokens_per_second": 6792.057 }, { "epoch": 1.650185676392573, "grad_norm": 0.8682191978665272, "learning_rate": 2.503701215252056e-05, "loss": 0.6834, "num_input_tokens_seen": 1757236416, "step": 1945, "train_runtime": 258721.276, "train_tokens_per_second": 6792.006 }, { "epoch": 1.6544297082228117, "grad_norm": 0.7561744190987955, "learning_rate": 2.4913638451000926e-05, "loss": 0.6723, "num_input_tokens_seen": 1761509184, "step": 1950, "train_runtime": 259302.524, "train_tokens_per_second": 6793.259 }, { "epoch": 1.6586737400530502, "grad_norm": 0.9925925496141875, "learning_rate": 2.479026685270767e-05, "loss": 0.652, "num_input_tokens_seen": 1766119104, "step": 1955, "train_runtime": 259978.5543, "train_tokens_per_second": 6793.326 }, { "epoch": 1.662917771883289, "grad_norm": 0.6823275549351902, "learning_rate": 2.4666900362200124e-05, "loss": 0.6702, "num_input_tokens_seen": 1770634688, "step": 1960, "train_runtime": 260619.4472, "train_tokens_per_second": 6793.947 }, { "epoch": 1.6671618037135278, "grad_norm": 0.8309834963725057, "learning_rate": 2.4543541983913257e-05, "loss": 0.6498, "num_input_tokens_seen": 1775127616, "step": 1965, "train_runtime": 261263.6563, "train_tokens_per_second": 6794.392 }, { "epoch": 1.6714058355437666, "grad_norm": 0.916270148214916, "learning_rate": 2.4420194722084438e-05, "loss": 0.6637, "num_input_tokens_seen": 1779681280, "step": 1970, "train_runtime": 261910.4255, "train_tokens_per_second": 6795.0 }, { "epoch": 1.6756498673740055, "grad_norm": 0.8052511706103775, "learning_rate": 2.4296861580680348e-05, "loss": 0.6941, "num_input_tokens_seen": 1784311040, "step": 1975, "train_runtime": 262520.958, "train_tokens_per_second": 6796.833 }, { "epoch": 1.679893899204244, "grad_norm": 0.9680550812544997, "learning_rate": 2.4173545563323745e-05, "loss": 0.6812, "num_input_tokens_seen": 1788858240, "step": 1980, "train_runtime": 263164.4906, "train_tokens_per_second": 6797.491 }, { "epoch": 1.6841379310344826, "grad_norm": 0.854153010692225, "learning_rate": 2.4050249673220394e-05, "loss": 0.6798, "num_input_tokens_seen": 1793492672, "step": 1985, "train_runtime": 263823.0171, "train_tokens_per_second": 6798.09 }, { "epoch": 1.6883819628647214, "grad_norm": 0.9065907609733993, "learning_rate": 2.3926976913085848e-05, "loss": 0.6844, "num_input_tokens_seen": 1798141312, "step": 1990, "train_runtime": 264461.4917, "train_tokens_per_second": 6799.256 }, { "epoch": 1.6926259946949602, "grad_norm": 0.964561250811086, "learning_rate": 2.3803730285072366e-05, "loss": 0.6795, "num_input_tokens_seen": 1802590528, "step": 1995, "train_runtime": 265120.6177, "train_tokens_per_second": 6799.134 }, { "epoch": 1.696870026525199, "grad_norm": 0.9694821325807637, "learning_rate": 2.3680512790695818e-05, "loss": 0.6863, "num_input_tokens_seen": 1806991488, "step": 2000, "train_runtime": 265757.2174, "train_tokens_per_second": 6799.407 }, { "epoch": 1.7011140583554378, "grad_norm": 0.8677648954151779, "learning_rate": 2.3557327430762528e-05, "loss": 0.6698, "num_input_tokens_seen": 1811461056, "step": 2005, "train_runtime": 266396.2479, "train_tokens_per_second": 6799.875 }, { "epoch": 1.7053580901856764, "grad_norm": 0.7613369253339761, "learning_rate": 2.3434177205296257e-05, "loss": 0.6613, "num_input_tokens_seen": 1816142272, "step": 2010, "train_runtime": 267054.6771, "train_tokens_per_second": 6800.638 }, { "epoch": 1.709602122015915, "grad_norm": 1.0622307149303414, "learning_rate": 2.3311065113465083e-05, "loss": 0.6602, "num_input_tokens_seen": 1820555008, "step": 2015, "train_runtime": 267688.1492, "train_tokens_per_second": 6801.03 }, { "epoch": 1.7138461538461538, "grad_norm": 0.8964492847174279, "learning_rate": 2.3187994153508397e-05, "loss": 0.658, "num_input_tokens_seen": 1825223808, "step": 2020, "train_runtime": 268333.947, "train_tokens_per_second": 6802.061 }, { "epoch": 1.7180901856763926, "grad_norm": 1.177224036456473, "learning_rate": 2.3064967322663893e-05, "loss": 0.6932, "num_input_tokens_seen": 1829789568, "step": 2025, "train_runtime": 268969.5993, "train_tokens_per_second": 6802.961 }, { "epoch": 1.7223342175066314, "grad_norm": 1.0226100260638311, "learning_rate": 2.2941987617094527e-05, "loss": 0.6721, "num_input_tokens_seen": 1834277632, "step": 2030, "train_runtime": 269616.3957, "train_tokens_per_second": 6803.287 }, { "epoch": 1.72657824933687, "grad_norm": 0.776158652036907, "learning_rate": 2.2819058031815606e-05, "loss": 0.685, "num_input_tokens_seen": 1838997504, "step": 2035, "train_runtime": 270312.8678, "train_tokens_per_second": 6803.219 }, { "epoch": 1.7308222811671088, "grad_norm": 0.8255759535290057, "learning_rate": 2.26961815606218e-05, "loss": 0.6708, "num_input_tokens_seen": 1843497088, "step": 2040, "train_runtime": 270987.9158, "train_tokens_per_second": 6802.876 }, { "epoch": 1.7350663129973474, "grad_norm": 0.9846063905318818, "learning_rate": 2.2573361196014245e-05, "loss": 0.68, "num_input_tokens_seen": 1848067968, "step": 2045, "train_runtime": 271662.3336, "train_tokens_per_second": 6802.813 }, { "epoch": 1.7393103448275862, "grad_norm": 0.8104965013143679, "learning_rate": 2.2450599929127715e-05, "loss": 0.6681, "num_input_tokens_seen": 1852536512, "step": 2050, "train_runtime": 272308.8148, "train_tokens_per_second": 6803.072 }, { "epoch": 1.743554376657825, "grad_norm": 0.7512768919734117, "learning_rate": 2.2327900749657677e-05, "loss": 0.6608, "num_input_tokens_seen": 1856969408, "step": 2055, "train_runtime": 272924.5407, "train_tokens_per_second": 6803.966 }, { "epoch": 1.7477984084880638, "grad_norm": 0.938563336869354, "learning_rate": 2.2205266645787588e-05, "loss": 0.6436, "num_input_tokens_seen": 1861364032, "step": 2060, "train_runtime": 273525.8351, "train_tokens_per_second": 6805.076 }, { "epoch": 1.7520424403183024, "grad_norm": 0.9168142161151926, "learning_rate": 2.2082700604116046e-05, "loss": 0.6734, "num_input_tokens_seen": 1866079936, "step": 2065, "train_runtime": 274238.6936, "train_tokens_per_second": 6804.583 }, { "epoch": 1.7520424403183024, "eval_loss": 0.729947566986084, "eval_runtime": 1057.9673, "eval_samples_per_second": 2.88, "eval_steps_per_second": 0.091, "num_input_tokens_seen": 1866079936, "step": 2065 }, { "epoch": 1.7562864721485412, "grad_norm": 0.7781219629120125, "learning_rate": 2.1960205609584066e-05, "loss": 0.6555, "num_input_tokens_seen": 1870563904, "step": 2070, "train_runtime": 275928.5055, "train_tokens_per_second": 6779.162 }, { "epoch": 1.7605305039787797, "grad_norm": 0.8980636445872143, "learning_rate": 2.183778464540244e-05, "loss": 0.6756, "num_input_tokens_seen": 1874859840, "step": 2075, "train_runtime": 276546.0381, "train_tokens_per_second": 6779.558 }, { "epoch": 1.7647745358090186, "grad_norm": 0.8923077727556372, "learning_rate": 2.1715440692978994e-05, "loss": 0.6779, "num_input_tokens_seen": 1879558656, "step": 2080, "train_runtime": 277240.8007, "train_tokens_per_second": 6779.517 }, { "epoch": 1.7690185676392574, "grad_norm": 1.1974687941835145, "learning_rate": 2.159317673184608e-05, "loss": 0.6671, "num_input_tokens_seen": 1883979904, "step": 2085, "train_runtime": 277861.5498, "train_tokens_per_second": 6780.283 }, { "epoch": 1.7732625994694962, "grad_norm": 0.7953214594587166, "learning_rate": 2.1470995739587944e-05, "loss": 0.6731, "num_input_tokens_seen": 1888448384, "step": 2090, "train_runtime": 278500.3438, "train_tokens_per_second": 6780.776 }, { "epoch": 1.7775066312997347, "grad_norm": 0.7353368817381706, "learning_rate": 2.13489006917682e-05, "loss": 0.6567, "num_input_tokens_seen": 1892962880, "step": 2095, "train_runtime": 279180.0365, "train_tokens_per_second": 6780.438 }, { "epoch": 1.7817506631299733, "grad_norm": 0.7817976445897892, "learning_rate": 2.1226894561857447e-05, "loss": 0.6645, "num_input_tokens_seen": 1897595968, "step": 2100, "train_runtime": 279829.9966, "train_tokens_per_second": 6781.246 }, { "epoch": 1.7859946949602121, "grad_norm": 0.8309895762650132, "learning_rate": 2.1104980321160752e-05, "loss": 0.6734, "num_input_tokens_seen": 1902109888, "step": 2105, "train_runtime": 280489.1904, "train_tokens_per_second": 6781.402 }, { "epoch": 1.790238726790451, "grad_norm": 0.6339748804576945, "learning_rate": 2.0983160938745382e-05, "loss": 0.6526, "num_input_tokens_seen": 1906705216, "step": 2110, "train_runtime": 281135.8565, "train_tokens_per_second": 6782.149 }, { "epoch": 1.7944827586206897, "grad_norm": 0.8552208011108713, "learning_rate": 2.086143938136841e-05, "loss": 0.6563, "num_input_tokens_seen": 1911218304, "step": 2115, "train_runtime": 281788.5646, "train_tokens_per_second": 6782.455 }, { "epoch": 1.7987267904509285, "grad_norm": 0.9732618024212317, "learning_rate": 2.0739818613404513e-05, "loss": 0.6619, "num_input_tokens_seen": 1915723008, "step": 2120, "train_runtime": 282459.2265, "train_tokens_per_second": 6782.299 }, { "epoch": 1.8029708222811671, "grad_norm": 1.0431409571591543, "learning_rate": 2.06183015967738e-05, "loss": 0.6451, "num_input_tokens_seen": 1920464320, "step": 2125, "train_runtime": 283127.6969, "train_tokens_per_second": 6783.032 }, { "epoch": 1.8072148541114057, "grad_norm": 0.874966660194592, "learning_rate": 2.0496891290869595e-05, "loss": 0.6679, "num_input_tokens_seen": 1924942528, "step": 2130, "train_runtime": 283780.283, "train_tokens_per_second": 6783.214 }, { "epoch": 1.8114588859416445, "grad_norm": 0.7749558949940442, "learning_rate": 2.0375590652486482e-05, "loss": 0.6803, "num_input_tokens_seen": 1929745408, "step": 2135, "train_runtime": 284477.4061, "train_tokens_per_second": 6783.475 }, { "epoch": 1.8157029177718833, "grad_norm": 0.7228243017653365, "learning_rate": 2.025440263574817e-05, "loss": 0.6338, "num_input_tokens_seen": 1934284800, "step": 2140, "train_runtime": 285111.2012, "train_tokens_per_second": 6784.317 }, { "epoch": 1.819946949602122, "grad_norm": 0.7810955976714484, "learning_rate": 2.013333019203563e-05, "loss": 0.6532, "num_input_tokens_seen": 1938877184, "step": 2145, "train_runtime": 285771.2844, "train_tokens_per_second": 6784.717 }, { "epoch": 1.8241909814323607, "grad_norm": 0.8174083310669077, "learning_rate": 2.001237626991523e-05, "loss": 0.6511, "num_input_tokens_seen": 1943391872, "step": 2150, "train_runtime": 286412.497, "train_tokens_per_second": 6785.29 }, { "epoch": 1.8284350132625995, "grad_norm": 0.795144872361272, "learning_rate": 1.989154381506684e-05, "loss": 0.6598, "num_input_tokens_seen": 1947919808, "step": 2155, "train_runtime": 287052.841, "train_tokens_per_second": 6785.928 }, { "epoch": 1.832679045092838, "grad_norm": 0.7485422633125285, "learning_rate": 1.9770835770212198e-05, "loss": 0.6566, "num_input_tokens_seen": 1952470976, "step": 2160, "train_runtime": 287718.7456, "train_tokens_per_second": 6786.04 }, { "epoch": 1.8369230769230769, "grad_norm": 1.1308794437672134, "learning_rate": 1.9650255075043163e-05, "loss": 0.6559, "num_input_tokens_seen": 1957140480, "step": 2165, "train_runtime": 288394.0615, "train_tokens_per_second": 6786.341 }, { "epoch": 1.8411671087533157, "grad_norm": 0.8804300984321736, "learning_rate": 1.9529804666150157e-05, "loss": 0.6628, "num_input_tokens_seen": 1961650176, "step": 2170, "train_runtime": 289025.4848, "train_tokens_per_second": 6787.118 }, { "epoch": 1.8454111405835545, "grad_norm": 0.7255223951059633, "learning_rate": 1.940948747695066e-05, "loss": 0.6394, "num_input_tokens_seen": 1966166336, "step": 2175, "train_runtime": 289641.3116, "train_tokens_per_second": 6788.28 }, { "epoch": 1.849655172413793, "grad_norm": 0.7679331866159973, "learning_rate": 1.9289306437617734e-05, "loss": 0.6643, "num_input_tokens_seen": 1970829888, "step": 2180, "train_runtime": 290308.436, "train_tokens_per_second": 6788.745 }, { "epoch": 1.8538992042440319, "grad_norm": 1.142785592894065, "learning_rate": 1.916926447500871e-05, "loss": 0.6499, "num_input_tokens_seen": 1975190528, "step": 2185, "train_runtime": 290918.3543, "train_tokens_per_second": 6789.501 }, { "epoch": 1.8581432360742705, "grad_norm": 0.9065635652001467, "learning_rate": 1.904936451259384e-05, "loss": 0.6607, "num_input_tokens_seen": 1979864704, "step": 2190, "train_runtime": 291624.09, "train_tokens_per_second": 6789.099 }, { "epoch": 1.8623872679045093, "grad_norm": 2.2396906301840747, "learning_rate": 1.892960947038519e-05, "loss": 0.671, "num_input_tokens_seen": 1984425600, "step": 2195, "train_runtime": 292249.7481, "train_tokens_per_second": 6790.17 }, { "epoch": 1.866631299734748, "grad_norm": 0.821508733622114, "learning_rate": 1.8810002264865444e-05, "loss": 0.6556, "num_input_tokens_seen": 1988847360, "step": 2200, "train_runtime": 292879.9609, "train_tokens_per_second": 6790.657 }, { "epoch": 1.8708753315649869, "grad_norm": 0.7742604167377043, "learning_rate": 1.8690545808916908e-05, "loss": 0.6713, "num_input_tokens_seen": 1993346432, "step": 2205, "train_runtime": 293530.2207, "train_tokens_per_second": 6790.941 }, { "epoch": 1.8751193633952254, "grad_norm": 0.7780945672181276, "learning_rate": 1.8571243011750604e-05, "loss": 0.6511, "num_input_tokens_seen": 1997950144, "step": 2210, "train_runtime": 294160.5008, "train_tokens_per_second": 6792.041 }, { "epoch": 1.879363395225464, "grad_norm": 0.7883576009365799, "learning_rate": 1.8452096778835348e-05, "loss": 0.6611, "num_input_tokens_seen": 2002450688, "step": 2215, "train_runtime": 294809.6886, "train_tokens_per_second": 6792.35 }, { "epoch": 1.8836074270557028, "grad_norm": 1.3753307055301716, "learning_rate": 1.833311001182707e-05, "loss": 0.6566, "num_input_tokens_seen": 2006911360, "step": 2220, "train_runtime": 295455.3095, "train_tokens_per_second": 6792.605 }, { "epoch": 1.8878514588859416, "grad_norm": 0.7211010767842578, "learning_rate": 1.821428560849809e-05, "loss": 0.6493, "num_input_tokens_seen": 2011488384, "step": 2225, "train_runtime": 296132.4569, "train_tokens_per_second": 6792.529 }, { "epoch": 1.8920954907161804, "grad_norm": 0.7499760395080998, "learning_rate": 1.8095626462666548e-05, "loss": 0.6688, "num_input_tokens_seen": 2016013248, "step": 2230, "train_runtime": 296757.4534, "train_tokens_per_second": 6793.471 }, { "epoch": 1.8963395225464192, "grad_norm": 0.7281088629603852, "learning_rate": 1.797713546412598e-05, "loss": 0.6691, "num_input_tokens_seen": 2020582592, "step": 2235, "train_runtime": 297412.3677, "train_tokens_per_second": 6793.875 }, { "epoch": 1.9005835543766578, "grad_norm": 0.6418455823156107, "learning_rate": 1.78588154985749e-05, "loss": 0.6638, "num_input_tokens_seen": 2025185600, "step": 2240, "train_runtime": 298049.559, "train_tokens_per_second": 6794.795 }, { "epoch": 1.9048275862068964, "grad_norm": 0.8741945049532132, "learning_rate": 1.7740669447546513e-05, "loss": 0.6691, "num_input_tokens_seen": 2029829952, "step": 2245, "train_runtime": 298722.6372, "train_tokens_per_second": 6795.032 }, { "epoch": 1.9090716180371352, "grad_norm": 0.841478880460149, "learning_rate": 1.762270018833857e-05, "loss": 0.6789, "num_input_tokens_seen": 2034547456, "step": 2250, "train_runtime": 299449.1091, "train_tokens_per_second": 6794.301 }, { "epoch": 1.913315649867374, "grad_norm": 0.8723647428014106, "learning_rate": 1.7504910593943267e-05, "loss": 0.6579, "num_input_tokens_seen": 2039144640, "step": 2255, "train_runtime": 300085.9181, "train_tokens_per_second": 6795.203 }, { "epoch": 1.9175596816976128, "grad_norm": 0.7892235650709017, "learning_rate": 1.738730353297732e-05, "loss": 0.6824, "num_input_tokens_seen": 2043803392, "step": 2260, "train_runtime": 300788.7082, "train_tokens_per_second": 6794.814 }, { "epoch": 1.9218037135278516, "grad_norm": 0.7722394088543671, "learning_rate": 1.726988186961202e-05, "loss": 0.6557, "num_input_tokens_seen": 2048335872, "step": 2265, "train_runtime": 301420.1009, "train_tokens_per_second": 6795.618 }, { "epoch": 1.9260477453580902, "grad_norm": 0.922769500095772, "learning_rate": 1.7152648463503605e-05, "loss": 0.6614, "num_input_tokens_seen": 2053131840, "step": 2270, "train_runtime": 302141.3095, "train_tokens_per_second": 6795.27 }, { "epoch": 1.9302917771883288, "grad_norm": 0.7824828630386448, "learning_rate": 1.7035606169723488e-05, "loss": 0.6478, "num_input_tokens_seen": 2057792768, "step": 2275, "train_runtime": 302806.6263, "train_tokens_per_second": 6795.732 }, { "epoch": 1.9345358090185676, "grad_norm": 0.8688413702016398, "learning_rate": 1.69187578386888e-05, "loss": 0.6524, "num_input_tokens_seen": 2062355392, "step": 2280, "train_runtime": 303443.8425, "train_tokens_per_second": 6796.498 }, { "epoch": 1.9387798408488064, "grad_norm": 0.9595105345229777, "learning_rate": 1.6802106316092966e-05, "loss": 0.6603, "num_input_tokens_seen": 2066871424, "step": 2285, "train_runtime": 304131.5685, "train_tokens_per_second": 6795.978 }, { "epoch": 1.9430238726790452, "grad_norm": 0.8712664797281483, "learning_rate": 1.6685654442836373e-05, "loss": 0.6587, "num_input_tokens_seen": 2071492864, "step": 2290, "train_runtime": 304825.7013, "train_tokens_per_second": 6795.663 }, { "epoch": 1.9472679045092838, "grad_norm": 0.7732671706732043, "learning_rate": 1.656940505495722e-05, "loss": 0.6524, "num_input_tokens_seen": 2075990976, "step": 2295, "train_runtime": 305476.6188, "train_tokens_per_second": 6795.908 }, { "epoch": 1.9515119363395226, "grad_norm": 0.8870290841303411, "learning_rate": 1.645336098356242e-05, "loss": 0.6405, "num_input_tokens_seen": 2080441856, "step": 2300, "train_runtime": 306111.3026, "train_tokens_per_second": 6796.358 }, { "epoch": 1.9557559681697612, "grad_norm": 0.8458061170360918, "learning_rate": 1.633752505475864e-05, "loss": 0.6634, "num_input_tokens_seen": 2085189888, "step": 2305, "train_runtime": 306781.713, "train_tokens_per_second": 6796.982 }, { "epoch": 1.96, "grad_norm": 1.3279259324289545, "learning_rate": 1.622190008958354e-05, "loss": 0.6473, "num_input_tokens_seen": 2089579008, "step": 2310, "train_runtime": 307390.5024, "train_tokens_per_second": 6797.8 }, { "epoch": 1.9642440318302388, "grad_norm": 0.9118221582118621, "learning_rate": 1.610648890393701e-05, "loss": 0.6665, "num_input_tokens_seen": 2094280256, "step": 2315, "train_runtime": 308079.9412, "train_tokens_per_second": 6797.847 }, { "epoch": 1.9684880636604776, "grad_norm": 0.8711258127478657, "learning_rate": 1.5991294308512595e-05, "loss": 0.6587, "num_input_tokens_seen": 2098990464, "step": 2320, "train_runtime": 308818.5174, "train_tokens_per_second": 6796.841 }, { "epoch": 1.9727320954907162, "grad_norm": 0.7184830235566304, "learning_rate": 1.5876319108729077e-05, "loss": 0.6661, "num_input_tokens_seen": 2103407872, "step": 2325, "train_runtime": 309463.3706, "train_tokens_per_second": 6796.953 }, { "epoch": 1.976976127320955, "grad_norm": 0.7747189164106947, "learning_rate": 1.5761566104662117e-05, "loss": 0.6518, "num_input_tokens_seen": 2107807168, "step": 2330, "train_runtime": 310069.0252, "train_tokens_per_second": 6797.864 }, { "epoch": 1.9812201591511935, "grad_norm": 0.878008091110606, "learning_rate": 1.5647038090976114e-05, "loss": 0.6593, "num_input_tokens_seen": 2112190016, "step": 2335, "train_runtime": 310683.7408, "train_tokens_per_second": 6798.521 }, { "epoch": 1.9854641909814323, "grad_norm": 0.7138571628786999, "learning_rate": 1.5532737856856062e-05, "loss": 0.6507, "num_input_tokens_seen": 2116487360, "step": 2340, "train_runtime": 311271.0869, "train_tokens_per_second": 6799.499 }, { "epoch": 1.9897082228116711, "grad_norm": 0.873326228404246, "learning_rate": 1.5418668185939715e-05, "loss": 0.6422, "num_input_tokens_seen": 2120920256, "step": 2345, "train_runtime": 311893.1375, "train_tokens_per_second": 6800.15 }, { "epoch": 1.99395225464191, "grad_norm": 0.6793405806925589, "learning_rate": 1.530483185624973e-05, "loss": 0.6492, "num_input_tokens_seen": 2125213056, "step": 2350, "train_runtime": 312486.8653, "train_tokens_per_second": 6800.968 }, { "epoch": 1.9981962864721485, "grad_norm": 0.7840816343012773, "learning_rate": 1.519123164012603e-05, "loss": 0.6551, "num_input_tokens_seen": 2129589248, "step": 2355, "train_runtime": 313085.9064, "train_tokens_per_second": 6801.933 }, { "epoch": 2.0016976127320953, "grad_norm": 0.7056893265010777, "learning_rate": 1.507787030415831e-05, "loss": 0.4932, "num_input_tokens_seen": 2133444224, "step": 2360, "train_runtime": 313640.7613, "train_tokens_per_second": 6802.191 }, { "epoch": 2.0016976127320953, "eval_loss": 0.71119624376297, "eval_runtime": 1056.465, "eval_samples_per_second": 2.884, "eval_steps_per_second": 0.091, "num_input_tokens_seen": 2133444224, "step": 2360 }, { "epoch": 2.005941644562334, "grad_norm": 0.8558740917942909, "learning_rate": 1.4964750609118614e-05, "loss": 0.5706, "num_input_tokens_seen": 2138178304, "step": 2365, "train_runtime": 315463.138, "train_tokens_per_second": 6777.902 }, { "epoch": 2.010185676392573, "grad_norm": 0.8240864942611333, "learning_rate": 1.4851875309894159e-05, "loss": 0.5672, "num_input_tokens_seen": 2142597568, "step": 2370, "train_runtime": 316074.4747, "train_tokens_per_second": 6778.774 }, { "epoch": 2.0144297082228118, "grad_norm": 0.9448165249761411, "learning_rate": 1.4739247155420183e-05, "loss": 0.5481, "num_input_tokens_seen": 2147211968, "step": 2375, "train_runtime": 316715.0106, "train_tokens_per_second": 6779.634 }, { "epoch": 2.0186737400530506, "grad_norm": 0.8633716944676363, "learning_rate": 1.4626868888613027e-05, "loss": 0.5397, "num_input_tokens_seen": 2151752896, "step": 2380, "train_runtime": 317376.0878, "train_tokens_per_second": 6779.82 }, { "epoch": 2.022917771883289, "grad_norm": 0.809745604875603, "learning_rate": 1.4514743246303359e-05, "loss": 0.5531, "num_input_tokens_seen": 2156288704, "step": 2385, "train_runtime": 318045.818, "train_tokens_per_second": 6779.805 }, { "epoch": 2.0271618037135277, "grad_norm": 0.8968293184086553, "learning_rate": 1.4402872959169461e-05, "loss": 0.5337, "num_input_tokens_seen": 2160913088, "step": 2390, "train_runtime": 318702.9313, "train_tokens_per_second": 6780.336 }, { "epoch": 2.0314058355437665, "grad_norm": 0.9550756849016048, "learning_rate": 1.4291260751670816e-05, "loss": 0.5366, "num_input_tokens_seen": 2165574976, "step": 2395, "train_runtime": 319403.5503, "train_tokens_per_second": 6780.059 }, { "epoch": 2.0356498673740053, "grad_norm": 1.207409378421572, "learning_rate": 1.4179909341981625e-05, "loss": 0.5345, "num_input_tokens_seen": 2170092736, "step": 2400, "train_runtime": 320033.1824, "train_tokens_per_second": 6780.837 }, { "epoch": 2.039893899204244, "grad_norm": 0.8757416767488818, "learning_rate": 1.4068821441924779e-05, "loss": 0.5715, "num_input_tokens_seen": 2174494400, "step": 2405, "train_runtime": 320671.8092, "train_tokens_per_second": 6781.059 }, { "epoch": 2.044137931034483, "grad_norm": 0.8349852718141944, "learning_rate": 1.3957999756905643e-05, "loss": 0.5607, "num_input_tokens_seen": 2178937728, "step": 2410, "train_runtime": 321299.9151, "train_tokens_per_second": 6781.632 }, { "epoch": 2.0483819628647213, "grad_norm": 0.8313527873747903, "learning_rate": 1.3847446985846297e-05, "loss": 0.5364, "num_input_tokens_seen": 2183459520, "step": 2415, "train_runtime": 321952.4508, "train_tokens_per_second": 6781.932 }, { "epoch": 2.05262599469496, "grad_norm": 1.5262840043879295, "learning_rate": 1.3737165821119752e-05, "loss": 0.5404, "num_input_tokens_seen": 2187827712, "step": 2420, "train_runtime": 322540.9554, "train_tokens_per_second": 6783.1 }, { "epoch": 2.056870026525199, "grad_norm": 2.1756261698457076, "learning_rate": 1.3627158948484391e-05, "loss": 0.5469, "num_input_tokens_seen": 2192377216, "step": 2425, "train_runtime": 323190.2135, "train_tokens_per_second": 6783.551 }, { "epoch": 2.0611140583554377, "grad_norm": 1.0913830108446643, "learning_rate": 1.351742904701856e-05, "loss": 0.5683, "num_input_tokens_seen": 2196995328, "step": 2430, "train_runtime": 323877.7351, "train_tokens_per_second": 6783.41 }, { "epoch": 2.0653580901856765, "grad_norm": 0.9641578470507003, "learning_rate": 1.3407978789055311e-05, "loss": 0.551, "num_input_tokens_seen": 2201593728, "step": 2435, "train_runtime": 324546.3078, "train_tokens_per_second": 6783.604 }, { "epoch": 2.0696021220159153, "grad_norm": 0.79070536279513, "learning_rate": 1.3298810840117348e-05, "loss": 0.5296, "num_input_tokens_seen": 2206299712, "step": 2440, "train_runtime": 325212.3944, "train_tokens_per_second": 6784.181 }, { "epoch": 2.0738461538461537, "grad_norm": 0.9950759732472904, "learning_rate": 1.3189927858852092e-05, "loss": 0.5623, "num_input_tokens_seen": 2210768256, "step": 2445, "train_runtime": 325832.427, "train_tokens_per_second": 6784.985 }, { "epoch": 2.0780901856763925, "grad_norm": 1.0145751132111058, "learning_rate": 1.3081332496966923e-05, "loss": 0.5454, "num_input_tokens_seen": 2215064064, "step": 2450, "train_runtime": 326434.2676, "train_tokens_per_second": 6785.636 }, { "epoch": 2.0823342175066313, "grad_norm": 1.2823230060644373, "learning_rate": 1.297302739916463e-05, "loss": 0.5435, "num_input_tokens_seen": 2219600896, "step": 2455, "train_runtime": 327106.8846, "train_tokens_per_second": 6785.552 }, { "epoch": 2.08657824933687, "grad_norm": 0.9419195367761739, "learning_rate": 1.2865015203078996e-05, "loss": 0.5445, "num_input_tokens_seen": 2224140416, "step": 2460, "train_runtime": 327852.8283, "train_tokens_per_second": 6783.96 }, { "epoch": 2.090822281167109, "grad_norm": 1.125875379008506, "learning_rate": 1.27572985392105e-05, "loss": 0.5443, "num_input_tokens_seen": 2228717248, "step": 2465, "train_runtime": 328535.4625, "train_tokens_per_second": 6783.795 }, { "epoch": 2.0950663129973477, "grad_norm": 1.1324409125579475, "learning_rate": 1.2649880030862393e-05, "loss": 0.5599, "num_input_tokens_seen": 2233320128, "step": 2470, "train_runtime": 329151.185, "train_tokens_per_second": 6785.089 }, { "epoch": 2.099310344827586, "grad_norm": 0.931872300199955, "learning_rate": 1.2542762294076631e-05, "loss": 0.5637, "num_input_tokens_seen": 2237752384, "step": 2475, "train_runtime": 329773.7972, "train_tokens_per_second": 6785.719 }, { "epoch": 2.103554376657825, "grad_norm": 0.8489286981286124, "learning_rate": 1.2435947937570355e-05, "loss": 0.5598, "num_input_tokens_seen": 2242141568, "step": 2480, "train_runtime": 330402.9852, "train_tokens_per_second": 6786.081 }, { "epoch": 2.1077984084880637, "grad_norm": 1.1131489144682933, "learning_rate": 1.2329439562672178e-05, "loss": 0.5418, "num_input_tokens_seen": 2246654592, "step": 2485, "train_runtime": 331056.83, "train_tokens_per_second": 6786.311 }, { "epoch": 2.1120424403183025, "grad_norm": 0.9322580519781613, "learning_rate": 1.2223239763258965e-05, "loss": 0.5505, "num_input_tokens_seen": 2251247168, "step": 2490, "train_runtime": 331724.102, "train_tokens_per_second": 6786.505 }, { "epoch": 2.1162864721485413, "grad_norm": 0.9283803102424425, "learning_rate": 1.2117351125692603e-05, "loss": 0.5568, "num_input_tokens_seen": 2255680768, "step": 2495, "train_runtime": 332394.4276, "train_tokens_per_second": 6786.157 }, { "epoch": 2.12053050397878, "grad_norm": 1.1047391998064584, "learning_rate": 1.2011776228757024e-05, "loss": 0.5505, "num_input_tokens_seen": 2260087168, "step": 2500, "train_runtime": 333031.3463, "train_tokens_per_second": 6786.41 }, { "epoch": 2.1247745358090184, "grad_norm": 0.975091099261222, "learning_rate": 1.1906517643595408e-05, "loss": 0.5573, "num_input_tokens_seen": 2264578560, "step": 2505, "train_runtime": 333699.72, "train_tokens_per_second": 6786.276 }, { "epoch": 2.1290185676392572, "grad_norm": 1.1511567847202058, "learning_rate": 1.180157793364756e-05, "loss": 0.5413, "num_input_tokens_seen": 2269041472, "step": 2510, "train_runtime": 334339.6624, "train_tokens_per_second": 6786.636 }, { "epoch": 2.133262599469496, "grad_norm": 1.0466460806645501, "learning_rate": 1.1696959654587474e-05, "loss": 0.5493, "num_input_tokens_seen": 2273598720, "step": 2515, "train_runtime": 334997.2489, "train_tokens_per_second": 6786.918 }, { "epoch": 2.137506631299735, "grad_norm": 1.0291472253443341, "learning_rate": 1.1592665354261118e-05, "loss": 0.5456, "num_input_tokens_seen": 2278146944, "step": 2520, "train_runtime": 335632.7848, "train_tokens_per_second": 6787.617 }, { "epoch": 2.1417506631299736, "grad_norm": 1.0091794838431885, "learning_rate": 1.1488697572624351e-05, "loss": 0.5668, "num_input_tokens_seen": 2282573568, "step": 2525, "train_runtime": 336253.6573, "train_tokens_per_second": 6788.249 }, { "epoch": 2.145994694960212, "grad_norm": 0.85992217862161, "learning_rate": 1.138505884168109e-05, "loss": 0.5308, "num_input_tokens_seen": 2287001600, "step": 2530, "train_runtime": 336902.3826, "train_tokens_per_second": 6788.321 }, { "epoch": 2.150238726790451, "grad_norm": 0.8608747847941026, "learning_rate": 1.1281751685421646e-05, "loss": 0.5605, "num_input_tokens_seen": 2291406080, "step": 2535, "train_runtime": 337530.1254, "train_tokens_per_second": 6788.745 }, { "epoch": 2.1544827586206896, "grad_norm": 1.0839910126914474, "learning_rate": 1.1178778619761209e-05, "loss": 0.5507, "num_input_tokens_seen": 2295897472, "step": 2540, "train_runtime": 338146.0422, "train_tokens_per_second": 6789.662 }, { "epoch": 2.1587267904509284, "grad_norm": 0.8806247078251732, "learning_rate": 1.1076142152478686e-05, "loss": 0.5449, "num_input_tokens_seen": 2300505152, "step": 2545, "train_runtime": 338818.075, "train_tokens_per_second": 6789.795 }, { "epoch": 2.162970822281167, "grad_norm": 1.1516459030706268, "learning_rate": 1.0973844783155474e-05, "loss": 0.5267, "num_input_tokens_seen": 2304838976, "step": 2550, "train_runtime": 339432.6849, "train_tokens_per_second": 6790.268 }, { "epoch": 2.167214854111406, "grad_norm": 0.8899542374126376, "learning_rate": 1.0871889003114743e-05, "loss": 0.5415, "num_input_tokens_seen": 2309598144, "step": 2555, "train_runtime": 340127.2354, "train_tokens_per_second": 6790.395 }, { "epoch": 2.1714588859416444, "grad_norm": 0.9275051879320549, "learning_rate": 1.0770277295360629e-05, "loss": 0.535, "num_input_tokens_seen": 2314332800, "step": 2560, "train_runtime": 340820.4477, "train_tokens_per_second": 6790.475 }, { "epoch": 2.175702917771883, "grad_norm": 1.0929509877136006, "learning_rate": 1.066901213451785e-05, "loss": 0.5407, "num_input_tokens_seen": 2318735744, "step": 2565, "train_runtime": 341455.108, "train_tokens_per_second": 6790.748 }, { "epoch": 2.179946949602122, "grad_norm": 1.2365680582016083, "learning_rate": 1.0568095986771414e-05, "loss": 0.5256, "num_input_tokens_seen": 2323017216, "step": 2570, "train_runtime": 342078.7972, "train_tokens_per_second": 6790.883 }, { "epoch": 2.184190981432361, "grad_norm": 0.9117201092121693, "learning_rate": 1.0467531309806547e-05, "loss": 0.5471, "num_input_tokens_seen": 2327511360, "step": 2575, "train_runtime": 342733.0181, "train_tokens_per_second": 6791.033 }, { "epoch": 2.1884350132625996, "grad_norm": 0.9389916789629429, "learning_rate": 1.0367320552748849e-05, "loss": 0.533, "num_input_tokens_seen": 2332033792, "step": 2580, "train_runtime": 343352.1052, "train_tokens_per_second": 6791.96 }, { "epoch": 2.1926790450928384, "grad_norm": 0.9268628668955959, "learning_rate": 1.0267466156104655e-05, "loss": 0.5493, "num_input_tokens_seen": 2336623744, "step": 2585, "train_runtime": 344011.0131, "train_tokens_per_second": 6792.293 }, { "epoch": 2.1969230769230768, "grad_norm": 0.9632958346283522, "learning_rate": 1.0167970551701586e-05, "loss": 0.5585, "num_input_tokens_seen": 2341059904, "step": 2590, "train_runtime": 344653.4787, "train_tokens_per_second": 6792.503 }, { "epoch": 2.2011671087533156, "grad_norm": 0.9858607973372651, "learning_rate": 1.0068836162629333e-05, "loss": 0.551, "num_input_tokens_seen": 2345544192, "step": 2595, "train_runtime": 345280.1341, "train_tokens_per_second": 6793.163 }, { "epoch": 2.2054111405835544, "grad_norm": 0.9219599125064627, "learning_rate": 9.970065403180648e-06, "loss": 0.5456, "num_input_tokens_seen": 2350091328, "step": 2600, "train_runtime": 345944.8214, "train_tokens_per_second": 6793.255 }, { "epoch": 2.209655172413793, "grad_norm": 1.0056863158147027, "learning_rate": 9.871660678792532e-06, "loss": 0.5573, "num_input_tokens_seen": 2354507008, "step": 2605, "train_runtime": 346586.1214, "train_tokens_per_second": 6793.426 }, { "epoch": 2.213899204244032, "grad_norm": 0.805746716680293, "learning_rate": 9.77362438598769e-06, "loss": 0.5316, "num_input_tokens_seen": 2358953152, "step": 2610, "train_runtime": 347186.5132, "train_tokens_per_second": 6794.484 }, { "epoch": 2.2181432360742708, "grad_norm": 1.0149168469719647, "learning_rate": 9.675958912316091e-06, "loss": 0.5582, "num_input_tokens_seen": 2363513408, "step": 2615, "train_runtime": 347848.8662, "train_tokens_per_second": 6794.656 }, { "epoch": 2.222387267904509, "grad_norm": 0.9339666893590254, "learning_rate": 9.578666636296946e-06, "loss": 0.5468, "num_input_tokens_seen": 2368091328, "step": 2620, "train_runtime": 348511.7283, "train_tokens_per_second": 6794.868 }, { "epoch": 2.226631299734748, "grad_norm": 0.8237811196852415, "learning_rate": 9.481749927360627e-06, "loss": 0.5219, "num_input_tokens_seen": 2372630144, "step": 2625, "train_runtime": 349198.393, "train_tokens_per_second": 6794.505 }, { "epoch": 2.2308753315649867, "grad_norm": 0.9312360048745897, "learning_rate": 9.385211145791126e-06, "loss": 0.5316, "num_input_tokens_seen": 2377249792, "step": 2630, "train_runtime": 349861.9157, "train_tokens_per_second": 6794.823 }, { "epoch": 2.2351193633952255, "grad_norm": 1.0941579207092957, "learning_rate": 9.289052642668416e-06, "loss": 0.5307, "num_input_tokens_seen": 2381752576, "step": 2635, "train_runtime": 350477.7588, "train_tokens_per_second": 6795.731 }, { "epoch": 2.2393633952254643, "grad_norm": 1.054157711767946, "learning_rate": 9.193276759811339e-06, "loss": 0.5608, "num_input_tokens_seen": 2386147072, "step": 2640, "train_runtime": 351111.8395, "train_tokens_per_second": 6795.974 }, { "epoch": 2.2436074270557027, "grad_norm": 0.8910120445576349, "learning_rate": 9.097885829720443e-06, "loss": 0.5412, "num_input_tokens_seen": 2390898880, "step": 2645, "train_runtime": 351810.8417, "train_tokens_per_second": 6795.978 }, { "epoch": 2.2478514588859415, "grad_norm": 1.2007055994992777, "learning_rate": 9.002882175521272e-06, "loss": 0.5399, "num_input_tokens_seen": 2395503232, "step": 2650, "train_runtime": 352435.7792, "train_tokens_per_second": 6796.992 }, { "epoch": 2.2520954907161803, "grad_norm": 1.1196939855484749, "learning_rate": 8.90826811090775e-06, "loss": 0.5167, "num_input_tokens_seen": 2399908928, "step": 2655, "train_runtime": 353060.1062, "train_tokens_per_second": 6797.451 }, { "epoch": 2.2520954907161803, "eval_loss": 0.7378480434417725, "eval_runtime": 1056.8523, "eval_samples_per_second": 2.883, "eval_steps_per_second": 0.091, "num_input_tokens_seen": 2399908928, "step": 2655 }, { "epoch": 2.256339522546419, "grad_norm": 1.2536339396950835, "learning_rate": 8.814045940085832e-06, "loss": 0.5282, "num_input_tokens_seen": 2404340864, "step": 2660, "train_runtime": 354734.293, "train_tokens_per_second": 6777.864 }, { "epoch": 2.260583554376658, "grad_norm": 1.0188614881940494, "learning_rate": 8.720217957717409e-06, "loss": 0.5746, "num_input_tokens_seen": 2408905408, "step": 2665, "train_runtime": 355379.4662, "train_tokens_per_second": 6778.403 }, { "epoch": 2.2648275862068967, "grad_norm": 0.8497257096281086, "learning_rate": 8.62678644886439e-06, "loss": 0.5434, "num_input_tokens_seen": 2413171200, "step": 2670, "train_runtime": 355989.2119, "train_tokens_per_second": 6778.776 }, { "epoch": 2.269071618037135, "grad_norm": 0.9253941979058675, "learning_rate": 8.533753688933093e-06, "loss": 0.5716, "num_input_tokens_seen": 2417806976, "step": 2675, "train_runtime": 356663.8352, "train_tokens_per_second": 6778.952 }, { "epoch": 2.273315649867374, "grad_norm": 0.7919650133709085, "learning_rate": 8.441121943618797e-06, "loss": 0.5217, "num_input_tokens_seen": 2422469504, "step": 2680, "train_runtime": 357398.8789, "train_tokens_per_second": 6778.056 }, { "epoch": 2.2775596816976127, "grad_norm": 0.9826186100423545, "learning_rate": 8.34889346885058e-06, "loss": 0.537, "num_input_tokens_seen": 2427001408, "step": 2685, "train_runtime": 358060.275, "train_tokens_per_second": 6778.192 }, { "epoch": 2.2818037135278515, "grad_norm": 0.9270721696870025, "learning_rate": 8.257070510736375e-06, "loss": 0.5473, "num_input_tokens_seen": 2431585984, "step": 2690, "train_runtime": 358720.1033, "train_tokens_per_second": 6778.505 }, { "epoch": 2.2860477453580903, "grad_norm": 1.0463399115668766, "learning_rate": 8.165655305508283e-06, "loss": 0.5199, "num_input_tokens_seen": 2436195008, "step": 2695, "train_runtime": 359371.4989, "train_tokens_per_second": 6779.043 }, { "epoch": 2.290291777188329, "grad_norm": 0.8433980949747677, "learning_rate": 8.074650079468061e-06, "loss": 0.5406, "num_input_tokens_seen": 2440643712, "step": 2700, "train_runtime": 360023.2487, "train_tokens_per_second": 6779.128 }, { "epoch": 2.2945358090185675, "grad_norm": 0.8344761722848487, "learning_rate": 7.984057048932994e-06, "loss": 0.523, "num_input_tokens_seen": 2445383360, "step": 2705, "train_runtime": 360768.2247, "train_tokens_per_second": 6778.267 }, { "epoch": 2.2987798408488063, "grad_norm": 0.9238013027939782, "learning_rate": 7.893878420181814e-06, "loss": 0.5394, "num_input_tokens_seen": 2449944832, "step": 2710, "train_runtime": 361432.627, "train_tokens_per_second": 6778.427 }, { "epoch": 2.303023872679045, "grad_norm": 0.830003880013244, "learning_rate": 7.80411638940107e-06, "loss": 0.5329, "num_input_tokens_seen": 2454313856, "step": 2715, "train_runtime": 362054.2575, "train_tokens_per_second": 6778.856 }, { "epoch": 2.307267904509284, "grad_norm": 1.2592294301355798, "learning_rate": 7.714773142631553e-06, "loss": 0.5287, "num_input_tokens_seen": 2458717888, "step": 2720, "train_runtime": 362669.4395, "train_tokens_per_second": 6779.501 }, { "epoch": 2.3115119363395227, "grad_norm": 1.04619682795218, "learning_rate": 7.625850855715125e-06, "loss": 0.5494, "num_input_tokens_seen": 2463239104, "step": 2725, "train_runtime": 363325.7794, "train_tokens_per_second": 6779.698 }, { "epoch": 2.3157559681697615, "grad_norm": 0.9229168180828327, "learning_rate": 7.53735169424169e-06, "loss": 0.5247, "num_input_tokens_seen": 2467903744, "step": 2730, "train_runtime": 364007.0501, "train_tokens_per_second": 6779.824 }, { "epoch": 2.32, "grad_norm": 0.9124446948723857, "learning_rate": 7.449277813496469e-06, "loss": 0.5264, "num_input_tokens_seen": 2472315328, "step": 2735, "train_runtime": 364630.5034, "train_tokens_per_second": 6780.331 }, { "epoch": 2.3242440318302386, "grad_norm": 0.9029907706736199, "learning_rate": 7.361631358407511e-06, "loss": 0.5462, "num_input_tokens_seen": 2476917568, "step": 2740, "train_runtime": 365293.8544, "train_tokens_per_second": 6780.617 }, { "epoch": 2.3284880636604774, "grad_norm": 0.965769243054048, "learning_rate": 7.274414463493457e-06, "loss": 0.5276, "num_input_tokens_seen": 2481395968, "step": 2745, "train_runtime": 365941.4705, "train_tokens_per_second": 6780.855 }, { "epoch": 2.3327320954907163, "grad_norm": 1.0526371028734811, "learning_rate": 7.1876292528115425e-06, "loss": 0.524, "num_input_tokens_seen": 2485823424, "step": 2750, "train_runtime": 366578.7434, "train_tokens_per_second": 6781.144 }, { "epoch": 2.336976127320955, "grad_norm": 1.029915538876609, "learning_rate": 7.101277839905887e-06, "loss": 0.5337, "num_input_tokens_seen": 2490461696, "step": 2755, "train_runtime": 367244.9697, "train_tokens_per_second": 6781.473 }, { "epoch": 2.3412201591511934, "grad_norm": 0.8452644471628386, "learning_rate": 7.015362327756009e-06, "loss": 0.5565, "num_input_tokens_seen": 2494895104, "step": 2760, "train_runtime": 367940.8003, "train_tokens_per_second": 6780.697 }, { "epoch": 2.345464190981432, "grad_norm": 1.0284851305831375, "learning_rate": 6.92988480872562e-06, "loss": 0.5551, "num_input_tokens_seen": 2499453376, "step": 2765, "train_runtime": 368624.734, "train_tokens_per_second": 6780.482 }, { "epoch": 2.349708222811671, "grad_norm": 0.9746799945929299, "learning_rate": 6.844847364511667e-06, "loss": 0.5652, "num_input_tokens_seen": 2503898176, "step": 2770, "train_runtime": 369245.4874, "train_tokens_per_second": 6781.121 }, { "epoch": 2.35395225464191, "grad_norm": 1.0403632612371465, "learning_rate": 6.760252066093598e-06, "loss": 0.536, "num_input_tokens_seen": 2508404032, "step": 2775, "train_runtime": 369898.6243, "train_tokens_per_second": 6781.328 }, { "epoch": 2.3581962864721486, "grad_norm": 1.1324249124610484, "learning_rate": 6.676100973683019e-06, "loss": 0.5293, "num_input_tokens_seen": 2512983360, "step": 2780, "train_runtime": 370586.7059, "train_tokens_per_second": 6781.094 }, { "epoch": 2.3624403183023874, "grad_norm": 0.9136829751433924, "learning_rate": 6.592396136673396e-06, "loss": 0.5133, "num_input_tokens_seen": 2517545792, "step": 2785, "train_runtime": 371255.3251, "train_tokens_per_second": 6781.171 }, { "epoch": 2.3666843501326262, "grad_norm": 1.0337343562459835, "learning_rate": 6.509139593590263e-06, "loss": 0.5449, "num_input_tokens_seen": 2521995456, "step": 2790, "train_runtime": 371858.1677, "train_tokens_per_second": 6782.144 }, { "epoch": 2.3709283819628646, "grad_norm": 0.8876485451947176, "learning_rate": 6.426333372041482e-06, "loss": 0.5321, "num_input_tokens_seen": 2526578304, "step": 2795, "train_runtime": 372512.2967, "train_tokens_per_second": 6782.537 }, { "epoch": 2.3751724137931034, "grad_norm": 1.01633470136039, "learning_rate": 6.343979488667923e-06, "loss": 0.546, "num_input_tokens_seen": 2531181120, "step": 2800, "train_runtime": 373190.8361, "train_tokens_per_second": 6782.538 }, { "epoch": 2.379416445623342, "grad_norm": 1.17898509851511, "learning_rate": 6.2620799490943296e-06, "loss": 0.5339, "num_input_tokens_seen": 2535604096, "step": 2805, "train_runtime": 373804.9218, "train_tokens_per_second": 6783.228 }, { "epoch": 2.383660477453581, "grad_norm": 0.995446476535709, "learning_rate": 6.18063674788047e-06, "loss": 0.5294, "num_input_tokens_seen": 2539962496, "step": 2810, "train_runtime": 374428.8855, "train_tokens_per_second": 6783.565 }, { "epoch": 2.38790450928382, "grad_norm": 1.4781781798293818, "learning_rate": 6.099651868472578e-06, "loss": 0.5377, "num_input_tokens_seen": 2544523264, "step": 2815, "train_runtime": 375082.1614, "train_tokens_per_second": 6783.909 }, { "epoch": 2.392148541114058, "grad_norm": 0.9042911431359419, "learning_rate": 6.0191272831550296e-06, "loss": 0.5277, "num_input_tokens_seen": 2549001728, "step": 2820, "train_runtime": 375739.4269, "train_tokens_per_second": 6783.961 }, { "epoch": 2.396392572944297, "grad_norm": 0.9427728279286258, "learning_rate": 5.939064953002324e-06, "loss": 0.5286, "num_input_tokens_seen": 2553568448, "step": 2825, "train_runtime": 376350.7934, "train_tokens_per_second": 6785.075 }, { "epoch": 2.4006366047745358, "grad_norm": 1.140690055341589, "learning_rate": 5.859466827831325e-06, "loss": 0.5404, "num_input_tokens_seen": 2557913088, "step": 2830, "train_runtime": 376975.3041, "train_tokens_per_second": 6785.36 }, { "epoch": 2.4048806366047746, "grad_norm": 0.973992690753586, "learning_rate": 5.780334846153762e-06, "loss": 0.5361, "num_input_tokens_seen": 2562388224, "step": 2835, "train_runtime": 377604.1066, "train_tokens_per_second": 6785.912 }, { "epoch": 2.4091246684350134, "grad_norm": 0.8947292451598252, "learning_rate": 5.701670935129033e-06, "loss": 0.5458, "num_input_tokens_seen": 2566790976, "step": 2840, "train_runtime": 378241.5611, "train_tokens_per_second": 6786.116 }, { "epoch": 2.413368700265252, "grad_norm": 0.9334940824258273, "learning_rate": 5.623477010517269e-06, "loss": 0.5225, "num_input_tokens_seen": 2571270592, "step": 2845, "train_runtime": 378873.6154, "train_tokens_per_second": 6786.618 }, { "epoch": 2.4176127320954905, "grad_norm": 0.7746666166786925, "learning_rate": 5.545754976632672e-06, "loss": 0.534, "num_input_tokens_seen": 2575889152, "step": 2850, "train_runtime": 379547.952, "train_tokens_per_second": 6786.729 }, { "epoch": 2.4218567639257294, "grad_norm": 1.0182336381313688, "learning_rate": 5.468506726297149e-06, "loss": 0.5221, "num_input_tokens_seen": 2580260608, "step": 2855, "train_runtime": 380137.2732, "train_tokens_per_second": 6787.707 }, { "epoch": 2.426100795755968, "grad_norm": 0.9862563470365936, "learning_rate": 5.391734140794183e-06, "loss": 0.5398, "num_input_tokens_seen": 2585142720, "step": 2860, "train_runtime": 380839.0154, "train_tokens_per_second": 6788.02 }, { "epoch": 2.430344827586207, "grad_norm": 0.9041618526091733, "learning_rate": 5.3154390898230846e-06, "loss": 0.512, "num_input_tokens_seen": 2589755136, "step": 2865, "train_runtime": 381496.5415, "train_tokens_per_second": 6788.411 }, { "epoch": 2.4345888594164458, "grad_norm": 0.914359636860779, "learning_rate": 5.2396234314533665e-06, "loss": 0.5143, "num_input_tokens_seen": 2594081792, "step": 2870, "train_runtime": 382111.5001, "train_tokens_per_second": 6788.808 }, { "epoch": 2.438832891246684, "grad_norm": 1.033477002199706, "learning_rate": 5.16428901207959e-06, "loss": 0.5324, "num_input_tokens_seen": 2598687872, "step": 2875, "train_runtime": 382756.5754, "train_tokens_per_second": 6789.401 }, { "epoch": 2.443076923076923, "grad_norm": 0.9057174065952954, "learning_rate": 5.089437666376304e-06, "loss": 0.5263, "num_input_tokens_seen": 2603366144, "step": 2880, "train_runtime": 383439.9985, "train_tokens_per_second": 6789.501 }, { "epoch": 2.4473209549071617, "grad_norm": 0.8529850767813882, "learning_rate": 5.015071217253428e-06, "loss": 0.5113, "num_input_tokens_seen": 2608005440, "step": 2885, "train_runtime": 384090.9187, "train_tokens_per_second": 6790.073 }, { "epoch": 2.4515649867374005, "grad_norm": 1.3879570401538315, "learning_rate": 4.941191475811843e-06, "loss": 0.5222, "num_input_tokens_seen": 2612660480, "step": 2890, "train_runtime": 384747.1837, "train_tokens_per_second": 6790.59 }, { "epoch": 2.4558090185676393, "grad_norm": 1.2654753413045388, "learning_rate": 4.867800241299275e-06, "loss": 0.5128, "num_input_tokens_seen": 2617068928, "step": 2895, "train_runtime": 385371.9974, "train_tokens_per_second": 6791.02 }, { "epoch": 2.460053050397878, "grad_norm": 0.9035128336837777, "learning_rate": 4.794899301066477e-06, "loss": 0.529, "num_input_tokens_seen": 2621648384, "step": 2900, "train_runtime": 386085.7096, "train_tokens_per_second": 6790.327 }, { "epoch": 2.464297082228117, "grad_norm": 0.8567446173412482, "learning_rate": 4.72249043052371e-06, "loss": 0.5269, "num_input_tokens_seen": 2626115776, "step": 2905, "train_runtime": 386725.7007, "train_tokens_per_second": 6790.642 }, { "epoch": 2.4685411140583553, "grad_norm": 0.8912412665948739, "learning_rate": 4.650575393097498e-06, "loss": 0.5387, "num_input_tokens_seen": 2630631040, "step": 2910, "train_runtime": 387368.1112, "train_tokens_per_second": 6791.037 }, { "epoch": 2.472785145888594, "grad_norm": 1.1130459835714437, "learning_rate": 4.57915594018768e-06, "loss": 0.5337, "num_input_tokens_seen": 2635143296, "step": 2915, "train_runtime": 388043.1985, "train_tokens_per_second": 6790.85 }, { "epoch": 2.477029177718833, "grad_norm": 0.9430504436597011, "learning_rate": 4.508233811124765e-06, "loss": 0.5261, "num_input_tokens_seen": 2639559808, "step": 2920, "train_runtime": 388662.674, "train_tokens_per_second": 6791.39 }, { "epoch": 2.4812732095490717, "grad_norm": 0.9383789050215547, "learning_rate": 4.437810733127571e-06, "loss": 0.5464, "num_input_tokens_seen": 2644172096, "step": 2925, "train_runtime": 389299.3342, "train_tokens_per_second": 6792.131 }, { "epoch": 2.4855172413793105, "grad_norm": 0.9063379227339012, "learning_rate": 4.367888421261154e-06, "loss": 0.509, "num_input_tokens_seen": 2648731712, "step": 2930, "train_runtime": 389972.3983, "train_tokens_per_second": 6792.1 }, { "epoch": 2.489761273209549, "grad_norm": 0.8758741295078302, "learning_rate": 4.298468578395029e-06, "loss": 0.5072, "num_input_tokens_seen": 2653268800, "step": 2935, "train_runtime": 390605.8102, "train_tokens_per_second": 6792.702 }, { "epoch": 2.4940053050397877, "grad_norm": 1.0498774972803595, "learning_rate": 4.229552895161754e-06, "loss": 0.5399, "num_input_tokens_seen": 2657767616, "step": 2940, "train_runtime": 391245.0688, "train_tokens_per_second": 6793.102 }, { "epoch": 2.4982493368700265, "grad_norm": 0.9576568252690971, "learning_rate": 4.161143049915661e-06, "loss": 0.5167, "num_input_tokens_seen": 2662204992, "step": 2945, "train_runtime": 391840.1072, "train_tokens_per_second": 6794.111 }, { "epoch": 2.5024933687002653, "grad_norm": 1.297030102246004, "learning_rate": 4.093240708692098e-06, "loss": 0.5434, "num_input_tokens_seen": 2666829440, "step": 2950, "train_runtime": 392486.1885, "train_tokens_per_second": 6794.709 }, { "epoch": 2.5024933687002653, "eval_loss": 0.7277879118919373, "eval_runtime": 1062.7442, "eval_samples_per_second": 2.867, "eval_steps_per_second": 0.09, "num_input_tokens_seen": 2666829440, "step": 2950 }, { "epoch": 2.506737400530504, "grad_norm": 0.9337370037141564, "learning_rate": 4.025847525166737e-06, "loss": 0.521, "num_input_tokens_seen": 2671303040, "step": 2955, "train_runtime": 394192.7243, "train_tokens_per_second": 6776.642 }, { "epoch": 2.510981432360743, "grad_norm": 1.0142479442870918, "learning_rate": 3.958965140615395e-06, "loss": 0.5013, "num_input_tokens_seen": 2675854528, "step": 2960, "train_runtime": 394856.393, "train_tokens_per_second": 6776.779 }, { "epoch": 2.5152254641909817, "grad_norm": 0.9673554697201667, "learning_rate": 3.892595183874015e-06, "loss": 0.5138, "num_input_tokens_seen": 2680310976, "step": 2965, "train_runtime": 395478.0069, "train_tokens_per_second": 6777.396 }, { "epoch": 2.51946949602122, "grad_norm": 0.9773791433571442, "learning_rate": 3.826739271299004e-06, "loss": 0.5374, "num_input_tokens_seen": 2685063360, "step": 2970, "train_runtime": 396163.4888, "train_tokens_per_second": 6777.665 }, { "epoch": 2.523713527851459, "grad_norm": 0.8762686166416458, "learning_rate": 3.761399006727878e-06, "loss": 0.518, "num_input_tokens_seen": 2689742912, "step": 2975, "train_runtime": 396848.2908, "train_tokens_per_second": 6777.761 }, { "epoch": 2.5279575596816977, "grad_norm": 0.9814246027282987, "learning_rate": 3.696575981440198e-06, "loss": 0.5333, "num_input_tokens_seen": 2694003776, "step": 2980, "train_runtime": 397469.5249, "train_tokens_per_second": 6777.888 }, { "epoch": 2.5322015915119365, "grad_norm": 1.0494687013783062, "learning_rate": 3.632271774118812e-06, "loss": 0.5249, "num_input_tokens_seen": 2698482432, "step": 2985, "train_runtime": 398112.2116, "train_tokens_per_second": 6778.196 }, { "epoch": 2.536445623342175, "grad_norm": 0.9167136316488117, "learning_rate": 3.568487950811414e-06, "loss": 0.5179, "num_input_tokens_seen": 2703208384, "step": 2990, "train_runtime": 398770.7605, "train_tokens_per_second": 6778.853 }, { "epoch": 2.5406896551724136, "grad_norm": 1.0631373570087592, "learning_rate": 3.5052260648924056e-06, "loss": 0.5258, "num_input_tokens_seen": 2707736448, "step": 2995, "train_runtime": 399417.3719, "train_tokens_per_second": 6779.216 }, { "epoch": 2.5449336870026524, "grad_norm": 0.8646476638396003, "learning_rate": 3.442487657025059e-06, "loss": 0.5148, "num_input_tokens_seen": 2712052544, "step": 3000, "train_runtime": 400011.3913, "train_tokens_per_second": 6779.938 }, { "epoch": 2.5491777188328912, "grad_norm": 1.3814464306803582, "learning_rate": 3.380274255124008e-06, "loss": 0.5328, "num_input_tokens_seen": 2716590016, "step": 3005, "train_runtime": 400647.6096, "train_tokens_per_second": 6780.497 }, { "epoch": 2.55342175066313, "grad_norm": 0.9608139693650856, "learning_rate": 3.318587374318008e-06, "loss": 0.5221, "num_input_tokens_seen": 2721057600, "step": 3010, "train_runtime": 401293.542, "train_tokens_per_second": 6780.716 }, { "epoch": 2.557665782493369, "grad_norm": 1.052375691071485, "learning_rate": 3.257428516913094e-06, "loss": 0.5356, "num_input_tokens_seen": 2725470592, "step": 3015, "train_runtime": 401915.8909, "train_tokens_per_second": 6781.196 }, { "epoch": 2.5619098143236076, "grad_norm": 1.0422726914250204, "learning_rate": 3.1967991723559186e-06, "loss": 0.5357, "num_input_tokens_seen": 2729976320, "step": 3020, "train_runtime": 402550.4344, "train_tokens_per_second": 6781.7 }, { "epoch": 2.566153846153846, "grad_norm": 1.0115755929526138, "learning_rate": 3.1367008171975606e-06, "loss": 0.5292, "num_input_tokens_seen": 2734356096, "step": 3025, "train_runtime": 403147.2983, "train_tokens_per_second": 6782.524 }, { "epoch": 2.570397877984085, "grad_norm": 0.9326348646064712, "learning_rate": 3.0771349150574833e-06, "loss": 0.5032, "num_input_tokens_seen": 2738757568, "step": 3030, "train_runtime": 403750.7771, "train_tokens_per_second": 6783.287 }, { "epoch": 2.5746419098143236, "grad_norm": 0.8962193098952964, "learning_rate": 3.0181029165879505e-06, "loss": 0.5235, "num_input_tokens_seen": 2743195520, "step": 3035, "train_runtime": 404385.6448, "train_tokens_per_second": 6783.613 }, { "epoch": 2.5788859416445624, "grad_norm": 0.9643001215470395, "learning_rate": 2.959606259438677e-06, "loss": 0.5371, "num_input_tokens_seen": 2747720128, "step": 3040, "train_runtime": 405024.1548, "train_tokens_per_second": 6784.09 }, { "epoch": 2.583129973474801, "grad_norm": 1.083989853159668, "learning_rate": 2.9016463682218137e-06, "loss": 0.5045, "num_input_tokens_seen": 2752185344, "step": 3045, "train_runtime": 405663.7384, "train_tokens_per_second": 6784.401 }, { "epoch": 2.5873740053050396, "grad_norm": 1.0350757402442816, "learning_rate": 2.844224654477251e-06, "loss": 0.535, "num_input_tokens_seen": 2756955712, "step": 3050, "train_runtime": 406343.9367, "train_tokens_per_second": 6784.784 }, { "epoch": 2.5916180371352784, "grad_norm": 1.0876688931917096, "learning_rate": 2.787342516638253e-06, "loss": 0.5241, "num_input_tokens_seen": 2761469632, "step": 3055, "train_runtime": 406991.7195, "train_tokens_per_second": 6785.076 }, { "epoch": 2.595862068965517, "grad_norm": 1.187734783928453, "learning_rate": 2.7310013399973937e-06, "loss": 0.5083, "num_input_tokens_seen": 2766078848, "step": 3060, "train_runtime": 407630.5997, "train_tokens_per_second": 6785.749 }, { "epoch": 2.600106100795756, "grad_norm": 0.8033965202229059, "learning_rate": 2.6752024966728186e-06, "loss": 0.5078, "num_input_tokens_seen": 2770419968, "step": 3065, "train_runtime": 408227.7584, "train_tokens_per_second": 6786.457 }, { "epoch": 2.604350132625995, "grad_norm": 1.0703320175428703, "learning_rate": 2.6199473455748302e-06, "loss": 0.523, "num_input_tokens_seen": 2774944704, "step": 3070, "train_runtime": 408874.2335, "train_tokens_per_second": 6786.793 }, { "epoch": 2.6085941644562336, "grad_norm": 0.8853137013274479, "learning_rate": 2.5652372323727995e-06, "loss": 0.5344, "num_input_tokens_seen": 2779491968, "step": 3075, "train_runtime": 409530.6069, "train_tokens_per_second": 6787.019 }, { "epoch": 2.6128381962864724, "grad_norm": 0.9523960118476597, "learning_rate": 2.5110734894623845e-06, "loss": 0.5238, "num_input_tokens_seen": 2784101184, "step": 3080, "train_runtime": 410217.1802, "train_tokens_per_second": 6786.896 }, { "epoch": 2.6170822281167108, "grad_norm": 0.9141894377610371, "learning_rate": 2.457457435933083e-06, "loss": 0.5072, "num_input_tokens_seen": 2788760320, "step": 3085, "train_runtime": 410902.2723, "train_tokens_per_second": 6786.919 }, { "epoch": 2.6213262599469496, "grad_norm": 0.9601107951690412, "learning_rate": 2.404390377536117e-06, "loss": 0.5461, "num_input_tokens_seen": 2793482240, "step": 3090, "train_runtime": 411591.1251, "train_tokens_per_second": 6787.032 }, { "epoch": 2.6255702917771884, "grad_norm": 1.092143809724588, "learning_rate": 2.3518736066526106e-06, "loss": 0.5355, "num_input_tokens_seen": 2798052800, "step": 3095, "train_runtime": 412253.9967, "train_tokens_per_second": 6787.206 }, { "epoch": 2.629814323607427, "grad_norm": 0.9775173060346775, "learning_rate": 2.2999084022621575e-06, "loss": 0.5187, "num_input_tokens_seen": 2802623616, "step": 3100, "train_runtime": 412972.0833, "train_tokens_per_second": 6786.472 }, { "epoch": 2.6340583554376655, "grad_norm": 0.8882149505754918, "learning_rate": 2.2484960299116176e-06, "loss": 0.5234, "num_input_tokens_seen": 2807059072, "step": 3105, "train_runtime": 413616.3561, "train_tokens_per_second": 6786.625 }, { "epoch": 2.6383023872679043, "grad_norm": 1.04737202246014, "learning_rate": 2.1976377416843496e-06, "loss": 0.5337, "num_input_tokens_seen": 2811685952, "step": 3110, "train_runtime": 414302.3818, "train_tokens_per_second": 6786.555 }, { "epoch": 2.642546419098143, "grad_norm": 0.8525650275605402, "learning_rate": 2.1473347761696765e-06, "loss": 0.5122, "num_input_tokens_seen": 2816097472, "step": 3115, "train_runtime": 414912.2037, "train_tokens_per_second": 6787.213 }, { "epoch": 2.646790450928382, "grad_norm": 1.4897884354340427, "learning_rate": 2.097588358432745e-06, "loss": 0.5344, "num_input_tokens_seen": 2820541952, "step": 3120, "train_runtime": 415551.6146, "train_tokens_per_second": 6787.465 }, { "epoch": 2.6510344827586207, "grad_norm": 0.9315543634592929, "learning_rate": 2.048399699984685e-06, "loss": 0.5204, "num_input_tokens_seen": 2825211008, "step": 3125, "train_runtime": 416236.0998, "train_tokens_per_second": 6787.52 }, { "epoch": 2.6552785145888596, "grad_norm": 0.994893210880857, "learning_rate": 1.999769998753101e-06, "loss": 0.5092, "num_input_tokens_seen": 2829805440, "step": 3130, "train_runtime": 416937.1267, "train_tokens_per_second": 6787.128 }, { "epoch": 2.6595225464190984, "grad_norm": 0.9211547478732683, "learning_rate": 1.951700439052906e-06, "loss": 0.509, "num_input_tokens_seen": 2834319168, "step": 3135, "train_runtime": 417580.9275, "train_tokens_per_second": 6787.473 }, { "epoch": 2.6637665782493367, "grad_norm": 1.1092045966595683, "learning_rate": 1.9041921915574718e-06, "loss": 0.5213, "num_input_tokens_seen": 2838842944, "step": 3140, "train_runtime": 418214.8375, "train_tokens_per_second": 6788.001 }, { "epoch": 2.6680106100795755, "grad_norm": 0.8680731962765761, "learning_rate": 1.857246413270114e-06, "loss": 0.5303, "num_input_tokens_seen": 2843287936, "step": 3145, "train_runtime": 418837.8662, "train_tokens_per_second": 6788.517 }, { "epoch": 2.6722546419098143, "grad_norm": 1.370277456535701, "learning_rate": 1.810864247495933e-06, "loss": 0.5351, "num_input_tokens_seen": 2847823872, "step": 3150, "train_runtime": 419460.3925, "train_tokens_per_second": 6789.256 }, { "epoch": 2.676498673740053, "grad_norm": 1.0792319656449618, "learning_rate": 1.7650468238139484e-06, "loss": 0.5114, "num_input_tokens_seen": 2852148544, "step": 3155, "train_runtime": 420051.6697, "train_tokens_per_second": 6789.995 }, { "epoch": 2.680742705570292, "grad_norm": 0.8781496188483874, "learning_rate": 1.7197952580496086e-06, "loss": 0.5126, "num_input_tokens_seen": 2856524608, "step": 3160, "train_runtime": 420644.3052, "train_tokens_per_second": 6790.832 }, { "epoch": 2.6849867374005303, "grad_norm": 0.9491071552727024, "learning_rate": 1.6751106522476078e-06, "loss": 0.5242, "num_input_tokens_seen": 2861096576, "step": 3165, "train_runtime": 421277.8845, "train_tokens_per_second": 6791.471 }, { "epoch": 2.689230769230769, "grad_norm": 0.9895886874283716, "learning_rate": 1.6309940946450276e-06, "loss": 0.5422, "num_input_tokens_seen": 2865604864, "step": 3170, "train_runtime": 421930.8634, "train_tokens_per_second": 6791.646 }, { "epoch": 2.693474801061008, "grad_norm": 0.9535636472034763, "learning_rate": 1.5874466596448894e-06, "loss": 0.5138, "num_input_tokens_seen": 2870295232, "step": 3175, "train_runtime": 422607.9976, "train_tokens_per_second": 6791.862 }, { "epoch": 2.6977188328912467, "grad_norm": 1.2371458441648222, "learning_rate": 1.5444694077899112e-06, "loss": 0.5143, "num_input_tokens_seen": 2874723904, "step": 3180, "train_runtime": 423241.8919, "train_tokens_per_second": 6792.154 }, { "epoch": 2.7019628647214855, "grad_norm": 1.0270055431553666, "learning_rate": 1.5020633857367629e-06, "loss": 0.5097, "num_input_tokens_seen": 2879036160, "step": 3185, "train_runtime": 423847.9941, "train_tokens_per_second": 6792.615 }, { "epoch": 2.7062068965517243, "grad_norm": 1.0047782227717184, "learning_rate": 1.4602296262304998e-06, "loss": 0.531, "num_input_tokens_seen": 2883503104, "step": 3190, "train_runtime": 424512.874, "train_tokens_per_second": 6792.499 }, { "epoch": 2.710450928381963, "grad_norm": 0.7991655241640006, "learning_rate": 1.4189691480794659e-06, "loss": 0.5031, "num_input_tokens_seen": 2888068736, "step": 3195, "train_runtime": 425152.1322, "train_tokens_per_second": 6793.024 }, { "epoch": 2.7146949602122015, "grad_norm": 0.9290447093133773, "learning_rate": 1.3782829561304528e-06, "loss": 0.5214, "num_input_tokens_seen": 2892383104, "step": 3200, "train_runtime": 425736.2555, "train_tokens_per_second": 6793.838 }, { "epoch": 2.7189389920424403, "grad_norm": 0.9336749097922185, "learning_rate": 1.3381720412442484e-06, "loss": 0.5046, "num_input_tokens_seen": 2896798592, "step": 3205, "train_runtime": 426372.4313, "train_tokens_per_second": 6794.057 }, { "epoch": 2.723183023872679, "grad_norm": 0.923464487407943, "learning_rate": 1.2986373802714806e-06, "loss": 0.519, "num_input_tokens_seen": 2901355840, "step": 3210, "train_runtime": 427007.4668, "train_tokens_per_second": 6794.626 }, { "epoch": 2.727427055702918, "grad_norm": 1.054195900551347, "learning_rate": 1.259679936028857e-06, "loss": 0.5278, "num_input_tokens_seen": 2906018048, "step": 3215, "train_runtime": 427698.5588, "train_tokens_per_second": 6794.547 }, { "epoch": 2.7316710875331562, "grad_norm": 1.053479012440482, "learning_rate": 1.2213006572756868e-06, "loss": 0.5052, "num_input_tokens_seen": 2910317120, "step": 3220, "train_runtime": 428289.4292, "train_tokens_per_second": 6795.211 }, { "epoch": 2.735915119363395, "grad_norm": 0.9664035844106731, "learning_rate": 1.1835004786907994e-06, "loss": 0.5285, "num_input_tokens_seen": 2914766912, "step": 3225, "train_runtime": 428926.8091, "train_tokens_per_second": 6795.488 }, { "epoch": 2.740159151193634, "grad_norm": 0.9743299232561019, "learning_rate": 1.1462803208497658e-06, "loss": 0.5349, "num_input_tokens_seen": 2919116864, "step": 3230, "train_runtime": 429579.5988, "train_tokens_per_second": 6795.287 }, { "epoch": 2.7444031830238726, "grad_norm": 0.9853416108002045, "learning_rate": 1.1096410902024874e-06, "loss": 0.5281, "num_input_tokens_seen": 2923624768, "step": 3235, "train_runtime": 430191.282, "train_tokens_per_second": 6796.104 }, { "epoch": 2.7486472148541115, "grad_norm": 0.9605567975250936, "learning_rate": 1.073583679051124e-06, "loss": 0.5301, "num_input_tokens_seen": 2928050752, "step": 3240, "train_runtime": 430824.3873, "train_tokens_per_second": 6796.39 }, { "epoch": 2.7528912466843503, "grad_norm": 1.1024226803397792, "learning_rate": 1.0381089655283394e-06, "loss": 0.5316, "num_input_tokens_seen": 2932468416, "step": 3245, "train_runtime": 431459.9194, "train_tokens_per_second": 6796.618 }, { "epoch": 2.7528912466843503, "eval_loss": 0.7258533835411072, "eval_runtime": 1061.3977, "eval_samples_per_second": 2.871, "eval_steps_per_second": 0.09, "num_input_tokens_seen": 2932468416, "step": 3245 }, { "epoch": 2.757135278514589, "grad_norm": 1.0283547426763588, "learning_rate": 1.0032178135759546e-06, "loss": 0.5119, "num_input_tokens_seen": 2936775872, "step": 3250, "train_runtime": 433099.9457, "train_tokens_per_second": 6780.827 }, { "epoch": 2.7613793103448274, "grad_norm": 0.9455593371411998, "learning_rate": 9.68911072923867e-07, "loss": 0.5204, "num_input_tokens_seen": 2941314240, "step": 3255, "train_runtime": 433763.0738, "train_tokens_per_second": 6780.924 }, { "epoch": 2.7656233421750662, "grad_norm": 0.9779382933428667, "learning_rate": 9.351895790693955e-07, "loss": 0.5273, "num_input_tokens_seen": 2945907136, "step": 3260, "train_runtime": 434418.6226, "train_tokens_per_second": 6781.263 }, { "epoch": 2.769867374005305, "grad_norm": 0.9433322444237364, "learning_rate": 9.020541532568899e-07, "loss": 0.5289, "num_input_tokens_seen": 2950538496, "step": 3265, "train_runtime": 435105.6078, "train_tokens_per_second": 6781.201 }, { "epoch": 2.774111405835544, "grad_norm": 1.0359211392265961, "learning_rate": 8.695056024577792e-07, "loss": 0.5082, "num_input_tokens_seen": 2955022208, "step": 3270, "train_runtime": 435708.6215, "train_tokens_per_second": 6782.106 }, { "epoch": 2.7783554376657826, "grad_norm": 0.8927269780868947, "learning_rate": 8.375447193508662e-07, "loss": 0.5196, "num_input_tokens_seen": 2959619008, "step": 3275, "train_runtime": 436404.7763, "train_tokens_per_second": 6781.821 }, { "epoch": 2.782599469496021, "grad_norm": 0.994250431687146, "learning_rate": 8.061722823030693e-07, "loss": 0.5294, "num_input_tokens_seen": 2964249856, "step": 3280, "train_runtime": 437073.9242, "train_tokens_per_second": 6782.033 }, { "epoch": 2.78684350132626, "grad_norm": 0.8514551639091007, "learning_rate": 7.753890553504422e-07, "loss": 0.5073, "num_input_tokens_seen": 2968889344, "step": 3285, "train_runtime": 437728.8681, "train_tokens_per_second": 6782.485 }, { "epoch": 2.7910875331564986, "grad_norm": 1.0389333757881603, "learning_rate": 7.451957881795673e-07, "loss": 0.532, "num_input_tokens_seen": 2973153152, "step": 3290, "train_runtime": 438317.3334, "train_tokens_per_second": 6783.106 }, { "epoch": 2.7953315649867374, "grad_norm": 0.8808663623494958, "learning_rate": 7.155932161093032e-07, "loss": 0.5312, "num_input_tokens_seen": 2977666176, "step": 3295, "train_runtime": 438968.9598, "train_tokens_per_second": 6783.318 }, { "epoch": 2.799575596816976, "grad_norm": 1.0049671673931104, "learning_rate": 6.865820600728823e-07, "loss": 0.5352, "num_input_tokens_seen": 2982187392, "step": 3300, "train_runtime": 439614.2835, "train_tokens_per_second": 6783.645 }, { "epoch": 2.803819628647215, "grad_norm": 0.9855264400649804, "learning_rate": 6.581630266003419e-07, "loss": 0.5523, "num_input_tokens_seen": 2986898304, "step": 3305, "train_runtime": 440321.2266, "train_tokens_per_second": 6783.453 }, { "epoch": 2.808063660477454, "grad_norm": 1.0813234011741917, "learning_rate": 6.303368078013183e-07, "loss": 0.5116, "num_input_tokens_seen": 2991295104, "step": 3310, "train_runtime": 440944.6485, "train_tokens_per_second": 6783.834 }, { "epoch": 2.812307692307692, "grad_norm": 0.9341722231468776, "learning_rate": 6.031040813482047e-07, "loss": 0.5215, "num_input_tokens_seen": 2995983808, "step": 3315, "train_runtime": 441625.9103, "train_tokens_per_second": 6783.986 }, { "epoch": 2.816551724137931, "grad_norm": 0.8962570756580445, "learning_rate": 5.764655104596311e-07, "loss": 0.536, "num_input_tokens_seen": 3000598080, "step": 3320, "train_runtime": 442297.9755, "train_tokens_per_second": 6784.11 }, { "epoch": 2.82079575596817, "grad_norm": 0.884157880198946, "learning_rate": 5.504217438843301e-07, "loss": 0.5187, "num_input_tokens_seen": 3005166144, "step": 3325, "train_runtime": 442917.9433, "train_tokens_per_second": 6784.928 }, { "epoch": 2.8250397877984086, "grad_norm": 0.9880705840554078, "learning_rate": 5.249734158853048e-07, "loss": 0.5367, "num_input_tokens_seen": 3009451904, "step": 3330, "train_runtime": 443509.8078, "train_tokens_per_second": 6785.536 }, { "epoch": 2.829283819628647, "grad_norm": 0.9577753896346771, "learning_rate": 5.001211462244359e-07, "loss": 0.5214, "num_input_tokens_seen": 3013832512, "step": 3335, "train_runtime": 444142.2629, "train_tokens_per_second": 6785.737 }, { "epoch": 2.8335278514588857, "grad_norm": 0.9814158353022203, "learning_rate": 4.758655401473272e-07, "loss": 0.5255, "num_input_tokens_seen": 3018354112, "step": 3340, "train_runtime": 444781.1211, "train_tokens_per_second": 6786.156 }, { "epoch": 2.8377718832891246, "grad_norm": 1.1181096501664909, "learning_rate": 4.522071883686141e-07, "loss": 0.5053, "num_input_tokens_seen": 3022790720, "step": 3345, "train_runtime": 445390.6168, "train_tokens_per_second": 6786.831 }, { "epoch": 2.8420159151193634, "grad_norm": 0.9098207875260499, "learning_rate": 4.291466670575506e-07, "loss": 0.5205, "num_input_tokens_seen": 3027432384, "step": 3350, "train_runtime": 446049.4125, "train_tokens_per_second": 6787.213 }, { "epoch": 2.846259946949602, "grad_norm": 0.8045024632334659, "learning_rate": 4.0668453782398696e-07, "loss": 0.5009, "num_input_tokens_seen": 3032032768, "step": 3355, "train_runtime": 446687.1742, "train_tokens_per_second": 6787.821 }, { "epoch": 2.850503978779841, "grad_norm": 0.9168040586174651, "learning_rate": 3.848213477046919e-07, "loss": 0.5227, "num_input_tokens_seen": 3036567680, "step": 3360, "train_runtime": 447330.9014, "train_tokens_per_second": 6788.191 }, { "epoch": 2.8547480106100798, "grad_norm": 1.1113058183093447, "learning_rate": 3.6355762915002143e-07, "loss": 0.5462, "num_input_tokens_seen": 3041310976, "step": 3365, "train_runtime": 448024.7841, "train_tokens_per_second": 6788.265 }, { "epoch": 2.8589920424403186, "grad_norm": 1.1615276846957676, "learning_rate": 3.4289390001097377e-07, "loss": 0.5032, "num_input_tokens_seen": 3045742528, "step": 3370, "train_runtime": 448635.5559, "train_tokens_per_second": 6788.901 }, { "epoch": 2.863236074270557, "grad_norm": 1.022077622432263, "learning_rate": 3.2283066352654936e-07, "loss": 0.5328, "num_input_tokens_seen": 3050306496, "step": 3375, "train_runtime": 449289.2558, "train_tokens_per_second": 6789.182 }, { "epoch": 2.8674801061007957, "grad_norm": 0.9272311129173908, "learning_rate": 3.0336840831151626e-07, "loss": 0.5273, "num_input_tokens_seen": 3054867520, "step": 3380, "train_runtime": 449945.3409, "train_tokens_per_second": 6789.419 }, { "epoch": 2.8717241379310345, "grad_norm": 0.9436512733023248, "learning_rate": 2.8450760834450307e-07, "loss": 0.5194, "num_input_tokens_seen": 3059487552, "step": 3385, "train_runtime": 450608.7914, "train_tokens_per_second": 6789.676 }, { "epoch": 2.8759681697612733, "grad_norm": 0.9504790766629004, "learning_rate": 2.662487229564525e-07, "loss": 0.5342, "num_input_tokens_seen": 3064172992, "step": 3390, "train_runtime": 451284.6962, "train_tokens_per_second": 6789.889 }, { "epoch": 2.8802122015915117, "grad_norm": 0.9713790898947682, "learning_rate": 2.485921968194416e-07, "loss": 0.534, "num_input_tokens_seen": 3068583616, "step": 3395, "train_runtime": 451906.6499, "train_tokens_per_second": 6790.304 }, { "epoch": 2.8844562334217505, "grad_norm": 1.0394097264342836, "learning_rate": 2.3153845993584834e-07, "loss": 0.516, "num_input_tokens_seen": 3073191360, "step": 3400, "train_runtime": 452591.3839, "train_tokens_per_second": 6790.212 }, { "epoch": 2.8887002652519893, "grad_norm": 0.9630302970032499, "learning_rate": 2.1508792762787723e-07, "loss": 0.5191, "num_input_tokens_seen": 3077554560, "step": 3405, "train_runtime": 453214.0954, "train_tokens_per_second": 6790.509 }, { "epoch": 2.892944297082228, "grad_norm": 1.0072599246388074, "learning_rate": 1.9924100052745586e-07, "loss": 0.5081, "num_input_tokens_seen": 3082016768, "step": 3410, "train_runtime": 453848.0797, "train_tokens_per_second": 6790.856 }, { "epoch": 2.897188328912467, "grad_norm": 0.8435644898142695, "learning_rate": 1.8399806456645963e-07, "loss": 0.5184, "num_input_tokens_seen": 3086770176, "step": 3415, "train_runtime": 454559.6162, "train_tokens_per_second": 6790.683 }, { "epoch": 2.9014323607427057, "grad_norm": 1.0235162502387214, "learning_rate": 1.6935949096733016e-07, "loss": 0.534, "num_input_tokens_seen": 3091125184, "step": 3420, "train_runtime": 455183.2572, "train_tokens_per_second": 6790.947 }, { "epoch": 2.9056763925729445, "grad_norm": 0.9662120266545192, "learning_rate": 1.5532563623402718e-07, "loss": 0.526, "num_input_tokens_seen": 3095550336, "step": 3425, "train_runtime": 455803.201, "train_tokens_per_second": 6791.419 }, { "epoch": 2.909920424403183, "grad_norm": 0.8700940267517274, "learning_rate": 1.4189684214334087e-07, "loss": 0.5214, "num_input_tokens_seen": 3100072256, "step": 3430, "train_runtime": 456479.466, "train_tokens_per_second": 6791.263 }, { "epoch": 2.9141644562334217, "grad_norm": 1.0663457112375756, "learning_rate": 1.2907343573658194e-07, "loss": 0.5457, "num_input_tokens_seen": 3104605376, "step": 3435, "train_runtime": 457154.9293, "train_tokens_per_second": 6791.145 }, { "epoch": 2.9184084880636605, "grad_norm": 0.8678572276073582, "learning_rate": 1.1685572931160737e-07, "loss": 0.5259, "num_input_tokens_seen": 3109137920, "step": 3440, "train_runtime": 457837.7574, "train_tokens_per_second": 6790.916 }, { "epoch": 2.9226525198938993, "grad_norm": 1.015827014706817, "learning_rate": 1.0524402041520997e-07, "loss": 0.5011, "num_input_tokens_seen": 3113606784, "step": 3445, "train_runtime": 458466.3165, "train_tokens_per_second": 6791.353 }, { "epoch": 2.926896551724138, "grad_norm": 1.006376986197294, "learning_rate": 9.42385918358879e-08, "loss": 0.5323, "num_input_tokens_seen": 3118176512, "step": 3450, "train_runtime": 459172.3619, "train_tokens_per_second": 6790.863 }, { "epoch": 2.9311405835543765, "grad_norm": 0.9168280974190645, "learning_rate": 8.383971159694193e-08, "loss": 0.5284, "num_input_tokens_seen": 3122635072, "step": 3455, "train_runtime": 459796.4487, "train_tokens_per_second": 6791.342 }, { "epoch": 2.9353846153846153, "grad_norm": 0.8691560318885593, "learning_rate": 7.404763294995565e-08, "loss": 0.5378, "num_input_tokens_seen": 3127211136, "step": 3460, "train_runtime": 460443.0378, "train_tokens_per_second": 6791.744 }, { "epoch": 2.939628647214854, "grad_norm": 0.8391826158196111, "learning_rate": 6.486259436863373e-08, "loss": 0.5057, "num_input_tokens_seen": 3131755648, "step": 3465, "train_runtime": 461119.9271, "train_tokens_per_second": 6791.629 }, { "epoch": 2.943872679045093, "grad_norm": 0.9266785431840717, "learning_rate": 5.628481954297604e-08, "loss": 0.5041, "num_input_tokens_seen": 3136402176, "step": 3470, "train_runtime": 461787.0454, "train_tokens_per_second": 6791.88 }, { "epoch": 2.9481167108753317, "grad_norm": 0.9181454594188112, "learning_rate": 4.83145173738514e-08, "loss": 0.5146, "num_input_tokens_seen": 3140924544, "step": 3475, "train_runtime": 462405.4235, "train_tokens_per_second": 6792.577 }, { "epoch": 2.9523607427055705, "grad_norm": 0.8349784460725406, "learning_rate": 4.095188196789057e-08, "loss": 0.5158, "num_input_tokens_seen": 3145415872, "step": 3480, "train_runtime": 463062.3059, "train_tokens_per_second": 6792.641 }, { "epoch": 2.9566047745358093, "grad_norm": 0.9323692378640641, "learning_rate": 3.419709263277893e-08, "loss": 0.5251, "num_input_tokens_seen": 3149924224, "step": 3485, "train_runtime": 463701.0926, "train_tokens_per_second": 6793.006 }, { "epoch": 2.9608488063660476, "grad_norm": 0.8451251639267238, "learning_rate": 2.8050313872868273e-08, "loss": 0.5253, "num_input_tokens_seen": 3154558400, "step": 3490, "train_runtime": 464407.8316, "train_tokens_per_second": 6792.647 }, { "epoch": 2.9650928381962864, "grad_norm": 1.0163495151903155, "learning_rate": 2.251169538518838e-08, "loss": 0.5481, "num_input_tokens_seen": 3159207360, "step": 3495, "train_runtime": 465073.4609, "train_tokens_per_second": 6792.921 }, { "epoch": 2.9693368700265252, "grad_norm": 1.0129744457478043, "learning_rate": 1.758137205579158e-08, "loss": 0.5135, "num_input_tokens_seen": 3163694016, "step": 3500, "train_runtime": 465699.5834, "train_tokens_per_second": 6793.422 }, { "epoch": 2.973580901856764, "grad_norm": 1.000739655978131, "learning_rate": 1.3259463956469265e-08, "loss": 0.5001, "num_input_tokens_seen": 3168337088, "step": 3505, "train_runtime": 466401.7551, "train_tokens_per_second": 6793.15 }, { "epoch": 2.9778249336870024, "grad_norm": 0.9167977963170866, "learning_rate": 9.546076341834798e-09, "loss": 0.5125, "num_input_tokens_seen": 3172760384, "step": 3510, "train_runtime": 467024.7423, "train_tokens_per_second": 6793.56 }, { "epoch": 2.982068965517241, "grad_norm": 0.9752251889997392, "learning_rate": 6.441299646750554e-09, "loss": 0.5214, "num_input_tokens_seen": 3177405824, "step": 3515, "train_runtime": 467725.4913, "train_tokens_per_second": 6793.313 }, { "epoch": 2.98631299734748, "grad_norm": 0.9307610509549541, "learning_rate": 3.945209484124135e-09, "loss": 0.5254, "num_input_tokens_seen": 3182049024, "step": 3520, "train_runtime": 468383.2029, "train_tokens_per_second": 6793.687 }, { "epoch": 2.990557029177719, "grad_norm": 0.9957718574851812, "learning_rate": 2.0578666430765e-09, "loss": 0.5124, "num_input_tokens_seen": 3186550272, "step": 3525, "train_runtime": 469031.712, "train_tokens_per_second": 6793.891 }, { "epoch": 2.9948010610079576, "grad_norm": 1.1631587185707446, "learning_rate": 7.793170874625943e-10, "loss": 0.5197, "num_input_tokens_seen": 3191082304, "step": 3530, "train_runtime": 469677.0947, "train_tokens_per_second": 6794.205 }, { "epoch": 2.9990450928381964, "grad_norm": 0.9116914677984228, "learning_rate": 1.0959195473614348e-10, "loss": 0.5392, "num_input_tokens_seen": 3195636736, "step": 3535, "train_runtime": 470373.7704, "train_tokens_per_second": 6793.824 } ], "logging_steps": 5, "max_steps": 3537, "num_input_tokens_seen": 3196694976, "num_train_epochs": 3, "save_steps": 1180, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.62813488136192e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }