{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9999958772917105, "eval_steps": 1000, "global_step": 121279, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 0.00024811801548585953, "loss": 8.2906, "step": 100 }, { "epoch": 0.0, "learning_rate": 0.00029403430324938403, "loss": 5.1339, "step": 200 }, { "epoch": 0.0, "learning_rate": 0.00029980426002857634, "loss": 4.2501, "step": 300 }, { "epoch": 0.0, "learning_rate": 0.00029955648791285023, "loss": 3.94, "step": 400 }, { "epoch": 0.0, "learning_rate": 0.0002993087157971242, "loss": 3.7765, "step": 500 }, { "epoch": 0.0, "learning_rate": 0.0002990609436813981, "loss": 3.6756, "step": 600 }, { "epoch": 0.01, "learning_rate": 0.000298813171565672, "loss": 3.6106, "step": 700 }, { "epoch": 0.01, "learning_rate": 0.00029856539944994587, "loss": 3.5704, "step": 800 }, { "epoch": 0.01, "learning_rate": 0.00029831762733421977, "loss": 3.5304, "step": 900 }, { "epoch": 0.01, "learning_rate": 0.00029806985521849366, "loss": 3.5018, "step": 1000 }, { "epoch": 0.01, "eval_accuracy": 0.3767958311915216, "eval_loss": 3.509509325027466, "eval_runtime": 36.7301, "eval_samples_per_second": 309.447, "eval_steps_per_second": 2.586, "step": 1000 }, { "epoch": 0.01, "learning_rate": 0.0002978220831027676, "loss": 3.4716, "step": 1100 }, { "epoch": 0.01, "learning_rate": 0.00029757678870819877, "loss": 3.4476, "step": 1200 }, { "epoch": 0.01, "learning_rate": 0.00029732901659247266, "loss": 3.4368, "step": 1300 }, { "epoch": 0.01, "learning_rate": 0.00029708124447674656, "loss": 3.4075, "step": 1400 }, { "epoch": 0.01, "learning_rate": 0.00029683347236102045, "loss": 3.3945, "step": 1500 }, { "epoch": 0.01, "learning_rate": 0.00029658570024529435, "loss": 3.3788, "step": 1600 }, { "epoch": 0.01, "learning_rate": 0.00029633792812956824, "loss": 3.3738, "step": 1700 }, { "epoch": 0.01, "learning_rate": 0.0002960901560138422, "loss": 3.3585, "step": 1800 }, { "epoch": 0.02, "learning_rate": 0.0002958423838981161, "loss": 3.3479, "step": 1900 }, { "epoch": 0.02, "learning_rate": 0.00029559461178239, "loss": 3.3375, "step": 2000 }, { "epoch": 0.02, "eval_accuracy": 0.391758262312935, "eval_loss": 3.3602521419525146, "eval_runtime": 36.8476, "eval_samples_per_second": 308.46, "eval_steps_per_second": 2.578, "step": 2000 }, { "epoch": 0.02, "learning_rate": 0.0002953468396666639, "loss": 3.3351, "step": 2100 }, { "epoch": 0.02, "learning_rate": 0.00029509906755093777, "loss": 3.3283, "step": 2200 }, { "epoch": 0.02, "learning_rate": 0.0002948512954352117, "loss": 3.3185, "step": 2300 }, { "epoch": 0.02, "learning_rate": 0.0002946035233194856, "loss": 3.309, "step": 2400 }, { "epoch": 0.02, "learning_rate": 0.0002943557512037595, "loss": 3.3015, "step": 2500 }, { "epoch": 0.02, "learning_rate": 0.0002941079790880334, "loss": 3.3015, "step": 2600 }, { "epoch": 0.02, "learning_rate": 0.0002938602069723073, "loss": 3.2913, "step": 2700 }, { "epoch": 0.02, "learning_rate": 0.0002936124348565812, "loss": 3.2871, "step": 2800 }, { "epoch": 0.02, "learning_rate": 0.00029336466274085515, "loss": 3.2761, "step": 2900 }, { "epoch": 0.02, "learning_rate": 0.0002931193683462863, "loss": 3.2694, "step": 3000 }, { "epoch": 0.02, "eval_accuracy": 0.39822664917193284, "eval_loss": 3.2995240688323975, "eval_runtime": 36.3126, "eval_samples_per_second": 313.004, "eval_steps_per_second": 2.616, "step": 3000 }, { "epoch": 0.03, "learning_rate": 0.0002928715962305602, "loss": 3.2757, "step": 3100 }, { "epoch": 0.03, "learning_rate": 0.0002926238241148341, "loss": 3.2597, "step": 3200 }, { "epoch": 0.03, "learning_rate": 0.000292376051999108, "loss": 3.2603, "step": 3300 }, { "epoch": 0.03, "learning_rate": 0.0002921282798833819, "loss": 3.2499, "step": 3400 }, { "epoch": 0.03, "learning_rate": 0.0002918805077676558, "loss": 3.2522, "step": 3500 }, { "epoch": 0.03, "learning_rate": 0.00029163273565192973, "loss": 3.2511, "step": 3600 }, { "epoch": 0.03, "learning_rate": 0.0002913849635362036, "loss": 3.2399, "step": 3700 }, { "epoch": 0.03, "learning_rate": 0.0002911371914204775, "loss": 3.2459, "step": 3800 }, { "epoch": 0.03, "learning_rate": 0.0002908894193047514, "loss": 3.2364, "step": 3900 }, { "epoch": 0.03, "learning_rate": 0.0002906416471890253, "loss": 3.236, "step": 4000 }, { "epoch": 0.03, "eval_accuracy": 0.4022114581269458, "eval_loss": 3.262552261352539, "eval_runtime": 36.6585, "eval_samples_per_second": 310.051, "eval_steps_per_second": 2.591, "step": 4000 }, { "epoch": 0.03, "learning_rate": 0.0002903938750732992, "loss": 3.2373, "step": 4100 }, { "epoch": 0.03, "learning_rate": 0.00029014610295757315, "loss": 3.2318, "step": 4200 }, { "epoch": 0.04, "learning_rate": 0.00028989833084184705, "loss": 3.2394, "step": 4300 }, { "epoch": 0.04, "learning_rate": 0.00028965055872612095, "loss": 3.2198, "step": 4400 }, { "epoch": 0.04, "learning_rate": 0.00028940278661039484, "loss": 3.2214, "step": 4500 }, { "epoch": 0.04, "learning_rate": 0.000289157492215826, "loss": 3.2195, "step": 4600 }, { "epoch": 0.04, "learning_rate": 0.0002889097201000999, "loss": 3.22, "step": 4700 }, { "epoch": 0.04, "learning_rate": 0.0002886619479843738, "loss": 3.2135, "step": 4800 }, { "epoch": 0.04, "learning_rate": 0.00028841417586864774, "loss": 3.2163, "step": 4900 }, { "epoch": 0.04, "learning_rate": 0.00028816640375292163, "loss": 3.2131, "step": 5000 }, { "epoch": 0.04, "eval_accuracy": 0.40479479480866404, "eval_loss": 3.2350122928619385, "eval_runtime": 36.6665, "eval_samples_per_second": 309.983, "eval_steps_per_second": 2.591, "step": 5000 }, { "epoch": 0.04, "learning_rate": 0.0002879186316371955, "loss": 3.2066, "step": 5100 }, { "epoch": 0.04, "learning_rate": 0.0002876708595214694, "loss": 3.2067, "step": 5200 }, { "epoch": 0.04, "learning_rate": 0.0002874230874057433, "loss": 3.2011, "step": 5300 }, { "epoch": 0.04, "learning_rate": 0.00028717531529001727, "loss": 3.1956, "step": 5400 }, { "epoch": 0.05, "learning_rate": 0.00028692754317429116, "loss": 3.1997, "step": 5500 }, { "epoch": 0.05, "learning_rate": 0.00028667977105856506, "loss": 3.2032, "step": 5600 }, { "epoch": 0.05, "learning_rate": 0.00028643199894283895, "loss": 3.1976, "step": 5700 }, { "epoch": 0.05, "learning_rate": 0.00028618422682711285, "loss": 3.1987, "step": 5800 }, { "epoch": 0.05, "learning_rate": 0.00028593645471138674, "loss": 3.1932, "step": 5900 }, { "epoch": 0.05, "learning_rate": 0.0002856886825956607, "loss": 3.1934, "step": 6000 }, { "epoch": 0.05, "eval_accuracy": 0.4072977702527638, "eval_loss": 3.21467661857605, "eval_runtime": 35.9987, "eval_samples_per_second": 315.734, "eval_steps_per_second": 2.639, "step": 6000 }, { "epoch": 0.05, "learning_rate": 0.00028544091047993453, "loss": 3.1823, "step": 6100 }, { "epoch": 0.05, "learning_rate": 0.00028519313836420843, "loss": 3.1893, "step": 6200 }, { "epoch": 0.05, "learning_rate": 0.0002849453662484824, "loss": 3.1883, "step": 6300 }, { "epoch": 0.05, "learning_rate": 0.0002846975941327563, "loss": 3.1824, "step": 6400 }, { "epoch": 0.05, "learning_rate": 0.00028444982201703017, "loss": 3.1806, "step": 6500 }, { "epoch": 0.05, "learning_rate": 0.00028420204990130406, "loss": 3.178, "step": 6600 }, { "epoch": 0.06, "learning_rate": 0.00028395427778557796, "loss": 3.1819, "step": 6700 }, { "epoch": 0.06, "learning_rate": 0.00028370898339100917, "loss": 3.1905, "step": 6800 }, { "epoch": 0.06, "learning_rate": 0.00028346121127528306, "loss": 3.1775, "step": 6900 }, { "epoch": 0.06, "learning_rate": 0.00028321343915955696, "loss": 3.177, "step": 7000 }, { "epoch": 0.06, "eval_accuracy": 0.40887866977212045, "eval_loss": 3.2006027698516846, "eval_runtime": 36.3445, "eval_samples_per_second": 312.73, "eval_steps_per_second": 2.614, "step": 7000 }, { "epoch": 0.06, "learning_rate": 0.00028296566704383085, "loss": 3.1713, "step": 7100 }, { "epoch": 0.06, "learning_rate": 0.0002827178949281048, "loss": 3.1711, "step": 7200 }, { "epoch": 0.06, "learning_rate": 0.0002824701228123787, "loss": 3.1705, "step": 7300 }, { "epoch": 0.06, "learning_rate": 0.00028222235069665254, "loss": 3.1755, "step": 7400 }, { "epoch": 0.06, "learning_rate": 0.0002819745785809265, "loss": 3.1673, "step": 7500 }, { "epoch": 0.06, "learning_rate": 0.0002817268064652004, "loss": 3.1703, "step": 7600 }, { "epoch": 0.06, "learning_rate": 0.0002814790343494743, "loss": 3.1608, "step": 7700 }, { "epoch": 0.06, "learning_rate": 0.0002812312622337482, "loss": 3.1637, "step": 7800 }, { "epoch": 0.07, "learning_rate": 0.00028098349011802207, "loss": 3.1726, "step": 7900 }, { "epoch": 0.07, "learning_rate": 0.00028073571800229597, "loss": 3.1653, "step": 8000 }, { "epoch": 0.07, "eval_accuracy": 0.4097224627671145, "eval_loss": 3.1890077590942383, "eval_runtime": 36.3132, "eval_samples_per_second": 312.999, "eval_steps_per_second": 2.616, "step": 8000 }, { "epoch": 0.07, "learning_rate": 0.0002804879458865699, "loss": 3.1585, "step": 8100 }, { "epoch": 0.07, "learning_rate": 0.0002802401737708438, "loss": 3.1608, "step": 8200 }, { "epoch": 0.07, "learning_rate": 0.0002799924016551177, "loss": 3.1606, "step": 8300 }, { "epoch": 0.07, "learning_rate": 0.0002797446295393916, "loss": 3.1605, "step": 8400 }, { "epoch": 0.07, "learning_rate": 0.0002794968574236655, "loss": 3.1588, "step": 8500 }, { "epoch": 0.07, "learning_rate": 0.00027924908530793945, "loss": 3.1691, "step": 8600 }, { "epoch": 0.07, "learning_rate": 0.00027900131319221334, "loss": 3.1556, "step": 8700 }, { "epoch": 0.07, "learning_rate": 0.00027875354107648724, "loss": 3.1505, "step": 8800 }, { "epoch": 0.07, "learning_rate": 0.0002785082466819184, "loss": 3.162, "step": 8900 }, { "epoch": 0.07, "learning_rate": 0.0002782604745661923, "loss": 3.1548, "step": 9000 }, { "epoch": 0.07, "eval_accuracy": 0.4109264957928121, "eval_loss": 3.1779375076293945, "eval_runtime": 36.2677, "eval_samples_per_second": 313.392, "eval_steps_per_second": 2.619, "step": 9000 }, { "epoch": 0.08, "learning_rate": 0.0002780127024504662, "loss": 3.1461, "step": 9100 }, { "epoch": 0.08, "learning_rate": 0.0002777649303347401, "loss": 3.1508, "step": 9200 }, { "epoch": 0.08, "learning_rate": 0.00027751715821901403, "loss": 3.1521, "step": 9300 }, { "epoch": 0.08, "learning_rate": 0.0002772718638244452, "loss": 3.1486, "step": 9400 }, { "epoch": 0.08, "learning_rate": 0.0002770240917087191, "loss": 3.1457, "step": 9500 }, { "epoch": 0.08, "learning_rate": 0.000276776319592993, "loss": 3.1577, "step": 9600 }, { "epoch": 0.08, "learning_rate": 0.00027652854747726687, "loss": 3.1528, "step": 9700 }, { "epoch": 0.08, "learning_rate": 0.0002762807753615408, "loss": 3.1404, "step": 9800 }, { "epoch": 0.08, "learning_rate": 0.0002760330032458147, "loss": 3.145, "step": 9900 }, { "epoch": 0.08, "learning_rate": 0.00027578523113008855, "loss": 3.152, "step": 10000 }, { "epoch": 0.08, "eval_accuracy": 0.4119192341851163, "eval_loss": 3.169926404953003, "eval_runtime": 36.3986, "eval_samples_per_second": 312.265, "eval_steps_per_second": 2.61, "step": 10000 }, { "epoch": 0.08, "learning_rate": 0.0002755374590143625, "loss": 3.1468, "step": 10100 }, { "epoch": 0.08, "learning_rate": 0.0002752896868986364, "loss": 3.1396, "step": 10200 }, { "epoch": 0.08, "learning_rate": 0.0002750419147829103, "loss": 3.1375, "step": 10300 }, { "epoch": 0.09, "learning_rate": 0.0002747941426671842, "loss": 3.1463, "step": 10400 }, { "epoch": 0.09, "learning_rate": 0.0002745463705514581, "loss": 3.1433, "step": 10500 }, { "epoch": 0.09, "learning_rate": 0.00027429859843573203, "loss": 3.145, "step": 10600 }, { "epoch": 0.09, "learning_rate": 0.00027405082632000593, "loss": 3.1393, "step": 10700 }, { "epoch": 0.09, "learning_rate": 0.0002738055319254371, "loss": 3.1389, "step": 10800 }, { "epoch": 0.09, "learning_rate": 0.000273557759809711, "loss": 3.1366, "step": 10900 }, { "epoch": 0.09, "learning_rate": 0.00027330998769398493, "loss": 3.1416, "step": 11000 }, { "epoch": 0.09, "eval_accuracy": 0.41301242412444356, "eval_loss": 3.1622183322906494, "eval_runtime": 36.132, "eval_samples_per_second": 314.569, "eval_steps_per_second": 2.629, "step": 11000 }, { "epoch": 0.09, "learning_rate": 0.0002730622155782588, "loss": 3.1402, "step": 11100 }, { "epoch": 0.09, "learning_rate": 0.0002728144434625327, "loss": 3.1331, "step": 11200 }, { "epoch": 0.09, "learning_rate": 0.0002725666713468066, "loss": 3.1401, "step": 11300 }, { "epoch": 0.09, "learning_rate": 0.0002723188992310805, "loss": 3.1359, "step": 11400 }, { "epoch": 0.09, "learning_rate": 0.0002720711271153544, "loss": 3.1345, "step": 11500 }, { "epoch": 0.1, "learning_rate": 0.0002718233549996283, "loss": 3.1287, "step": 11600 }, { "epoch": 0.1, "learning_rate": 0.0002715755828839022, "loss": 3.1352, "step": 11700 }, { "epoch": 0.1, "learning_rate": 0.0002713278107681761, "loss": 3.1246, "step": 11800 }, { "epoch": 0.1, "learning_rate": 0.00027108003865245004, "loss": 3.1382, "step": 11900 }, { "epoch": 0.1, "learning_rate": 0.00027083226653672394, "loss": 3.1387, "step": 12000 }, { "epoch": 0.1, "eval_accuracy": 0.4138569098887274, "eval_loss": 3.156071662902832, "eval_runtime": 37.0052, "eval_samples_per_second": 307.146, "eval_steps_per_second": 2.567, "step": 12000 }, { "epoch": 0.1, "learning_rate": 0.00027058449442099783, "loss": 3.1329, "step": 12100 }, { "epoch": 0.1, "learning_rate": 0.000270339200026429, "loss": 3.1254, "step": 12200 }, { "epoch": 0.1, "learning_rate": 0.00027009142791070294, "loss": 3.1315, "step": 12300 }, { "epoch": 0.1, "learning_rate": 0.00026984365579497683, "loss": 3.1257, "step": 12400 }, { "epoch": 0.1, "learning_rate": 0.00026959588367925073, "loss": 3.1171, "step": 12500 }, { "epoch": 0.1, "learning_rate": 0.0002693481115635246, "loss": 3.128, "step": 12600 }, { "epoch": 0.1, "learning_rate": 0.0002691003394477985, "loss": 3.1234, "step": 12700 }, { "epoch": 0.11, "learning_rate": 0.0002688525673320724, "loss": 3.1354, "step": 12800 }, { "epoch": 0.11, "learning_rate": 0.0002686047952163463, "loss": 3.1286, "step": 12900 }, { "epoch": 0.11, "learning_rate": 0.0002683570231006202, "loss": 3.1192, "step": 13000 }, { "epoch": 0.11, "eval_accuracy": 0.4142968183877596, "eval_loss": 3.149528741836548, "eval_runtime": 36.1498, "eval_samples_per_second": 314.414, "eval_steps_per_second": 2.628, "step": 13000 }, { "epoch": 0.11, "learning_rate": 0.00026810925098489415, "loss": 3.1155, "step": 13100 }, { "epoch": 0.11, "learning_rate": 0.00026786147886916805, "loss": 3.1268, "step": 13200 }, { "epoch": 0.11, "learning_rate": 0.00026761370675344194, "loss": 3.1216, "step": 13300 }, { "epoch": 0.11, "learning_rate": 0.00026736593463771584, "loss": 3.1262, "step": 13400 }, { "epoch": 0.11, "learning_rate": 0.000267120640243147, "loss": 3.1215, "step": 13500 }, { "epoch": 0.11, "learning_rate": 0.00026687286812742094, "loss": 3.1259, "step": 13600 }, { "epoch": 0.11, "learning_rate": 0.00026662509601169484, "loss": 3.1206, "step": 13700 }, { "epoch": 0.11, "learning_rate": 0.00026637732389596874, "loss": 3.1161, "step": 13800 }, { "epoch": 0.11, "learning_rate": 0.00026612955178024263, "loss": 3.1127, "step": 13900 }, { "epoch": 0.12, "learning_rate": 0.0002658817796645165, "loss": 3.1221, "step": 14000 }, { "epoch": 0.12, "eval_accuracy": 0.4149895876775741, "eval_loss": 3.1430952548980713, "eval_runtime": 36.2908, "eval_samples_per_second": 313.192, "eval_steps_per_second": 2.618, "step": 14000 }, { "epoch": 0.12, "learning_rate": 0.0002656340075487905, "loss": 3.1211, "step": 14100 }, { "epoch": 0.12, "learning_rate": 0.0002653862354330643, "loss": 3.1159, "step": 14200 }, { "epoch": 0.12, "learning_rate": 0.0002651384633173382, "loss": 3.1211, "step": 14300 }, { "epoch": 0.12, "learning_rate": 0.00026489069120161216, "loss": 3.1168, "step": 14400 }, { "epoch": 0.12, "learning_rate": 0.00026464291908588606, "loss": 3.1166, "step": 14500 }, { "epoch": 0.12, "learning_rate": 0.00026439514697015995, "loss": 3.1168, "step": 14600 }, { "epoch": 0.12, "learning_rate": 0.0002641498525755911, "loss": 3.1165, "step": 14700 }, { "epoch": 0.12, "learning_rate": 0.00026390208045986506, "loss": 3.1115, "step": 14800 }, { "epoch": 0.12, "learning_rate": 0.00026365430834413895, "loss": 3.1124, "step": 14900 }, { "epoch": 0.12, "learning_rate": 0.00026340653622841285, "loss": 3.1136, "step": 15000 }, { "epoch": 0.12, "eval_accuracy": 0.4154509720245905, "eval_loss": 3.139134168624878, "eval_runtime": 36.9033, "eval_samples_per_second": 307.994, "eval_steps_per_second": 2.574, "step": 15000 }, { "epoch": 0.12, "learning_rate": 0.00026315876411268674, "loss": 3.1096, "step": 15100 }, { "epoch": 0.13, "learning_rate": 0.00026291099199696064, "loss": 3.1066, "step": 15200 }, { "epoch": 0.13, "learning_rate": 0.00026266321988123453, "loss": 3.1172, "step": 15300 }, { "epoch": 0.13, "learning_rate": 0.0002624154477655085, "loss": 3.1209, "step": 15400 }, { "epoch": 0.13, "learning_rate": 0.0002621676756497823, "loss": 3.1057, "step": 15500 }, { "epoch": 0.13, "learning_rate": 0.0002619199035340562, "loss": 3.1105, "step": 15600 }, { "epoch": 0.13, "learning_rate": 0.00026167213141833017, "loss": 3.1094, "step": 15700 }, { "epoch": 0.13, "learning_rate": 0.0002614268370237613, "loss": 3.1113, "step": 15800 }, { "epoch": 0.13, "learning_rate": 0.0002611790649080352, "loss": 3.1052, "step": 15900 }, { "epoch": 0.13, "learning_rate": 0.0002609312927923091, "loss": 3.106, "step": 16000 }, { "epoch": 0.13, "eval_accuracy": 0.41599687422496434, "eval_loss": 3.1348280906677246, "eval_runtime": 36.2431, "eval_samples_per_second": 313.605, "eval_steps_per_second": 2.621, "step": 16000 }, { "epoch": 0.13, "learning_rate": 0.00026068599839774027, "loss": 3.1111, "step": 16100 }, { "epoch": 0.13, "learning_rate": 0.00026043822628201417, "loss": 3.1061, "step": 16200 }, { "epoch": 0.13, "learning_rate": 0.0002601904541662881, "loss": 3.1147, "step": 16300 }, { "epoch": 0.14, "learning_rate": 0.000259942682050562, "loss": 3.1053, "step": 16400 }, { "epoch": 0.14, "learning_rate": 0.0002596949099348359, "loss": 3.1002, "step": 16500 }, { "epoch": 0.14, "learning_rate": 0.0002594471378191098, "loss": 3.113, "step": 16600 }, { "epoch": 0.14, "learning_rate": 0.0002591993657033837, "loss": 3.1075, "step": 16700 }, { "epoch": 0.14, "learning_rate": 0.00025895159358765765, "loss": 3.109, "step": 16800 }, { "epoch": 0.14, "learning_rate": 0.00025870382147193154, "loss": 3.0998, "step": 16900 }, { "epoch": 0.14, "learning_rate": 0.00025845604935620544, "loss": 3.1023, "step": 17000 }, { "epoch": 0.14, "eval_accuracy": 0.416494975344341, "eval_loss": 3.1311593055725098, "eval_runtime": 36.4415, "eval_samples_per_second": 311.897, "eval_steps_per_second": 2.607, "step": 17000 }, { "epoch": 0.14, "learning_rate": 0.00025820827724047933, "loss": 3.1055, "step": 17100 }, { "epoch": 0.14, "learning_rate": 0.0002579605051247532, "loss": 3.1019, "step": 17200 }, { "epoch": 0.14, "learning_rate": 0.0002577127330090271, "loss": 3.1029, "step": 17300 }, { "epoch": 0.14, "learning_rate": 0.00025746496089330107, "loss": 3.1028, "step": 17400 }, { "epoch": 0.14, "learning_rate": 0.00025721718877757497, "loss": 3.1048, "step": 17500 }, { "epoch": 0.15, "learning_rate": 0.00025696941666184886, "loss": 3.1033, "step": 17600 }, { "epoch": 0.15, "learning_rate": 0.00025672164454612276, "loss": 3.109, "step": 17700 }, { "epoch": 0.15, "learning_rate": 0.00025647387243039665, "loss": 3.0972, "step": 17800 }, { "epoch": 0.15, "learning_rate": 0.0002562261003146706, "loss": 3.1006, "step": 17900 }, { "epoch": 0.15, "learning_rate": 0.0002559783281989445, "loss": 3.1062, "step": 18000 }, { "epoch": 0.15, "eval_accuracy": 0.4171718109404897, "eval_loss": 3.1264827251434326, "eval_runtime": 37.3996, "eval_samples_per_second": 303.907, "eval_steps_per_second": 2.54, "step": 18000 }, { "epoch": 0.15, "learning_rate": 0.00025573055608321834, "loss": 3.1048, "step": 18100 }, { "epoch": 0.15, "learning_rate": 0.0002554827839674923, "loss": 3.0967, "step": 18200 }, { "epoch": 0.15, "learning_rate": 0.00025523748957292344, "loss": 3.0937, "step": 18300 }, { "epoch": 0.15, "learning_rate": 0.00025498971745719734, "loss": 3.1036, "step": 18400 }, { "epoch": 0.15, "learning_rate": 0.00025474194534147123, "loss": 3.0981, "step": 18500 }, { "epoch": 0.15, "learning_rate": 0.0002544941732257452, "loss": 3.1013, "step": 18600 }, { "epoch": 0.15, "learning_rate": 0.0002542464011100191, "loss": 3.0977, "step": 18700 }, { "epoch": 0.16, "learning_rate": 0.000253998628994293, "loss": 3.0961, "step": 18800 }, { "epoch": 0.16, "learning_rate": 0.00025375085687856687, "loss": 3.0993, "step": 18900 }, { "epoch": 0.16, "learning_rate": 0.00025350308476284076, "loss": 3.1007, "step": 19000 }, { "epoch": 0.16, "eval_accuracy": 0.4176775325220543, "eval_loss": 3.1230428218841553, "eval_runtime": 36.1092, "eval_samples_per_second": 314.767, "eval_steps_per_second": 2.631, "step": 19000 }, { "epoch": 0.16, "learning_rate": 0.00025325531264711466, "loss": 3.0957, "step": 19100 }, { "epoch": 0.16, "learning_rate": 0.0002530075405313886, "loss": 3.099, "step": 19200 }, { "epoch": 0.16, "learning_rate": 0.0002527597684156625, "loss": 3.1044, "step": 19300 }, { "epoch": 0.16, "learning_rate": 0.00025251199629993635, "loss": 3.0985, "step": 19400 }, { "epoch": 0.16, "learning_rate": 0.0002522642241842103, "loss": 3.1026, "step": 19500 }, { "epoch": 0.16, "learning_rate": 0.00025201892978964145, "loss": 3.0926, "step": 19600 }, { "epoch": 0.16, "learning_rate": 0.00025177115767391535, "loss": 3.0932, "step": 19700 }, { "epoch": 0.16, "learning_rate": 0.00025152338555818924, "loss": 3.0912, "step": 19800 }, { "epoch": 0.16, "learning_rate": 0.0002512756134424632, "loss": 3.0914, "step": 19900 }, { "epoch": 0.16, "learning_rate": 0.0002510278413267371, "loss": 3.0979, "step": 20000 }, { "epoch": 0.16, "eval_accuracy": 0.41779807437848204, "eval_loss": 3.1200578212738037, "eval_runtime": 36.2694, "eval_samples_per_second": 313.377, "eval_steps_per_second": 2.619, "step": 20000 }, { "epoch": 0.17, "learning_rate": 0.000250780069211011, "loss": 3.088, "step": 20100 }, { "epoch": 0.17, "learning_rate": 0.0002505322970952849, "loss": 3.0936, "step": 20200 }, { "epoch": 0.17, "learning_rate": 0.00025028452497955877, "loss": 3.0931, "step": 20300 }, { "epoch": 0.17, "learning_rate": 0.0002500392305849899, "loss": 3.09, "step": 20400 }, { "epoch": 0.17, "learning_rate": 0.0002497914584692638, "loss": 3.0961, "step": 20500 }, { "epoch": 0.17, "learning_rate": 0.00024954368635353777, "loss": 3.0979, "step": 20600 }, { "epoch": 0.17, "learning_rate": 0.00024929591423781167, "loss": 3.0899, "step": 20700 }, { "epoch": 0.17, "learning_rate": 0.00024904814212208556, "loss": 3.0919, "step": 20800 }, { "epoch": 0.17, "learning_rate": 0.00024880037000635946, "loss": 3.0944, "step": 20900 }, { "epoch": 0.17, "learning_rate": 0.00024855259789063335, "loss": 3.0897, "step": 21000 }, { "epoch": 0.17, "eval_accuracy": 0.4178625019224348, "eval_loss": 3.1168224811553955, "eval_runtime": 36.6661, "eval_samples_per_second": 309.987, "eval_steps_per_second": 2.591, "step": 21000 }, { "epoch": 0.17, "learning_rate": 0.00024830730349606456, "loss": 3.091, "step": 21100 }, { "epoch": 0.17, "learning_rate": 0.0002480595313803384, "loss": 3.0925, "step": 21200 }, { "epoch": 0.18, "learning_rate": 0.00024781175926461235, "loss": 3.0861, "step": 21300 }, { "epoch": 0.18, "learning_rate": 0.00024756398714888625, "loss": 3.0845, "step": 21400 }, { "epoch": 0.18, "learning_rate": 0.00024731621503316014, "loss": 3.0944, "step": 21500 }, { "epoch": 0.18, "learning_rate": 0.00024706844291743404, "loss": 3.083, "step": 21600 }, { "epoch": 0.18, "learning_rate": 0.00024682067080170793, "loss": 3.0803, "step": 21700 }, { "epoch": 0.18, "learning_rate": 0.00024657289868598183, "loss": 3.0899, "step": 21800 }, { "epoch": 0.18, "learning_rate": 0.0002463251265702558, "loss": 3.0872, "step": 21900 }, { "epoch": 0.18, "learning_rate": 0.0002460773544545297, "loss": 3.0863, "step": 22000 }, { "epoch": 0.18, "eval_accuracy": 0.4188947281642584, "eval_loss": 3.1127541065216064, "eval_runtime": 36.4488, "eval_samples_per_second": 311.835, "eval_steps_per_second": 2.606, "step": 22000 }, { "epoch": 0.18, "learning_rate": 0.00024582958233880357, "loss": 3.0979, "step": 22100 }, { "epoch": 0.18, "learning_rate": 0.00024558181022307746, "loss": 3.0893, "step": 22200 }, { "epoch": 0.18, "learning_rate": 0.00024533403810735136, "loss": 3.0898, "step": 22300 }, { "epoch": 0.18, "learning_rate": 0.0002450862659916253, "loss": 3.0875, "step": 22400 }, { "epoch": 0.19, "learning_rate": 0.0002448384938758992, "loss": 3.0822, "step": 22500 }, { "epoch": 0.19, "learning_rate": 0.0002445907217601731, "loss": 3.08, "step": 22600 }, { "epoch": 0.19, "learning_rate": 0.000244342949644447, "loss": 3.0835, "step": 22700 }, { "epoch": 0.19, "learning_rate": 0.0002440951775287209, "loss": 3.0913, "step": 22800 }, { "epoch": 0.19, "learning_rate": 0.00024384740541299479, "loss": 3.0822, "step": 22900 }, { "epoch": 0.19, "learning_rate": 0.0002435996332972687, "loss": 3.0898, "step": 23000 }, { "epoch": 0.19, "eval_accuracy": 0.4191060227976518, "eval_loss": 3.1097447872161865, "eval_runtime": 36.3549, "eval_samples_per_second": 312.64, "eval_steps_per_second": 2.613, "step": 23000 }, { "epoch": 0.19, "learning_rate": 0.0002433518611815426, "loss": 3.076, "step": 23100 }, { "epoch": 0.19, "learning_rate": 0.0002431040890658165, "loss": 3.0892, "step": 23200 }, { "epoch": 0.19, "learning_rate": 0.00024285631695009042, "loss": 3.0842, "step": 23300 }, { "epoch": 0.19, "learning_rate": 0.00024260854483436432, "loss": 3.0857, "step": 23400 }, { "epoch": 0.19, "learning_rate": 0.0002423607727186382, "loss": 3.0845, "step": 23500 }, { "epoch": 0.19, "learning_rate": 0.00024211300060291213, "loss": 3.0933, "step": 23600 }, { "epoch": 0.2, "learning_rate": 0.00024186522848718603, "loss": 3.0855, "step": 23700 }, { "epoch": 0.2, "learning_rate": 0.00024161745637145995, "loss": 3.0843, "step": 23800 }, { "epoch": 0.2, "learning_rate": 0.00024136968425573385, "loss": 3.097, "step": 23900 }, { "epoch": 0.2, "learning_rate": 0.00024112191214000774, "loss": 3.0825, "step": 24000 }, { "epoch": 0.2, "eval_accuracy": 0.41910394448978233, "eval_loss": 3.107358694076538, "eval_runtime": 36.6977, "eval_samples_per_second": 309.72, "eval_steps_per_second": 2.589, "step": 24000 }, { "epoch": 0.2, "learning_rate": 0.00024087414002428166, "loss": 3.0926, "step": 24100 }, { "epoch": 0.2, "learning_rate": 0.00024062636790855556, "loss": 3.079, "step": 24200 }, { "epoch": 0.2, "learning_rate": 0.00024037859579282945, "loss": 3.0831, "step": 24300 }, { "epoch": 0.2, "learning_rate": 0.0002401333013982606, "loss": 3.0865, "step": 24400 }, { "epoch": 0.2, "learning_rate": 0.00023988552928253453, "loss": 3.0748, "step": 24500 }, { "epoch": 0.2, "learning_rate": 0.00023963775716680843, "loss": 3.073, "step": 24600 }, { "epoch": 0.2, "learning_rate": 0.00023938998505108232, "loss": 3.0822, "step": 24700 }, { "epoch": 0.2, "learning_rate": 0.00023914221293535625, "loss": 3.0817, "step": 24800 }, { "epoch": 0.21, "learning_rate": 0.00023889444081963014, "loss": 3.0803, "step": 24900 }, { "epoch": 0.21, "learning_rate": 0.00023864666870390404, "loss": 3.0808, "step": 25000 }, { "epoch": 0.21, "eval_accuracy": 0.41998445425713654, "eval_loss": 3.103720188140869, "eval_runtime": 36.4784, "eval_samples_per_second": 311.582, "eval_steps_per_second": 2.604, "step": 25000 }, { "epoch": 0.21, "learning_rate": 0.00023839889658817796, "loss": 3.0778, "step": 25100 }, { "epoch": 0.21, "learning_rate": 0.00023815112447245185, "loss": 3.0755, "step": 25200 }, { "epoch": 0.21, "learning_rate": 0.00023790335235672575, "loss": 3.0817, "step": 25300 }, { "epoch": 0.21, "learning_rate": 0.00023765558024099967, "loss": 3.0759, "step": 25400 }, { "epoch": 0.21, "learning_rate": 0.00023740780812527357, "loss": 3.0813, "step": 25500 }, { "epoch": 0.21, "learning_rate": 0.00023716003600954746, "loss": 3.08, "step": 25600 }, { "epoch": 0.21, "learning_rate": 0.00023691226389382138, "loss": 3.0746, "step": 25700 }, { "epoch": 0.21, "learning_rate": 0.00023666449177809528, "loss": 3.0786, "step": 25800 }, { "epoch": 0.21, "learning_rate": 0.00023641919738352644, "loss": 3.0857, "step": 25900 }, { "epoch": 0.21, "learning_rate": 0.00023617142526780033, "loss": 3.0774, "step": 26000 }, { "epoch": 0.21, "eval_accuracy": 0.4197142742341089, "eval_loss": 3.1032252311706543, "eval_runtime": 36.4807, "eval_samples_per_second": 311.562, "eval_steps_per_second": 2.604, "step": 26000 }, { "epoch": 0.22, "learning_rate": 0.00023592365315207425, "loss": 3.0776, "step": 26100 }, { "epoch": 0.22, "learning_rate": 0.00023567588103634815, "loss": 3.0806, "step": 26200 }, { "epoch": 0.22, "learning_rate": 0.00023542810892062204, "loss": 3.0768, "step": 26300 }, { "epoch": 0.22, "learning_rate": 0.00023518033680489597, "loss": 3.0733, "step": 26400 }, { "epoch": 0.22, "learning_rate": 0.00023493256468916986, "loss": 3.0822, "step": 26500 }, { "epoch": 0.22, "learning_rate": 0.00023468479257344378, "loss": 3.0774, "step": 26600 }, { "epoch": 0.22, "learning_rate": 0.00023443702045771768, "loss": 3.0763, "step": 26700 }, { "epoch": 0.22, "learning_rate": 0.00023418924834199157, "loss": 3.0773, "step": 26800 }, { "epoch": 0.22, "learning_rate": 0.0002339414762262655, "loss": 3.0774, "step": 26900 }, { "epoch": 0.22, "learning_rate": 0.0002336937041105394, "loss": 3.0652, "step": 27000 }, { "epoch": 0.22, "eval_accuracy": 0.42021445366135496, "eval_loss": 3.098003387451172, "eval_runtime": 37.7336, "eval_samples_per_second": 301.217, "eval_steps_per_second": 2.518, "step": 27000 }, { "epoch": 0.22, "learning_rate": 0.0002334459319948133, "loss": 3.0767, "step": 27100 }, { "epoch": 0.22, "learning_rate": 0.0002331981598790872, "loss": 3.0743, "step": 27200 }, { "epoch": 0.23, "learning_rate": 0.00023295038776336108, "loss": 3.0693, "step": 27300 }, { "epoch": 0.23, "learning_rate": 0.00023270509336879226, "loss": 3.0829, "step": 27400 }, { "epoch": 0.23, "learning_rate": 0.00023245732125306616, "loss": 3.069, "step": 27500 }, { "epoch": 0.23, "learning_rate": 0.0002322120268584973, "loss": 3.0764, "step": 27600 }, { "epoch": 0.23, "learning_rate": 0.0002319642547427712, "loss": 3.0722, "step": 27700 }, { "epoch": 0.23, "learning_rate": 0.00023171648262704513, "loss": 3.0703, "step": 27800 }, { "epoch": 0.23, "learning_rate": 0.00023146871051131902, "loss": 3.0752, "step": 27900 }, { "epoch": 0.23, "learning_rate": 0.00023122093839559292, "loss": 3.0693, "step": 28000 }, { "epoch": 0.23, "eval_accuracy": 0.4207465004759325, "eval_loss": 3.096764087677002, "eval_runtime": 36.5764, "eval_samples_per_second": 310.747, "eval_steps_per_second": 2.597, "step": 28000 }, { "epoch": 0.23, "learning_rate": 0.00023097316627986684, "loss": 3.0629, "step": 28100 }, { "epoch": 0.23, "learning_rate": 0.00023072539416414074, "loss": 3.0749, "step": 28200 }, { "epoch": 0.23, "learning_rate": 0.00023047762204841466, "loss": 3.0658, "step": 28300 }, { "epoch": 0.23, "learning_rate": 0.00023022984993268855, "loss": 3.0806, "step": 28400 }, { "epoch": 0.23, "learning_rate": 0.00022998207781696245, "loss": 3.0677, "step": 28500 }, { "epoch": 0.24, "learning_rate": 0.00022973430570123637, "loss": 3.0779, "step": 28600 }, { "epoch": 0.24, "learning_rate": 0.00022948653358551027, "loss": 3.0714, "step": 28700 }, { "epoch": 0.24, "learning_rate": 0.00022923876146978416, "loss": 3.0707, "step": 28800 }, { "epoch": 0.24, "learning_rate": 0.00022899098935405808, "loss": 3.0757, "step": 28900 }, { "epoch": 0.24, "learning_rate": 0.00022874321723833198, "loss": 3.0665, "step": 29000 }, { "epoch": 0.24, "eval_accuracy": 0.4209293915684435, "eval_loss": 3.0943939685821533, "eval_runtime": 36.661, "eval_samples_per_second": 310.03, "eval_steps_per_second": 2.591, "step": 29000 }, { "epoch": 0.24, "learning_rate": 0.00022849544512260588, "loss": 3.0677, "step": 29100 }, { "epoch": 0.24, "learning_rate": 0.0002282476730068798, "loss": 3.0662, "step": 29200 }, { "epoch": 0.24, "learning_rate": 0.0002279999008911537, "loss": 3.0649, "step": 29300 }, { "epoch": 0.24, "learning_rate": 0.0002277521287754276, "loss": 3.067, "step": 29400 }, { "epoch": 0.24, "learning_rate": 0.0002275043566597015, "loss": 3.0686, "step": 29500 }, { "epoch": 0.24, "learning_rate": 0.0002272565845439754, "loss": 3.0726, "step": 29600 }, { "epoch": 0.24, "learning_rate": 0.00022700881242824933, "loss": 3.0639, "step": 29700 }, { "epoch": 0.25, "learning_rate": 0.00022676104031252322, "loss": 3.0679, "step": 29800 }, { "epoch": 0.25, "learning_rate": 0.0002265132681967971, "loss": 3.0778, "step": 29900 }, { "epoch": 0.25, "learning_rate": 0.00022626549608107104, "loss": 3.0657, "step": 30000 }, { "epoch": 0.25, "eval_accuracy": 0.4210423129626833, "eval_loss": 3.09199595451355, "eval_runtime": 36.5589, "eval_samples_per_second": 310.896, "eval_steps_per_second": 2.599, "step": 30000 }, { "epoch": 0.25, "learning_rate": 0.0002260177239653449, "loss": 3.0664, "step": 30100 }, { "epoch": 0.25, "learning_rate": 0.0002257699518496188, "loss": 3.0657, "step": 30200 }, { "epoch": 0.25, "learning_rate": 0.00022552217973389273, "loss": 3.0674, "step": 30300 }, { "epoch": 0.25, "learning_rate": 0.00022527440761816662, "loss": 3.0611, "step": 30400 }, { "epoch": 0.25, "learning_rate": 0.00022502663550244052, "loss": 3.0694, "step": 30500 }, { "epoch": 0.25, "learning_rate": 0.00022477886338671444, "loss": 3.0563, "step": 30600 }, { "epoch": 0.25, "learning_rate": 0.00022453109127098833, "loss": 3.0611, "step": 30700 }, { "epoch": 0.25, "learning_rate": 0.00022428331915526223, "loss": 3.0647, "step": 30800 }, { "epoch": 0.25, "learning_rate": 0.00022403554703953615, "loss": 3.0645, "step": 30900 }, { "epoch": 0.26, "learning_rate": 0.00022378777492381005, "loss": 3.0608, "step": 31000 }, { "epoch": 0.26, "eval_accuracy": 0.4213305049872461, "eval_loss": 3.0911319255828857, "eval_runtime": 36.982, "eval_samples_per_second": 307.339, "eval_steps_per_second": 2.569, "step": 31000 }, { "epoch": 0.26, "learning_rate": 0.00022354248052924123, "loss": 3.0683, "step": 31100 }, { "epoch": 0.26, "learning_rate": 0.0002232947084135151, "loss": 3.0719, "step": 31200 }, { "epoch": 0.26, "learning_rate": 0.00022304693629778905, "loss": 3.0614, "step": 31300 }, { "epoch": 0.26, "learning_rate": 0.0002228016419032202, "loss": 3.0655, "step": 31400 }, { "epoch": 0.26, "learning_rate": 0.0002225538697874941, "loss": 3.0682, "step": 31500 }, { "epoch": 0.26, "learning_rate": 0.000222306097671768, "loss": 3.0588, "step": 31600 }, { "epoch": 0.26, "learning_rate": 0.00022205832555604192, "loss": 3.0709, "step": 31700 }, { "epoch": 0.26, "learning_rate": 0.0002218105534403158, "loss": 3.0687, "step": 31800 }, { "epoch": 0.26, "learning_rate": 0.0002215627813245897, "loss": 3.0679, "step": 31900 }, { "epoch": 0.26, "learning_rate": 0.00022131500920886363, "loss": 3.0647, "step": 32000 }, { "epoch": 0.26, "eval_accuracy": 0.42134713145020164, "eval_loss": 3.089580774307251, "eval_runtime": 36.6102, "eval_samples_per_second": 310.46, "eval_steps_per_second": 2.595, "step": 32000 }, { "epoch": 0.26, "learning_rate": 0.00022106723709313752, "loss": 3.054, "step": 32100 }, { "epoch": 0.27, "learning_rate": 0.00022081946497741142, "loss": 3.0674, "step": 32200 }, { "epoch": 0.27, "learning_rate": 0.00022057169286168534, "loss": 3.0678, "step": 32300 }, { "epoch": 0.27, "learning_rate": 0.00022032392074595924, "loss": 3.06, "step": 32400 }, { "epoch": 0.27, "learning_rate": 0.0002200786263513904, "loss": 3.0647, "step": 32500 }, { "epoch": 0.27, "learning_rate": 0.0002198308542356643, "loss": 3.0619, "step": 32600 }, { "epoch": 0.27, "learning_rate": 0.0002195830821199382, "loss": 3.0655, "step": 32700 }, { "epoch": 0.27, "learning_rate": 0.0002193353100042121, "loss": 3.066, "step": 32800 }, { "epoch": 0.27, "learning_rate": 0.000219087537888486, "loss": 3.0615, "step": 32900 }, { "epoch": 0.27, "learning_rate": 0.00021883976577275992, "loss": 3.0604, "step": 33000 }, { "epoch": 0.27, "eval_accuracy": 0.4216789679400228, "eval_loss": 3.0860743522644043, "eval_runtime": 36.6297, "eval_samples_per_second": 310.295, "eval_steps_per_second": 2.594, "step": 33000 }, { "epoch": 0.27, "learning_rate": 0.00021859199365703382, "loss": 3.0556, "step": 33100 }, { "epoch": 0.27, "learning_rate": 0.00021834422154130771, "loss": 3.0664, "step": 33200 }, { "epoch": 0.27, "learning_rate": 0.00021809644942558164, "loss": 3.0585, "step": 33300 }, { "epoch": 0.28, "learning_rate": 0.00021784867730985553, "loss": 3.0628, "step": 33400 }, { "epoch": 0.28, "learning_rate": 0.00021760090519412945, "loss": 3.0653, "step": 33500 }, { "epoch": 0.28, "learning_rate": 0.00021735313307840335, "loss": 3.0616, "step": 33600 }, { "epoch": 0.28, "learning_rate": 0.00021710536096267724, "loss": 3.0661, "step": 33700 }, { "epoch": 0.28, "learning_rate": 0.00021685758884695117, "loss": 3.0638, "step": 33800 }, { "epoch": 0.28, "learning_rate": 0.00021660981673122506, "loss": 3.0547, "step": 33900 }, { "epoch": 0.28, "learning_rate": 0.00021636204461549893, "loss": 3.0577, "step": 34000 }, { "epoch": 0.28, "eval_accuracy": 0.422070382588768, "eval_loss": 3.084482431411743, "eval_runtime": 36.4437, "eval_samples_per_second": 311.879, "eval_steps_per_second": 2.607, "step": 34000 }, { "epoch": 0.28, "learning_rate": 0.00021611427249977288, "loss": 3.0627, "step": 34100 }, { "epoch": 0.28, "learning_rate": 0.00021586650038404675, "loss": 3.0604, "step": 34200 }, { "epoch": 0.28, "learning_rate": 0.00021561872826832064, "loss": 3.061, "step": 34300 }, { "epoch": 0.28, "learning_rate": 0.00021537095615259457, "loss": 3.0572, "step": 34400 }, { "epoch": 0.28, "learning_rate": 0.00021512566175802575, "loss": 3.057, "step": 34500 }, { "epoch": 0.29, "learning_rate": 0.00021487788964229964, "loss": 3.0622, "step": 34600 }, { "epoch": 0.29, "learning_rate": 0.00021463011752657354, "loss": 3.0586, "step": 34700 }, { "epoch": 0.29, "learning_rate": 0.00021438234541084746, "loss": 3.0567, "step": 34800 }, { "epoch": 0.29, "learning_rate": 0.00021413457329512136, "loss": 3.0617, "step": 34900 }, { "epoch": 0.29, "learning_rate": 0.00021388680117939525, "loss": 3.0606, "step": 35000 }, { "epoch": 0.29, "eval_accuracy": 0.4220225815077708, "eval_loss": 3.081350088119507, "eval_runtime": 39.7683, "eval_samples_per_second": 285.806, "eval_steps_per_second": 2.389, "step": 35000 }, { "epoch": 0.29, "learning_rate": 0.00021363902906366917, "loss": 3.0641, "step": 35100 }, { "epoch": 0.29, "learning_rate": 0.00021339125694794307, "loss": 3.0528, "step": 35200 }, { "epoch": 0.29, "learning_rate": 0.00021314348483221694, "loss": 3.0602, "step": 35300 }, { "epoch": 0.29, "learning_rate": 0.0002128957127164909, "loss": 3.0572, "step": 35400 }, { "epoch": 0.29, "learning_rate": 0.00021265041832192204, "loss": 3.0549, "step": 35500 }, { "epoch": 0.29, "learning_rate": 0.00021240512392735317, "loss": 3.0561, "step": 35600 }, { "epoch": 0.29, "learning_rate": 0.00021215735181162712, "loss": 3.0499, "step": 35700 }, { "epoch": 0.3, "learning_rate": 0.000211909579695901, "loss": 3.0517, "step": 35800 }, { "epoch": 0.3, "learning_rate": 0.0002116618075801749, "loss": 3.0606, "step": 35900 }, { "epoch": 0.3, "learning_rate": 0.0002114140354644488, "loss": 3.0515, "step": 36000 }, { "epoch": 0.3, "eval_accuracy": 0.4227111941818464, "eval_loss": 3.080108642578125, "eval_runtime": 36.5107, "eval_samples_per_second": 311.306, "eval_steps_per_second": 2.602, "step": 36000 }, { "epoch": 0.3, "learning_rate": 0.0002111662633487227, "loss": 3.0534, "step": 36100 }, { "epoch": 0.3, "learning_rate": 0.00021091849123299662, "loss": 3.0623, "step": 36200 }, { "epoch": 0.3, "learning_rate": 0.00021067071911727052, "loss": 3.0529, "step": 36300 }, { "epoch": 0.3, "learning_rate": 0.00021042294700154441, "loss": 3.0513, "step": 36400 }, { "epoch": 0.3, "learning_rate": 0.00021017517488581834, "loss": 3.0526, "step": 36500 }, { "epoch": 0.3, "learning_rate": 0.00020992740277009223, "loss": 3.0523, "step": 36600 }, { "epoch": 0.3, "learning_rate": 0.00020967963065436613, "loss": 3.052, "step": 36700 }, { "epoch": 0.3, "learning_rate": 0.0002094343362597973, "loss": 3.0554, "step": 36800 }, { "epoch": 0.3, "learning_rate": 0.00020918656414407123, "loss": 3.0506, "step": 36900 }, { "epoch": 0.31, "learning_rate": 0.0002089387920283451, "loss": 3.0527, "step": 37000 }, { "epoch": 0.31, "eval_accuracy": 0.42249435739413443, "eval_loss": 3.0771751403808594, "eval_runtime": 36.5791, "eval_samples_per_second": 310.724, "eval_steps_per_second": 2.597, "step": 37000 }, { "epoch": 0.31, "learning_rate": 0.000208691019912619, "loss": 3.0438, "step": 37100 }, { "epoch": 0.31, "learning_rate": 0.00020844324779689292, "loss": 3.0513, "step": 37200 }, { "epoch": 0.31, "learning_rate": 0.00020819547568116681, "loss": 3.0513, "step": 37300 }, { "epoch": 0.31, "learning_rate": 0.0002079477035654407, "loss": 3.0597, "step": 37400 }, { "epoch": 0.31, "learning_rate": 0.00020769993144971463, "loss": 3.0448, "step": 37500 }, { "epoch": 0.31, "learning_rate": 0.00020745215933398853, "loss": 3.049, "step": 37600 }, { "epoch": 0.31, "learning_rate": 0.00020720438721826242, "loss": 3.0482, "step": 37700 }, { "epoch": 0.31, "learning_rate": 0.00020695661510253634, "loss": 3.0529, "step": 37800 }, { "epoch": 0.31, "learning_rate": 0.00020670884298681024, "loss": 3.0511, "step": 37900 }, { "epoch": 0.31, "learning_rate": 0.00020646354859224142, "loss": 3.0507, "step": 38000 }, { "epoch": 0.31, "eval_accuracy": 0.4227541458778149, "eval_loss": 3.075801134109497, "eval_runtime": 36.6131, "eval_samples_per_second": 310.436, "eval_steps_per_second": 2.595, "step": 38000 }, { "epoch": 0.31, "learning_rate": 0.0002062157764765153, "loss": 3.0569, "step": 38100 }, { "epoch": 0.31, "learning_rate": 0.00020596800436078924, "loss": 3.0587, "step": 38200 }, { "epoch": 0.32, "learning_rate": 0.0002057202322450631, "loss": 3.0528, "step": 38300 }, { "epoch": 0.32, "learning_rate": 0.000205472460129337, "loss": 3.0474, "step": 38400 }, { "epoch": 0.32, "learning_rate": 0.00020522468801361093, "loss": 3.0521, "step": 38500 }, { "epoch": 0.32, "learning_rate": 0.00020497691589788482, "loss": 3.0479, "step": 38600 }, { "epoch": 0.32, "learning_rate": 0.00020472914378215872, "loss": 3.0475, "step": 38700 }, { "epoch": 0.32, "learning_rate": 0.00020448137166643264, "loss": 3.0441, "step": 38800 }, { "epoch": 0.32, "learning_rate": 0.00020423359955070653, "loss": 3.0514, "step": 38900 }, { "epoch": 0.32, "learning_rate": 0.00020398582743498046, "loss": 3.0433, "step": 39000 }, { "epoch": 0.32, "eval_accuracy": 0.4233707105457498, "eval_loss": 3.0738978385925293, "eval_runtime": 36.3934, "eval_samples_per_second": 312.31, "eval_steps_per_second": 2.61, "step": 39000 }, { "epoch": 0.32, "learning_rate": 0.00020373805531925435, "loss": 3.0446, "step": 39100 }, { "epoch": 0.32, "learning_rate": 0.00020349028320352825, "loss": 3.055, "step": 39200 }, { "epoch": 0.32, "learning_rate": 0.00020324251108780217, "loss": 3.0532, "step": 39300 }, { "epoch": 0.32, "learning_rate": 0.00020299473897207606, "loss": 3.0457, "step": 39400 }, { "epoch": 0.33, "learning_rate": 0.00020274944457750725, "loss": 3.0481, "step": 39500 }, { "epoch": 0.33, "learning_rate": 0.00020250167246178112, "loss": 3.0524, "step": 39600 }, { "epoch": 0.33, "learning_rate": 0.00020225390034605506, "loss": 3.0513, "step": 39700 }, { "epoch": 0.33, "learning_rate": 0.00020200612823032893, "loss": 3.0518, "step": 39800 }, { "epoch": 0.33, "learning_rate": 0.00020175835611460283, "loss": 3.0441, "step": 39900 }, { "epoch": 0.33, "learning_rate": 0.00020151058399887675, "loss": 3.0546, "step": 40000 }, { "epoch": 0.33, "eval_accuracy": 0.42341574054958775, "eval_loss": 3.0717380046844482, "eval_runtime": 36.8684, "eval_samples_per_second": 308.286, "eval_steps_per_second": 2.577, "step": 40000 }, { "epoch": 0.33, "learning_rate": 0.00020126281188315065, "loss": 3.0486, "step": 40100 }, { "epoch": 0.33, "learning_rate": 0.00020101503976742454, "loss": 3.0451, "step": 40200 }, { "epoch": 0.33, "learning_rate": 0.00020076726765169846, "loss": 3.0468, "step": 40300 }, { "epoch": 0.33, "learning_rate": 0.00020051949553597236, "loss": 3.0454, "step": 40400 }, { "epoch": 0.33, "learning_rate": 0.00020027172342024625, "loss": 3.0439, "step": 40500 }, { "epoch": 0.33, "learning_rate": 0.00020002642902567744, "loss": 3.0444, "step": 40600 }, { "epoch": 0.34, "learning_rate": 0.00019977865690995136, "loss": 3.0425, "step": 40700 }, { "epoch": 0.34, "learning_rate": 0.00019953088479422525, "loss": 3.0405, "step": 40800 }, { "epoch": 0.34, "learning_rate": 0.00019928311267849912, "loss": 3.0408, "step": 40900 }, { "epoch": 0.34, "learning_rate": 0.00019903534056277307, "loss": 3.0484, "step": 41000 }, { "epoch": 0.34, "eval_accuracy": 0.42364643272309593, "eval_loss": 3.0696725845336914, "eval_runtime": 36.5687, "eval_samples_per_second": 310.813, "eval_steps_per_second": 2.598, "step": 41000 }, { "epoch": 0.34, "learning_rate": 0.00019878756844704694, "loss": 3.0496, "step": 41100 }, { "epoch": 0.34, "learning_rate": 0.00019853979633132084, "loss": 3.0508, "step": 41200 }, { "epoch": 0.34, "learning_rate": 0.00019829202421559476, "loss": 3.0486, "step": 41300 }, { "epoch": 0.34, "learning_rate": 0.00019804425209986865, "loss": 3.0481, "step": 41400 }, { "epoch": 0.34, "learning_rate": 0.00019779647998414255, "loss": 3.032, "step": 41500 }, { "epoch": 0.34, "learning_rate": 0.00019754870786841647, "loss": 3.0444, "step": 41600 }, { "epoch": 0.34, "learning_rate": 0.00019730093575269037, "loss": 3.0446, "step": 41700 }, { "epoch": 0.34, "learning_rate": 0.0001970531636369643, "loss": 3.0434, "step": 41800 }, { "epoch": 0.35, "learning_rate": 0.00019680539152123818, "loss": 3.0394, "step": 41900 }, { "epoch": 0.35, "learning_rate": 0.00019656009712666937, "loss": 3.0441, "step": 42000 }, { "epoch": 0.35, "eval_accuracy": 0.4235577582539997, "eval_loss": 3.0694241523742676, "eval_runtime": 36.751, "eval_samples_per_second": 309.271, "eval_steps_per_second": 2.585, "step": 42000 }, { "epoch": 0.35, "learning_rate": 0.00019631232501094326, "loss": 3.0452, "step": 42100 }, { "epoch": 0.35, "learning_rate": 0.00019606455289521713, "loss": 3.0439, "step": 42200 }, { "epoch": 0.35, "learning_rate": 0.00019581678077949108, "loss": 3.0425, "step": 42300 }, { "epoch": 0.35, "learning_rate": 0.00019556900866376495, "loss": 3.0391, "step": 42400 }, { "epoch": 0.35, "learning_rate": 0.00019532123654803884, "loss": 3.0443, "step": 42500 }, { "epoch": 0.35, "learning_rate": 0.00019507346443231277, "loss": 3.0485, "step": 42600 }, { "epoch": 0.35, "learning_rate": 0.00019482817003774395, "loss": 3.046, "step": 42700 }, { "epoch": 0.35, "learning_rate": 0.00019458039792201784, "loss": 3.0427, "step": 42800 }, { "epoch": 0.35, "learning_rate": 0.00019433262580629174, "loss": 3.0363, "step": 42900 }, { "epoch": 0.35, "learning_rate": 0.00019408485369056566, "loss": 3.0292, "step": 43000 }, { "epoch": 0.35, "eval_accuracy": 0.4242269733879605, "eval_loss": 3.0662310123443604, "eval_runtime": 36.6029, "eval_samples_per_second": 310.522, "eval_steps_per_second": 2.595, "step": 43000 }, { "epoch": 0.36, "learning_rate": 0.00019383708157483956, "loss": 3.0414, "step": 43100 }, { "epoch": 0.36, "learning_rate": 0.00019358930945911345, "loss": 3.0474, "step": 43200 }, { "epoch": 0.36, "learning_rate": 0.00019334153734338737, "loss": 3.0427, "step": 43300 }, { "epoch": 0.36, "learning_rate": 0.00019309376522766127, "loss": 3.0452, "step": 43400 }, { "epoch": 0.36, "learning_rate": 0.0001928459931119352, "loss": 3.0393, "step": 43500 }, { "epoch": 0.36, "learning_rate": 0.0001925982209962091, "loss": 3.0402, "step": 43600 }, { "epoch": 0.36, "learning_rate": 0.00019235044888048295, "loss": 3.0395, "step": 43700 }, { "epoch": 0.36, "learning_rate": 0.00019210515448591414, "loss": 3.0315, "step": 43800 }, { "epoch": 0.36, "learning_rate": 0.00019185738237018803, "loss": 3.0371, "step": 43900 }, { "epoch": 0.36, "learning_rate": 0.00019160961025446196, "loss": 3.0384, "step": 44000 }, { "epoch": 0.36, "eval_accuracy": 0.4244334186363252, "eval_loss": 3.064300775527954, "eval_runtime": 37.8137, "eval_samples_per_second": 300.579, "eval_steps_per_second": 2.512, "step": 44000 }, { "epoch": 0.36, "learning_rate": 0.00019136183813873585, "loss": 3.0465, "step": 44100 }, { "epoch": 0.36, "learning_rate": 0.00019111406602300975, "loss": 3.0364, "step": 44200 }, { "epoch": 0.37, "learning_rate": 0.00019086629390728367, "loss": 3.0378, "step": 44300 }, { "epoch": 0.37, "learning_rate": 0.00019061852179155756, "loss": 3.0403, "step": 44400 }, { "epoch": 0.37, "learning_rate": 0.00019037074967583149, "loss": 3.0372, "step": 44500 }, { "epoch": 0.37, "learning_rate": 0.00019012297756010538, "loss": 3.0368, "step": 44600 }, { "epoch": 0.37, "learning_rate": 0.00018987520544437928, "loss": 3.0392, "step": 44700 }, { "epoch": 0.37, "learning_rate": 0.0001896274333286532, "loss": 3.0387, "step": 44800 }, { "epoch": 0.37, "learning_rate": 0.0001893796612129271, "loss": 3.0345, "step": 44900 }, { "epoch": 0.37, "learning_rate": 0.00018913188909720096, "loss": 3.0367, "step": 45000 }, { "epoch": 0.37, "eval_accuracy": 0.42403784737184114, "eval_loss": 3.062938928604126, "eval_runtime": 39.8903, "eval_samples_per_second": 284.931, "eval_steps_per_second": 2.382, "step": 45000 }, { "epoch": 0.37, "learning_rate": 0.00018888659470263214, "loss": 3.0391, "step": 45100 }, { "epoch": 0.37, "learning_rate": 0.00018863882258690607, "loss": 3.0395, "step": 45200 }, { "epoch": 0.37, "learning_rate": 0.00018839105047117996, "loss": 3.0362, "step": 45300 }, { "epoch": 0.37, "learning_rate": 0.00018814327835545386, "loss": 3.038, "step": 45400 }, { "epoch": 0.38, "learning_rate": 0.00018789550623972778, "loss": 3.0367, "step": 45500 }, { "epoch": 0.38, "learning_rate": 0.00018764773412400168, "loss": 3.0363, "step": 45600 }, { "epoch": 0.38, "learning_rate": 0.00018739996200827557, "loss": 3.0405, "step": 45700 }, { "epoch": 0.38, "learning_rate": 0.0001871521898925495, "loss": 3.0352, "step": 45800 }, { "epoch": 0.38, "learning_rate": 0.0001869044177768234, "loss": 3.0348, "step": 45900 }, { "epoch": 0.38, "learning_rate": 0.00018665664566109728, "loss": 3.0337, "step": 46000 }, { "epoch": 0.38, "eval_accuracy": 0.4246010688044603, "eval_loss": 3.0621790885925293, "eval_runtime": 36.366, "eval_samples_per_second": 312.545, "eval_steps_per_second": 2.612, "step": 46000 }, { "epoch": 0.38, "learning_rate": 0.0001864088735453712, "loss": 3.0339, "step": 46100 }, { "epoch": 0.38, "learning_rate": 0.00018616357915080236, "loss": 3.0373, "step": 46200 }, { "epoch": 0.38, "learning_rate": 0.00018591580703507626, "loss": 3.0311, "step": 46300 }, { "epoch": 0.38, "learning_rate": 0.00018566803491935015, "loss": 3.0392, "step": 46400 }, { "epoch": 0.38, "learning_rate": 0.00018542026280362407, "loss": 3.0461, "step": 46500 }, { "epoch": 0.38, "learning_rate": 0.00018517249068789797, "loss": 3.0335, "step": 46600 }, { "epoch": 0.39, "learning_rate": 0.00018492471857217186, "loss": 3.0345, "step": 46700 }, { "epoch": 0.39, "learning_rate": 0.0001846769464564458, "loss": 3.0327, "step": 46800 }, { "epoch": 0.39, "learning_rate": 0.00018442917434071968, "loss": 3.0338, "step": 46900 }, { "epoch": 0.39, "learning_rate": 0.00018418140222499358, "loss": 3.0385, "step": 47000 }, { "epoch": 0.39, "eval_accuracy": 0.4245407978762465, "eval_loss": 3.0599210262298584, "eval_runtime": 38.501, "eval_samples_per_second": 295.213, "eval_steps_per_second": 2.467, "step": 47000 }, { "epoch": 0.39, "learning_rate": 0.0001839336301092675, "loss": 3.0416, "step": 47100 }, { "epoch": 0.39, "learning_rate": 0.0001836858579935414, "loss": 3.0355, "step": 47200 }, { "epoch": 0.39, "learning_rate": 0.00018343808587781532, "loss": 3.0313, "step": 47300 }, { "epoch": 0.39, "learning_rate": 0.0001831903137620892, "loss": 3.0264, "step": 47400 }, { "epoch": 0.39, "learning_rate": 0.00018294254164636308, "loss": 3.0319, "step": 47500 }, { "epoch": 0.39, "learning_rate": 0.00018269476953063703, "loss": 3.0357, "step": 47600 }, { "epoch": 0.39, "learning_rate": 0.0001824469974149109, "loss": 3.035, "step": 47700 }, { "epoch": 0.39, "learning_rate": 0.0001821992252991848, "loss": 3.0333, "step": 47800 }, { "epoch": 0.39, "learning_rate": 0.00018195145318345872, "loss": 3.0326, "step": 47900 }, { "epoch": 0.4, "learning_rate": 0.0001817036810677326, "loss": 3.0319, "step": 48000 }, { "epoch": 0.4, "eval_accuracy": 0.4249543811422657, "eval_loss": 3.057422399520874, "eval_runtime": 36.2469, "eval_samples_per_second": 313.572, "eval_steps_per_second": 2.621, "step": 48000 }, { "epoch": 0.4, "learning_rate": 0.0001814559089520065, "loss": 3.0296, "step": 48100 }, { "epoch": 0.4, "learning_rate": 0.00018120813683628043, "loss": 3.0391, "step": 48200 }, { "epoch": 0.4, "learning_rate": 0.00018096036472055432, "loss": 3.0309, "step": 48300 }, { "epoch": 0.4, "learning_rate": 0.00018071259260482822, "loss": 3.0335, "step": 48400 }, { "epoch": 0.4, "learning_rate": 0.00018046482048910214, "loss": 3.0328, "step": 48500 }, { "epoch": 0.4, "learning_rate": 0.00018021704837337604, "loss": 3.0353, "step": 48600 }, { "epoch": 0.4, "learning_rate": 0.00017996927625764996, "loss": 3.0348, "step": 48700 }, { "epoch": 0.4, "learning_rate": 0.00017972150414192385, "loss": 3.0222, "step": 48800 }, { "epoch": 0.4, "learning_rate": 0.00017947373202619775, "loss": 3.0283, "step": 48900 }, { "epoch": 0.4, "learning_rate": 0.00017922595991047167, "loss": 3.0255, "step": 49000 }, { "epoch": 0.4, "eval_accuracy": 0.4249474534493676, "eval_loss": 3.05733585357666, "eval_runtime": 36.7665, "eval_samples_per_second": 309.14, "eval_steps_per_second": 2.584, "step": 49000 }, { "epoch": 0.4, "learning_rate": 0.00017897818779474557, "loss": 3.028, "step": 49100 }, { "epoch": 0.41, "learning_rate": 0.00017873289340017672, "loss": 3.0368, "step": 49200 }, { "epoch": 0.41, "learning_rate": 0.00017848512128445062, "loss": 3.0396, "step": 49300 }, { "epoch": 0.41, "learning_rate": 0.00017823734916872454, "loss": 3.0283, "step": 49400 }, { "epoch": 0.41, "learning_rate": 0.00017798957705299844, "loss": 3.0278, "step": 49500 }, { "epoch": 0.41, "learning_rate": 0.00017774180493727233, "loss": 3.0374, "step": 49600 }, { "epoch": 0.41, "learning_rate": 0.00017749403282154625, "loss": 3.0317, "step": 49700 }, { "epoch": 0.41, "learning_rate": 0.00017724626070582015, "loss": 3.0318, "step": 49800 }, { "epoch": 0.41, "learning_rate": 0.00017699848859009404, "loss": 3.0324, "step": 49900 }, { "epoch": 0.41, "learning_rate": 0.00017675071647436797, "loss": 3.021, "step": 50000 }, { "epoch": 0.41, "eval_accuracy": 0.4252966091714341, "eval_loss": 3.055666923522949, "eval_runtime": 36.7556, "eval_samples_per_second": 309.232, "eval_steps_per_second": 2.585, "step": 50000 }, { "epoch": 0.41, "learning_rate": 0.00017650294435864186, "loss": 3.0338, "step": 50100 }, { "epoch": 0.41, "learning_rate": 0.00017625517224291576, "loss": 3.0348, "step": 50200 }, { "epoch": 0.41, "learning_rate": 0.0001760098778483469, "loss": 3.0247, "step": 50300 }, { "epoch": 0.42, "learning_rate": 0.00017576210573262086, "loss": 3.0318, "step": 50400 }, { "epoch": 0.42, "learning_rate": 0.00017551433361689473, "loss": 3.023, "step": 50500 }, { "epoch": 0.42, "learning_rate": 0.00017526656150116863, "loss": 3.0271, "step": 50600 }, { "epoch": 0.42, "learning_rate": 0.00017501878938544255, "loss": 3.0305, "step": 50700 }, { "epoch": 0.42, "learning_rate": 0.00017477101726971644, "loss": 3.0332, "step": 50800 }, { "epoch": 0.42, "learning_rate": 0.00017452324515399034, "loss": 3.0292, "step": 50900 }, { "epoch": 0.42, "learning_rate": 0.00017427547303826426, "loss": 3.0305, "step": 51000 }, { "epoch": 0.42, "eval_accuracy": 0.42537212102402383, "eval_loss": 3.052976608276367, "eval_runtime": 36.1546, "eval_samples_per_second": 314.372, "eval_steps_per_second": 2.628, "step": 51000 }, { "epoch": 0.42, "learning_rate": 0.00017402770092253816, "loss": 3.0373, "step": 51100 }, { "epoch": 0.42, "learning_rate": 0.00017377992880681205, "loss": 3.0281, "step": 51200 }, { "epoch": 0.42, "learning_rate": 0.00017353215669108597, "loss": 3.0308, "step": 51300 }, { "epoch": 0.42, "learning_rate": 0.00017328438457535987, "loss": 3.0293, "step": 51400 }, { "epoch": 0.42, "learning_rate": 0.00017303661245963376, "loss": 3.0236, "step": 51500 }, { "epoch": 0.43, "learning_rate": 0.0001727888403439077, "loss": 3.0272, "step": 51600 }, { "epoch": 0.43, "learning_rate": 0.00017254354594933887, "loss": 3.0267, "step": 51700 }, { "epoch": 0.43, "learning_rate": 0.00017229577383361274, "loss": 3.0305, "step": 51800 }, { "epoch": 0.43, "learning_rate": 0.00017204800171788663, "loss": 3.0243, "step": 51900 }, { "epoch": 0.43, "learning_rate": 0.00017180022960216056, "loss": 3.0248, "step": 52000 }, { "epoch": 0.43, "eval_accuracy": 0.42570326474455517, "eval_loss": 3.0528042316436768, "eval_runtime": 36.159, "eval_samples_per_second": 314.334, "eval_steps_per_second": 2.627, "step": 52000 }, { "epoch": 0.43, "learning_rate": 0.00017155245748643445, "loss": 3.031, "step": 52100 }, { "epoch": 0.43, "learning_rate": 0.00017130468537070835, "loss": 3.0278, "step": 52200 }, { "epoch": 0.43, "learning_rate": 0.00017105691325498227, "loss": 3.0219, "step": 52300 }, { "epoch": 0.43, "learning_rate": 0.00017080914113925616, "loss": 3.0313, "step": 52400 }, { "epoch": 0.43, "learning_rate": 0.00017056136902353009, "loss": 3.0308, "step": 52500 }, { "epoch": 0.43, "learning_rate": 0.00017031359690780398, "loss": 3.0214, "step": 52600 }, { "epoch": 0.43, "learning_rate": 0.00017006582479207788, "loss": 3.0325, "step": 52700 }, { "epoch": 0.44, "learning_rate": 0.0001698180526763518, "loss": 3.0235, "step": 52800 }, { "epoch": 0.44, "learning_rate": 0.0001695702805606257, "loss": 3.0314, "step": 52900 }, { "epoch": 0.44, "learning_rate": 0.00016932498616605688, "loss": 3.0269, "step": 53000 }, { "epoch": 0.44, "eval_accuracy": 0.42607943846892443, "eval_loss": 3.049508571624756, "eval_runtime": 38.1346, "eval_samples_per_second": 298.05, "eval_steps_per_second": 2.491, "step": 53000 }, { "epoch": 0.44, "learning_rate": 0.00016907721405033074, "loss": 3.0216, "step": 53100 }, { "epoch": 0.44, "learning_rate": 0.0001688294419346047, "loss": 3.023, "step": 53200 }, { "epoch": 0.44, "learning_rate": 0.00016858166981887856, "loss": 3.0213, "step": 53300 }, { "epoch": 0.44, "learning_rate": 0.00016833389770315246, "loss": 3.0316, "step": 53400 }, { "epoch": 0.44, "learning_rate": 0.00016808612558742638, "loss": 3.0297, "step": 53500 }, { "epoch": 0.44, "learning_rate": 0.00016783835347170028, "loss": 3.0295, "step": 53600 }, { "epoch": 0.44, "learning_rate": 0.00016759058135597417, "loss": 3.0267, "step": 53700 }, { "epoch": 0.44, "learning_rate": 0.0001673428092402481, "loss": 3.0158, "step": 53800 }, { "epoch": 0.44, "learning_rate": 0.000167095037124522, "loss": 3.0209, "step": 53900 }, { "epoch": 0.45, "learning_rate": 0.00016684726500879588, "loss": 3.0136, "step": 54000 }, { "epoch": 0.45, "eval_accuracy": 0.425942962918831, "eval_loss": 3.048839569091797, "eval_runtime": 38.3696, "eval_samples_per_second": 296.224, "eval_steps_per_second": 2.476, "step": 54000 }, { "epoch": 0.45, "learning_rate": 0.0001665994928930698, "loss": 3.0268, "step": 54100 }, { "epoch": 0.45, "learning_rate": 0.0001663517207773437, "loss": 3.018, "step": 54200 }, { "epoch": 0.45, "learning_rate": 0.0001661039486616176, "loss": 3.026, "step": 54300 }, { "epoch": 0.45, "learning_rate": 0.00016585617654589152, "loss": 3.0194, "step": 54400 }, { "epoch": 0.45, "learning_rate": 0.00016560840443016541, "loss": 3.0255, "step": 54500 }, { "epoch": 0.45, "learning_rate": 0.00016536063231443934, "loss": 3.0169, "step": 54600 }, { "epoch": 0.45, "learning_rate": 0.00016511286019871323, "loss": 3.0285, "step": 54700 }, { "epoch": 0.45, "learning_rate": 0.00016486508808298713, "loss": 3.0255, "step": 54800 }, { "epoch": 0.45, "learning_rate": 0.00016461731596726105, "loss": 3.0233, "step": 54900 }, { "epoch": 0.45, "learning_rate": 0.00016437202157269218, "loss": 3.0156, "step": 55000 }, { "epoch": 0.45, "eval_accuracy": 0.42623946817487157, "eval_loss": 3.0467984676361084, "eval_runtime": 36.8565, "eval_samples_per_second": 308.385, "eval_steps_per_second": 2.578, "step": 55000 }, { "epoch": 0.45, "learning_rate": 0.00016412672717812336, "loss": 3.0207, "step": 55100 }, { "epoch": 0.46, "learning_rate": 0.00016387895506239728, "loss": 3.025, "step": 55200 }, { "epoch": 0.46, "learning_rate": 0.00016363118294667118, "loss": 3.019, "step": 55300 }, { "epoch": 0.46, "learning_rate": 0.00016338341083094507, "loss": 3.013, "step": 55400 }, { "epoch": 0.46, "learning_rate": 0.000163135638715219, "loss": 3.0253, "step": 55500 }, { "epoch": 0.46, "learning_rate": 0.0001628878665994929, "loss": 3.0123, "step": 55600 }, { "epoch": 0.46, "learning_rate": 0.00016264009448376676, "loss": 3.0203, "step": 55700 }, { "epoch": 0.46, "learning_rate": 0.00016239232236804068, "loss": 3.0326, "step": 55800 }, { "epoch": 0.46, "learning_rate": 0.00016214455025231458, "loss": 3.0228, "step": 55900 }, { "epoch": 0.46, "learning_rate": 0.00016189677813658847, "loss": 3.022, "step": 56000 }, { "epoch": 0.46, "eval_accuracy": 0.42677844268234727, "eval_loss": 3.0453531742095947, "eval_runtime": 36.3795, "eval_samples_per_second": 312.429, "eval_steps_per_second": 2.611, "step": 56000 }, { "epoch": 0.46, "learning_rate": 0.0001616490060208624, "loss": 3.0227, "step": 56100 }, { "epoch": 0.46, "learning_rate": 0.0001614012339051363, "loss": 3.0261, "step": 56200 }, { "epoch": 0.46, "learning_rate": 0.0001611534617894102, "loss": 3.0218, "step": 56300 }, { "epoch": 0.47, "learning_rate": 0.0001609056896736841, "loss": 3.0216, "step": 56400 }, { "epoch": 0.47, "learning_rate": 0.000160657917557958, "loss": 3.0183, "step": 56500 }, { "epoch": 0.47, "learning_rate": 0.00016041014544223192, "loss": 3.0174, "step": 56600 }, { "epoch": 0.47, "learning_rate": 0.00016016237332650582, "loss": 3.0241, "step": 56700 }, { "epoch": 0.47, "learning_rate": 0.00015991460121077972, "loss": 3.0181, "step": 56800 }, { "epoch": 0.47, "learning_rate": 0.00015966682909505364, "loss": 3.0194, "step": 56900 }, { "epoch": 0.47, "learning_rate": 0.00015941905697932753, "loss": 3.0193, "step": 57000 }, { "epoch": 0.47, "eval_accuracy": 0.42686365330499443, "eval_loss": 3.044196367263794, "eval_runtime": 36.1843, "eval_samples_per_second": 314.114, "eval_steps_per_second": 2.625, "step": 57000 }, { "epoch": 0.47, "learning_rate": 0.00015917128486360143, "loss": 3.0154, "step": 57100 }, { "epoch": 0.47, "learning_rate": 0.00015892351274787535, "loss": 3.0283, "step": 57200 }, { "epoch": 0.47, "learning_rate": 0.0001586782183533065, "loss": 3.0184, "step": 57300 }, { "epoch": 0.47, "learning_rate": 0.0001584304462375804, "loss": 3.0122, "step": 57400 }, { "epoch": 0.47, "learning_rate": 0.0001581826741218543, "loss": 3.0183, "step": 57500 }, { "epoch": 0.47, "learning_rate": 0.00015793490200612822, "loss": 3.0212, "step": 57600 }, { "epoch": 0.48, "learning_rate": 0.00015768712989040211, "loss": 3.0084, "step": 57700 }, { "epoch": 0.48, "learning_rate": 0.000157439357774676, "loss": 3.0144, "step": 57800 }, { "epoch": 0.48, "learning_rate": 0.0001571940633801072, "loss": 3.0292, "step": 57900 }, { "epoch": 0.48, "learning_rate": 0.00015694629126438111, "loss": 3.0222, "step": 58000 }, { "epoch": 0.48, "eval_accuracy": 0.42703615285815827, "eval_loss": 3.0416929721832275, "eval_runtime": 36.2277, "eval_samples_per_second": 313.737, "eval_steps_per_second": 2.622, "step": 58000 }, { "epoch": 0.48, "learning_rate": 0.000156698519148655, "loss": 3.0173, "step": 58100 }, { "epoch": 0.48, "learning_rate": 0.00015645074703292888, "loss": 3.0236, "step": 58200 }, { "epoch": 0.48, "learning_rate": 0.00015620297491720283, "loss": 3.0151, "step": 58300 }, { "epoch": 0.48, "learning_rate": 0.0001559552028014767, "loss": 3.0253, "step": 58400 }, { "epoch": 0.48, "learning_rate": 0.0001557074306857506, "loss": 3.0175, "step": 58500 }, { "epoch": 0.48, "learning_rate": 0.0001554596585700245, "loss": 3.0128, "step": 58600 }, { "epoch": 0.48, "learning_rate": 0.0001552118864542984, "loss": 3.0129, "step": 58700 }, { "epoch": 0.48, "learning_rate": 0.0001549641143385723, "loss": 3.0188, "step": 58800 }, { "epoch": 0.49, "learning_rate": 0.00015471634222284623, "loss": 3.0085, "step": 58900 }, { "epoch": 0.49, "learning_rate": 0.00015446857010712012, "loss": 3.0111, "step": 59000 }, { "epoch": 0.49, "eval_accuracy": 0.42760214536793667, "eval_loss": 3.039332389831543, "eval_runtime": 36.2138, "eval_samples_per_second": 313.859, "eval_steps_per_second": 2.623, "step": 59000 }, { "epoch": 0.49, "learning_rate": 0.00015422079799139402, "loss": 2.9987, "step": 59100 }, { "epoch": 0.49, "learning_rate": 0.0001539755035968252, "loss": 3.0151, "step": 59200 }, { "epoch": 0.49, "learning_rate": 0.00015372773148109912, "loss": 3.011, "step": 59300 }, { "epoch": 0.49, "learning_rate": 0.00015347995936537302, "loss": 3.0117, "step": 59400 }, { "epoch": 0.49, "learning_rate": 0.00015323218724964689, "loss": 3.0211, "step": 59500 }, { "epoch": 0.49, "learning_rate": 0.00015298441513392083, "loss": 3.0137, "step": 59600 }, { "epoch": 0.49, "learning_rate": 0.0001527366430181947, "loss": 3.0184, "step": 59700 }, { "epoch": 0.49, "learning_rate": 0.0001524888709024686, "loss": 3.009, "step": 59800 }, { "epoch": 0.49, "learning_rate": 0.00015224109878674252, "loss": 3.0189, "step": 59900 }, { "epoch": 0.49, "learning_rate": 0.00015199332667101642, "loss": 3.0148, "step": 60000 }, { "epoch": 0.49, "eval_accuracy": 0.4273499773464442, "eval_loss": 3.0384342670440674, "eval_runtime": 37.3986, "eval_samples_per_second": 303.915, "eval_steps_per_second": 2.54, "step": 60000 }, { "epoch": 0.5, "learning_rate": 0.00015174555455529034, "loss": 3.0113, "step": 60100 }, { "epoch": 0.5, "learning_rate": 0.00015149778243956423, "loss": 3.0134, "step": 60200 }, { "epoch": 0.5, "learning_rate": 0.00015125001032383813, "loss": 3.0105, "step": 60300 }, { "epoch": 0.5, "learning_rate": 0.0001510047159292693, "loss": 3.0107, "step": 60400 }, { "epoch": 0.5, "learning_rate": 0.0001507569438135432, "loss": 3.0143, "step": 60500 }, { "epoch": 0.5, "learning_rate": 0.00015050917169781713, "loss": 3.0049, "step": 60600 }, { "epoch": 0.5, "learning_rate": 0.00015026139958209102, "loss": 3.0171, "step": 60700 }, { "epoch": 0.5, "learning_rate": 0.00015001362746636495, "loss": 3.0145, "step": 60800 }, { "epoch": 0.5, "learning_rate": 0.00014976585535063884, "loss": 3.0095, "step": 60900 }, { "epoch": 0.5, "learning_rate": 0.0001495180832349127, "loss": 3.0077, "step": 61000 }, { "epoch": 0.5, "eval_accuracy": 0.4275841333664015, "eval_loss": 3.0363619327545166, "eval_runtime": 36.6829, "eval_samples_per_second": 309.845, "eval_steps_per_second": 2.59, "step": 61000 }, { "epoch": 0.5, "learning_rate": 0.00014927031111918663, "loss": 3.0168, "step": 61100 }, { "epoch": 0.5, "learning_rate": 0.00014902253900346053, "loss": 3.0083, "step": 61200 }, { "epoch": 0.51, "learning_rate": 0.00014877476688773445, "loss": 3.0024, "step": 61300 }, { "epoch": 0.51, "learning_rate": 0.00014852699477200835, "loss": 3.018, "step": 61400 }, { "epoch": 0.51, "learning_rate": 0.00014827922265628224, "loss": 3.0216, "step": 61500 }, { "epoch": 0.51, "learning_rate": 0.00014803145054055616, "loss": 3.0105, "step": 61600 }, { "epoch": 0.51, "learning_rate": 0.00014778615614598732, "loss": 3.0053, "step": 61700 }, { "epoch": 0.51, "learning_rate": 0.00014753838403026121, "loss": 3.0133, "step": 61800 }, { "epoch": 0.51, "learning_rate": 0.00014729061191453514, "loss": 3.0113, "step": 61900 }, { "epoch": 0.51, "learning_rate": 0.00014704283979880903, "loss": 3.0167, "step": 62000 }, { "epoch": 0.51, "eval_accuracy": 0.42764232598674595, "eval_loss": 3.0357508659362793, "eval_runtime": 36.3375, "eval_samples_per_second": 312.79, "eval_steps_per_second": 2.614, "step": 62000 }, { "epoch": 0.51, "learning_rate": 0.00014679506768308293, "loss": 3.0026, "step": 62100 }, { "epoch": 0.51, "learning_rate": 0.00014654729556735685, "loss": 3.0072, "step": 62200 }, { "epoch": 0.51, "learning_rate": 0.00014629952345163074, "loss": 3.0118, "step": 62300 }, { "epoch": 0.51, "learning_rate": 0.00014605175133590464, "loss": 3.0098, "step": 62400 }, { "epoch": 0.52, "learning_rate": 0.00014580397922017853, "loss": 3.0079, "step": 62500 }, { "epoch": 0.52, "learning_rate": 0.00014555620710445246, "loss": 3.008, "step": 62600 }, { "epoch": 0.52, "learning_rate": 0.00014530843498872635, "loss": 3.0051, "step": 62700 }, { "epoch": 0.52, "learning_rate": 0.00014506066287300025, "loss": 3.0086, "step": 62800 }, { "epoch": 0.52, "learning_rate": 0.00014481289075727417, "loss": 3.0124, "step": 62900 }, { "epoch": 0.52, "learning_rate": 0.00014456511864154807, "loss": 3.0049, "step": 63000 }, { "epoch": 0.52, "eval_accuracy": 0.42800118047886987, "eval_loss": 3.0342743396759033, "eval_runtime": 36.3466, "eval_samples_per_second": 312.711, "eval_steps_per_second": 2.614, "step": 63000 }, { "epoch": 0.52, "learning_rate": 0.00014431982424697922, "loss": 3.017, "step": 63100 }, { "epoch": 0.52, "learning_rate": 0.00014407205213125314, "loss": 3.0057, "step": 63200 }, { "epoch": 0.52, "learning_rate": 0.00014382428001552704, "loss": 3.0143, "step": 63300 }, { "epoch": 0.52, "learning_rate": 0.0001435789856209582, "loss": 3.0131, "step": 63400 }, { "epoch": 0.52, "learning_rate": 0.00014333121350523212, "loss": 3.0048, "step": 63500 }, { "epoch": 0.52, "learning_rate": 0.000143083441389506, "loss": 3.0061, "step": 63600 }, { "epoch": 0.53, "learning_rate": 0.00014283566927377993, "loss": 3.0144, "step": 63700 }, { "epoch": 0.53, "learning_rate": 0.0001425878971580538, "loss": 3.0082, "step": 63800 }, { "epoch": 0.53, "learning_rate": 0.00014234012504232772, "loss": 3.0089, "step": 63900 }, { "epoch": 0.53, "learning_rate": 0.00014209235292660162, "loss": 3.016, "step": 64000 }, { "epoch": 0.53, "eval_accuracy": 0.4281286500281957, "eval_loss": 3.032212257385254, "eval_runtime": 36.9386, "eval_samples_per_second": 307.699, "eval_steps_per_second": 2.572, "step": 64000 }, { "epoch": 0.53, "learning_rate": 0.00014184458081087552, "loss": 3.0087, "step": 64100 }, { "epoch": 0.53, "learning_rate": 0.00014159680869514944, "loss": 3.0083, "step": 64200 }, { "epoch": 0.53, "learning_rate": 0.00014134903657942333, "loss": 3.0078, "step": 64300 }, { "epoch": 0.53, "learning_rate": 0.00014110126446369726, "loss": 3.0104, "step": 64400 }, { "epoch": 0.53, "learning_rate": 0.00014085349234797115, "loss": 3.0069, "step": 64500 }, { "epoch": 0.53, "learning_rate": 0.00014060572023224505, "loss": 2.9968, "step": 64600 }, { "epoch": 0.53, "learning_rate": 0.00014035794811651897, "loss": 3.0041, "step": 64700 }, { "epoch": 0.53, "learning_rate": 0.00014011017600079286, "loss": 3.0097, "step": 64800 }, { "epoch": 0.54, "learning_rate": 0.00013986240388506676, "loss": 3.0105, "step": 64900 }, { "epoch": 0.54, "learning_rate": 0.00013961463176934068, "loss": 3.0103, "step": 65000 }, { "epoch": 0.54, "eval_accuracy": 0.4285138297533326, "eval_loss": 3.0296883583068848, "eval_runtime": 36.5876, "eval_samples_per_second": 310.652, "eval_steps_per_second": 2.597, "step": 65000 }, { "epoch": 0.54, "learning_rate": 0.00013936685965361458, "loss": 2.9976, "step": 65100 }, { "epoch": 0.54, "learning_rate": 0.00013911908753788847, "loss": 3.0013, "step": 65200 }, { "epoch": 0.54, "learning_rate": 0.00013887131542216237, "loss": 3.0069, "step": 65300 }, { "epoch": 0.54, "learning_rate": 0.0001386235433064363, "loss": 3.0034, "step": 65400 }, { "epoch": 0.54, "learning_rate": 0.00013837577119071018, "loss": 3.0107, "step": 65500 }, { "epoch": 0.54, "learning_rate": 0.00013812799907498408, "loss": 3.0011, "step": 65600 }, { "epoch": 0.54, "learning_rate": 0.000137880226959258, "loss": 3.0037, "step": 65700 }, { "epoch": 0.54, "learning_rate": 0.0001376324548435319, "loss": 2.9999, "step": 65800 }, { "epoch": 0.54, "learning_rate": 0.0001373846827278058, "loss": 3.0036, "step": 65900 }, { "epoch": 0.54, "learning_rate": 0.00013713691061207971, "loss": 3.0066, "step": 66000 }, { "epoch": 0.54, "eval_accuracy": 0.42835587835525485, "eval_loss": 3.0290277004241943, "eval_runtime": 36.1991, "eval_samples_per_second": 313.985, "eval_steps_per_second": 2.624, "step": 66000 }, { "epoch": 0.55, "learning_rate": 0.0001368891384963536, "loss": 3.0091, "step": 66100 }, { "epoch": 0.55, "learning_rate": 0.00013664136638062753, "loss": 3.0024, "step": 66200 }, { "epoch": 0.55, "learning_rate": 0.0001363935942649014, "loss": 2.9975, "step": 66300 }, { "epoch": 0.55, "learning_rate": 0.00013614829987033258, "loss": 3.0005, "step": 66400 }, { "epoch": 0.55, "learning_rate": 0.00013590052775460648, "loss": 3.0091, "step": 66500 }, { "epoch": 0.55, "learning_rate": 0.00013565275563888037, "loss": 3.0056, "step": 66600 }, { "epoch": 0.55, "learning_rate": 0.0001354049835231543, "loss": 3.0058, "step": 66700 }, { "epoch": 0.55, "learning_rate": 0.0001351572114074282, "loss": 3.0075, "step": 66800 }, { "epoch": 0.55, "learning_rate": 0.0001349094392917021, "loss": 3.0081, "step": 66900 }, { "epoch": 0.55, "learning_rate": 0.000134661667175976, "loss": 2.9958, "step": 67000 }, { "epoch": 0.55, "eval_accuracy": 0.42845632990227794, "eval_loss": 3.0280661582946777, "eval_runtime": 36.2004, "eval_samples_per_second": 313.975, "eval_steps_per_second": 2.624, "step": 67000 }, { "epoch": 0.55, "learning_rate": 0.0001344138950602499, "loss": 2.9959, "step": 67100 }, { "epoch": 0.55, "learning_rate": 0.00013416612294452383, "loss": 3.0026, "step": 67200 }, { "epoch": 0.55, "learning_rate": 0.00013391835082879772, "loss": 2.9972, "step": 67300 }, { "epoch": 0.56, "learning_rate": 0.00013367057871307162, "loss": 3.0, "step": 67400 }, { "epoch": 0.56, "learning_rate": 0.00013342280659734554, "loss": 3.0026, "step": 67500 }, { "epoch": 0.56, "learning_rate": 0.0001331750344816194, "loss": 3.0058, "step": 67600 }, { "epoch": 0.56, "learning_rate": 0.00013292726236589333, "loss": 3.0031, "step": 67700 }, { "epoch": 0.56, "learning_rate": 0.00013268196797132449, "loss": 2.9992, "step": 67800 }, { "epoch": 0.56, "learning_rate": 0.0001324341958555984, "loss": 3.0035, "step": 67900 }, { "epoch": 0.56, "learning_rate": 0.0001321864237398723, "loss": 3.0062, "step": 68000 }, { "epoch": 0.56, "eval_accuracy": 0.428811027778663, "eval_loss": 3.0265986919403076, "eval_runtime": 36.5821, "eval_samples_per_second": 310.698, "eval_steps_per_second": 2.597, "step": 68000 }, { "epoch": 0.56, "learning_rate": 0.0001319386516241462, "loss": 2.9966, "step": 68100 }, { "epoch": 0.56, "learning_rate": 0.00013169087950842012, "loss": 3.0086, "step": 68200 }, { "epoch": 0.56, "learning_rate": 0.00013144310739269402, "loss": 3.0005, "step": 68300 }, { "epoch": 0.56, "learning_rate": 0.0001311953352769679, "loss": 2.9978, "step": 68400 }, { "epoch": 0.56, "learning_rate": 0.00013094756316124183, "loss": 3.0039, "step": 68500 }, { "epoch": 0.57, "learning_rate": 0.00013069979104551573, "loss": 3.0034, "step": 68600 }, { "epoch": 0.57, "learning_rate": 0.00013045201892978962, "loss": 2.997, "step": 68700 }, { "epoch": 0.57, "learning_rate": 0.00013020424681406355, "loss": 2.9994, "step": 68800 }, { "epoch": 0.57, "learning_rate": 0.00012995647469833744, "loss": 2.9964, "step": 68900 }, { "epoch": 0.57, "learning_rate": 0.00012970870258261134, "loss": 2.9985, "step": 69000 }, { "epoch": 0.57, "eval_accuracy": 0.42893919009727866, "eval_loss": 3.0245213508605957, "eval_runtime": 36.3523, "eval_samples_per_second": 312.662, "eval_steps_per_second": 2.613, "step": 69000 }, { "epoch": 0.57, "learning_rate": 0.00012946093046688523, "loss": 2.9977, "step": 69100 }, { "epoch": 0.57, "learning_rate": 0.00012921315835115915, "loss": 3.0076, "step": 69200 }, { "epoch": 0.57, "learning_rate": 0.00012896538623543305, "loss": 3.0036, "step": 69300 }, { "epoch": 0.57, "learning_rate": 0.00012871761411970695, "loss": 2.9946, "step": 69400 }, { "epoch": 0.57, "learning_rate": 0.00012846984200398087, "loss": 3.0045, "step": 69500 }, { "epoch": 0.57, "learning_rate": 0.00012822206988825476, "loss": 3.0005, "step": 69600 }, { "epoch": 0.57, "learning_rate": 0.00012797429777252866, "loss": 3.0016, "step": 69700 }, { "epoch": 0.58, "learning_rate": 0.00012772900337795984, "loss": 2.996, "step": 69800 }, { "epoch": 0.58, "learning_rate": 0.000127483708983391, "loss": 3.0003, "step": 69900 }, { "epoch": 0.58, "learning_rate": 0.0001272359368676649, "loss": 3.0031, "step": 70000 }, { "epoch": 0.58, "eval_accuracy": 0.42916849673220725, "eval_loss": 3.0224156379699707, "eval_runtime": 36.4457, "eval_samples_per_second": 311.861, "eval_steps_per_second": 2.607, "step": 70000 }, { "epoch": 0.58, "learning_rate": 0.00012698816475193881, "loss": 3.001, "step": 70100 }, { "epoch": 0.58, "learning_rate": 0.0001267403926362127, "loss": 3.0014, "step": 70200 }, { "epoch": 0.58, "learning_rate": 0.00012649262052048663, "loss": 3.0013, "step": 70300 }, { "epoch": 0.58, "learning_rate": 0.0001262448484047605, "loss": 2.9942, "step": 70400 }, { "epoch": 0.58, "learning_rate": 0.00012599707628903442, "loss": 3.0038, "step": 70500 }, { "epoch": 0.58, "learning_rate": 0.00012574930417330832, "loss": 2.9929, "step": 70600 }, { "epoch": 0.58, "learning_rate": 0.0001255015320575822, "loss": 2.9973, "step": 70700 }, { "epoch": 0.58, "learning_rate": 0.00012525375994185614, "loss": 2.9922, "step": 70800 }, { "epoch": 0.58, "learning_rate": 0.00012500598782613003, "loss": 2.9953, "step": 70900 }, { "epoch": 0.59, "learning_rate": 0.00012476069343156121, "loss": 2.9894, "step": 71000 }, { "epoch": 0.59, "eval_accuracy": 0.4295162669156941, "eval_loss": 3.0213873386383057, "eval_runtime": 36.4185, "eval_samples_per_second": 312.094, "eval_steps_per_second": 2.609, "step": 71000 }, { "epoch": 0.59, "learning_rate": 0.0001245129213158351, "loss": 2.9974, "step": 71100 }, { "epoch": 0.59, "learning_rate": 0.000124265149200109, "loss": 3.0013, "step": 71200 }, { "epoch": 0.59, "learning_rate": 0.00012401737708438293, "loss": 3.0012, "step": 71300 }, { "epoch": 0.59, "learning_rate": 0.00012376960496865682, "loss": 2.9985, "step": 71400 }, { "epoch": 0.59, "learning_rate": 0.00012352183285293072, "loss": 3.0029, "step": 71500 }, { "epoch": 0.59, "learning_rate": 0.00012327406073720464, "loss": 2.9952, "step": 71600 }, { "epoch": 0.59, "learning_rate": 0.00012302628862147853, "loss": 3.0009, "step": 71700 }, { "epoch": 0.59, "learning_rate": 0.00012277851650575243, "loss": 2.993, "step": 71800 }, { "epoch": 0.59, "learning_rate": 0.00012253074439002633, "loss": 2.994, "step": 71900 }, { "epoch": 0.59, "learning_rate": 0.00012228297227430025, "loss": 2.9929, "step": 72000 }, { "epoch": 0.59, "eval_accuracy": 0.4295869293832552, "eval_loss": 3.0192549228668213, "eval_runtime": 36.4669, "eval_samples_per_second": 311.68, "eval_steps_per_second": 2.605, "step": 72000 }, { "epoch": 0.59, "learning_rate": 0.00012203520015857414, "loss": 2.996, "step": 72100 }, { "epoch": 0.6, "learning_rate": 0.00012178742804284805, "loss": 3.002, "step": 72200 }, { "epoch": 0.6, "learning_rate": 0.00012153965592712196, "loss": 3.0022, "step": 72300 }, { "epoch": 0.6, "learning_rate": 0.00012129188381139587, "loss": 2.9964, "step": 72400 }, { "epoch": 0.6, "learning_rate": 0.00012104411169566975, "loss": 2.9914, "step": 72500 }, { "epoch": 0.6, "learning_rate": 0.00012079633957994366, "loss": 2.9934, "step": 72600 }, { "epoch": 0.6, "learning_rate": 0.00012054856746421757, "loss": 2.9932, "step": 72700 }, { "epoch": 0.6, "learning_rate": 0.00012030079534849146, "loss": 2.9928, "step": 72800 }, { "epoch": 0.6, "learning_rate": 0.00012005550095392263, "loss": 2.9955, "step": 72900 }, { "epoch": 0.6, "learning_rate": 0.00011980772883819654, "loss": 2.9904, "step": 73000 }, { "epoch": 0.6, "eval_accuracy": 0.4296423509264404, "eval_loss": 3.0176117420196533, "eval_runtime": 36.4739, "eval_samples_per_second": 311.62, "eval_steps_per_second": 2.605, "step": 73000 }, { "epoch": 0.6, "learning_rate": 0.00011955995672247044, "loss": 3.0003, "step": 73100 }, { "epoch": 0.6, "learning_rate": 0.00011931218460674435, "loss": 2.9891, "step": 73200 }, { "epoch": 0.6, "learning_rate": 0.00011906441249101825, "loss": 2.9927, "step": 73300 }, { "epoch": 0.61, "learning_rate": 0.00011881664037529216, "loss": 2.9979, "step": 73400 }, { "epoch": 0.61, "learning_rate": 0.00011856886825956606, "loss": 2.9961, "step": 73500 }, { "epoch": 0.61, "learning_rate": 0.00011832109614383997, "loss": 2.9888, "step": 73600 }, { "epoch": 0.61, "learning_rate": 0.00011807332402811386, "loss": 2.9908, "step": 73700 }, { "epoch": 0.61, "learning_rate": 0.00011782555191238777, "loss": 2.9999, "step": 73800 }, { "epoch": 0.61, "learning_rate": 0.00011757777979666167, "loss": 2.997, "step": 73900 }, { "epoch": 0.61, "learning_rate": 0.00011733000768093558, "loss": 2.9989, "step": 74000 }, { "epoch": 0.61, "eval_accuracy": 0.43006701850109663, "eval_loss": 3.0170629024505615, "eval_runtime": 36.6925, "eval_samples_per_second": 309.764, "eval_steps_per_second": 2.589, "step": 74000 }, { "epoch": 0.61, "learning_rate": 0.00011708223556520948, "loss": 2.9978, "step": 74100 }, { "epoch": 0.61, "learning_rate": 0.00011683446344948338, "loss": 2.9962, "step": 74200 }, { "epoch": 0.61, "learning_rate": 0.00011658669133375729, "loss": 3.0012, "step": 74300 }, { "epoch": 0.61, "learning_rate": 0.0001163389192180312, "loss": 2.9862, "step": 74400 }, { "epoch": 0.61, "learning_rate": 0.0001160911471023051, "loss": 2.9931, "step": 74500 }, { "epoch": 0.62, "learning_rate": 0.000115843374986579, "loss": 2.9931, "step": 74600 }, { "epoch": 0.62, "learning_rate": 0.00011559808059201017, "loss": 2.9819, "step": 74700 }, { "epoch": 0.62, "learning_rate": 0.00011535030847628408, "loss": 2.9885, "step": 74800 }, { "epoch": 0.62, "learning_rate": 0.00011510253636055796, "loss": 2.9905, "step": 74900 }, { "epoch": 0.62, "learning_rate": 0.00011485476424483187, "loss": 2.9959, "step": 75000 }, { "epoch": 0.62, "eval_accuracy": 0.4301397592765272, "eval_loss": 3.015258550643921, "eval_runtime": 36.8121, "eval_samples_per_second": 308.757, "eval_steps_per_second": 2.581, "step": 75000 }, { "epoch": 0.62, "learning_rate": 0.00011460699212910578, "loss": 2.9742, "step": 75100 }, { "epoch": 0.62, "learning_rate": 0.00011435922001337967, "loss": 2.9913, "step": 75200 }, { "epoch": 0.62, "learning_rate": 0.00011411144789765358, "loss": 2.9913, "step": 75300 }, { "epoch": 0.62, "learning_rate": 0.00011386367578192749, "loss": 2.9888, "step": 75400 }, { "epoch": 0.62, "learning_rate": 0.0001136159036662014, "loss": 2.9901, "step": 75500 }, { "epoch": 0.62, "learning_rate": 0.0001133681315504753, "loss": 2.9909, "step": 75600 }, { "epoch": 0.62, "learning_rate": 0.0001131203594347492, "loss": 2.9901, "step": 75700 }, { "epoch": 0.63, "learning_rate": 0.00011287506504018037, "loss": 2.984, "step": 75800 }, { "epoch": 0.63, "learning_rate": 0.00011262729292445427, "loss": 2.9969, "step": 75900 }, { "epoch": 0.63, "learning_rate": 0.00011237952080872818, "loss": 2.9847, "step": 76000 }, { "epoch": 0.63, "eval_accuracy": 0.43058243885271863, "eval_loss": 3.0142199993133545, "eval_runtime": 36.2994, "eval_samples_per_second": 313.118, "eval_steps_per_second": 2.617, "step": 76000 }, { "epoch": 0.63, "learning_rate": 0.00011213174869300209, "loss": 2.994, "step": 76100 }, { "epoch": 0.63, "learning_rate": 0.000111883976577276, "loss": 2.9892, "step": 76200 }, { "epoch": 0.63, "learning_rate": 0.00011163620446154988, "loss": 2.9926, "step": 76300 }, { "epoch": 0.63, "learning_rate": 0.00011138843234582379, "loss": 2.989, "step": 76400 }, { "epoch": 0.63, "learning_rate": 0.0001111406602300977, "loss": 2.992, "step": 76500 }, { "epoch": 0.63, "learning_rate": 0.00011089288811437159, "loss": 2.9784, "step": 76600 }, { "epoch": 0.63, "learning_rate": 0.0001106451159986455, "loss": 2.9865, "step": 76700 }, { "epoch": 0.63, "learning_rate": 0.00011039734388291941, "loss": 2.9886, "step": 76800 }, { "epoch": 0.63, "learning_rate": 0.00011014957176719332, "loss": 2.9855, "step": 76900 }, { "epoch": 0.63, "learning_rate": 0.00010990179965146721, "loss": 2.9892, "step": 77000 }, { "epoch": 0.63, "eval_accuracy": 0.4308013539483, "eval_loss": 3.0127484798431396, "eval_runtime": 37.0779, "eval_samples_per_second": 306.544, "eval_steps_per_second": 2.562, "step": 77000 }, { "epoch": 0.64, "learning_rate": 0.00010965402753574112, "loss": 2.986, "step": 77100 }, { "epoch": 0.64, "learning_rate": 0.00010940625542001503, "loss": 2.9875, "step": 77200 }, { "epoch": 0.64, "learning_rate": 0.00010916096102544618, "loss": 2.9868, "step": 77300 }, { "epoch": 0.64, "learning_rate": 0.0001089131889097201, "loss": 2.9904, "step": 77400 }, { "epoch": 0.64, "learning_rate": 0.000108665416793994, "loss": 2.9861, "step": 77500 }, { "epoch": 0.64, "learning_rate": 0.00010841764467826791, "loss": 2.9866, "step": 77600 }, { "epoch": 0.64, "learning_rate": 0.00010816987256254179, "loss": 2.9794, "step": 77700 }, { "epoch": 0.64, "learning_rate": 0.0001079221004468157, "loss": 2.9848, "step": 77800 }, { "epoch": 0.64, "learning_rate": 0.00010767432833108961, "loss": 2.982, "step": 77900 }, { "epoch": 0.64, "learning_rate": 0.0001074265562153635, "loss": 2.9924, "step": 78000 }, { "epoch": 0.64, "eval_accuracy": 0.4309537631920592, "eval_loss": 3.010981321334839, "eval_runtime": 36.5579, "eval_samples_per_second": 310.904, "eval_steps_per_second": 2.599, "step": 78000 }, { "epoch": 0.64, "learning_rate": 0.00010717878409963741, "loss": 2.9795, "step": 78100 }, { "epoch": 0.64, "learning_rate": 0.00010693101198391132, "loss": 2.9867, "step": 78200 }, { "epoch": 0.65, "learning_rate": 0.00010668323986818523, "loss": 2.9761, "step": 78300 }, { "epoch": 0.65, "learning_rate": 0.00010643546775245913, "loss": 2.9887, "step": 78400 }, { "epoch": 0.65, "learning_rate": 0.00010618769563673304, "loss": 2.9941, "step": 78500 }, { "epoch": 0.65, "learning_rate": 0.00010593992352100695, "loss": 2.9888, "step": 78600 }, { "epoch": 0.65, "learning_rate": 0.00010569215140528084, "loss": 2.9859, "step": 78700 }, { "epoch": 0.65, "learning_rate": 0.00010544437928955474, "loss": 2.9828, "step": 78800 }, { "epoch": 0.65, "learning_rate": 0.00010519660717382864, "loss": 2.9819, "step": 78900 }, { "epoch": 0.65, "learning_rate": 0.0001049513127792598, "loss": 2.991, "step": 79000 }, { "epoch": 0.65, "eval_accuracy": 0.4311969252127841, "eval_loss": 3.009610652923584, "eval_runtime": 36.4211, "eval_samples_per_second": 312.071, "eval_steps_per_second": 2.608, "step": 79000 }, { "epoch": 0.65, "learning_rate": 0.00010470354066353371, "loss": 2.9902, "step": 79100 }, { "epoch": 0.65, "learning_rate": 0.00010445576854780762, "loss": 2.9855, "step": 79200 }, { "epoch": 0.65, "learning_rate": 0.00010420799643208153, "loss": 2.9842, "step": 79300 }, { "epoch": 0.65, "learning_rate": 0.00010396022431635542, "loss": 2.9881, "step": 79400 }, { "epoch": 0.66, "learning_rate": 0.00010371245220062933, "loss": 2.9834, "step": 79500 }, { "epoch": 0.66, "learning_rate": 0.00010346468008490324, "loss": 2.9834, "step": 79600 }, { "epoch": 0.66, "learning_rate": 0.00010321690796917713, "loss": 2.9728, "step": 79700 }, { "epoch": 0.66, "learning_rate": 0.00010296913585345104, "loss": 2.988, "step": 79800 }, { "epoch": 0.66, "learning_rate": 0.00010272136373772495, "loss": 2.988, "step": 79900 }, { "epoch": 0.66, "learning_rate": 0.00010247606934315612, "loss": 2.9824, "step": 80000 }, { "epoch": 0.66, "eval_accuracy": 0.4311297265916721, "eval_loss": 3.0079753398895264, "eval_runtime": 37.4243, "eval_samples_per_second": 303.707, "eval_steps_per_second": 2.538, "step": 80000 }, { "epoch": 0.66, "learning_rate": 0.00010222829722743002, "loss": 2.9804, "step": 80100 }, { "epoch": 0.66, "learning_rate": 0.00010198052511170393, "loss": 2.9889, "step": 80200 }, { "epoch": 0.66, "learning_rate": 0.00010173275299597783, "loss": 2.9813, "step": 80300 }, { "epoch": 0.66, "learning_rate": 0.00010148498088025172, "loss": 2.9886, "step": 80400 }, { "epoch": 0.66, "learning_rate": 0.00010123720876452562, "loss": 2.9851, "step": 80500 }, { "epoch": 0.66, "learning_rate": 0.00010098943664879953, "loss": 2.982, "step": 80600 }, { "epoch": 0.67, "learning_rate": 0.00010074166453307344, "loss": 2.9822, "step": 80700 }, { "epoch": 0.67, "learning_rate": 0.00010049389241734734, "loss": 2.9797, "step": 80800 }, { "epoch": 0.67, "learning_rate": 0.00010024612030162125, "loss": 2.9913, "step": 80900 }, { "epoch": 0.67, "learning_rate": 9.999834818589516e-05, "loss": 2.9879, "step": 81000 }, { "epoch": 0.67, "eval_accuracy": 0.43145879200433396, "eval_loss": 3.0059893131256104, "eval_runtime": 37.641, "eval_samples_per_second": 301.958, "eval_steps_per_second": 2.524, "step": 81000 }, { "epoch": 0.67, "learning_rate": 9.975057607016905e-05, "loss": 2.9876, "step": 81100 }, { "epoch": 0.67, "learning_rate": 9.950280395444296e-05, "loss": 2.9798, "step": 81200 }, { "epoch": 0.67, "learning_rate": 9.925503183871687e-05, "loss": 2.9783, "step": 81300 }, { "epoch": 0.67, "learning_rate": 9.900973744414804e-05, "loss": 2.9856, "step": 81400 }, { "epoch": 0.67, "learning_rate": 9.876196532842193e-05, "loss": 2.99, "step": 81500 }, { "epoch": 0.67, "learning_rate": 9.851419321269584e-05, "loss": 2.9815, "step": 81600 }, { "epoch": 0.67, "learning_rate": 9.826642109696975e-05, "loss": 2.9816, "step": 81700 }, { "epoch": 0.67, "learning_rate": 9.801864898124363e-05, "loss": 2.9838, "step": 81800 }, { "epoch": 0.68, "learning_rate": 9.777087686551754e-05, "loss": 2.9795, "step": 81900 }, { "epoch": 0.68, "learning_rate": 9.752310474979145e-05, "loss": 2.9764, "step": 82000 }, { "epoch": 0.68, "eval_accuracy": 0.4320621940557624, "eval_loss": 3.004152774810791, "eval_runtime": 37.3223, "eval_samples_per_second": 304.537, "eval_steps_per_second": 2.545, "step": 82000 }, { "epoch": 0.68, "learning_rate": 9.727533263406536e-05, "loss": 2.9796, "step": 82100 }, { "epoch": 0.68, "learning_rate": 9.702756051833925e-05, "loss": 2.9796, "step": 82200 }, { "epoch": 0.68, "learning_rate": 9.677978840261316e-05, "loss": 2.9842, "step": 82300 }, { "epoch": 0.68, "learning_rate": 9.653201628688707e-05, "loss": 2.9867, "step": 82400 }, { "epoch": 0.68, "learning_rate": 9.628424417116097e-05, "loss": 2.9787, "step": 82500 }, { "epoch": 0.68, "learning_rate": 9.603647205543488e-05, "loss": 2.9812, "step": 82600 }, { "epoch": 0.68, "learning_rate": 9.578869993970878e-05, "loss": 2.9854, "step": 82700 }, { "epoch": 0.68, "learning_rate": 9.554092782398269e-05, "loss": 2.9859, "step": 82800 }, { "epoch": 0.68, "learning_rate": 9.529315570825657e-05, "loss": 2.9781, "step": 82900 }, { "epoch": 0.68, "learning_rate": 9.504538359253048e-05, "loss": 2.9827, "step": 83000 }, { "epoch": 0.68, "eval_accuracy": 0.43152391231757653, "eval_loss": 3.0029940605163574, "eval_runtime": 37.7162, "eval_samples_per_second": 301.356, "eval_steps_per_second": 2.519, "step": 83000 }, { "epoch": 0.69, "learning_rate": 9.479761147680439e-05, "loss": 2.9819, "step": 83100 }, { "epoch": 0.69, "learning_rate": 9.454983936107829e-05, "loss": 2.9774, "step": 83200 }, { "epoch": 0.69, "learning_rate": 9.430454496650946e-05, "loss": 2.9792, "step": 83300 }, { "epoch": 0.69, "learning_rate": 9.405677285078337e-05, "loss": 2.9814, "step": 83400 }, { "epoch": 0.69, "learning_rate": 9.380900073505726e-05, "loss": 2.9847, "step": 83500 }, { "epoch": 0.69, "learning_rate": 9.356370634048843e-05, "loss": 2.9777, "step": 83600 }, { "epoch": 0.69, "learning_rate": 9.331593422476234e-05, "loss": 2.9804, "step": 83700 }, { "epoch": 0.69, "learning_rate": 9.306816210903625e-05, "loss": 2.9823, "step": 83800 }, { "epoch": 0.69, "learning_rate": 9.282038999331014e-05, "loss": 2.9715, "step": 83900 }, { "epoch": 0.69, "learning_rate": 9.257261787758405e-05, "loss": 2.9769, "step": 84000 }, { "epoch": 0.69, "eval_accuracy": 0.4324085787006696, "eval_loss": 3.0011510848999023, "eval_runtime": 36.6978, "eval_samples_per_second": 309.719, "eval_steps_per_second": 2.589, "step": 84000 }, { "epoch": 0.69, "learning_rate": 9.232484576185796e-05, "loss": 2.9795, "step": 84100 }, { "epoch": 0.69, "learning_rate": 9.207707364613186e-05, "loss": 2.9795, "step": 84200 }, { "epoch": 0.7, "learning_rate": 9.182930153040575e-05, "loss": 2.9859, "step": 84300 }, { "epoch": 0.7, "learning_rate": 9.158152941467966e-05, "loss": 2.9744, "step": 84400 }, { "epoch": 0.7, "learning_rate": 9.133375729895357e-05, "loss": 2.9785, "step": 84500 }, { "epoch": 0.7, "learning_rate": 9.108598518322746e-05, "loss": 2.9776, "step": 84600 }, { "epoch": 0.7, "learning_rate": 9.083821306750137e-05, "loss": 2.9782, "step": 84700 }, { "epoch": 0.7, "learning_rate": 9.059044095177528e-05, "loss": 2.9744, "step": 84800 }, { "epoch": 0.7, "learning_rate": 9.034266883604918e-05, "loss": 2.9852, "step": 84900 }, { "epoch": 0.7, "learning_rate": 9.009489672032309e-05, "loss": 2.9788, "step": 85000 }, { "epoch": 0.7, "eval_accuracy": 0.43223330807034654, "eval_loss": 3.0002310276031494, "eval_runtime": 36.492, "eval_samples_per_second": 311.466, "eval_steps_per_second": 2.603, "step": 85000 }, { "epoch": 0.7, "learning_rate": 8.9847124604597e-05, "loss": 2.982, "step": 85100 }, { "epoch": 0.7, "learning_rate": 8.95993524888709e-05, "loss": 2.974, "step": 85200 }, { "epoch": 0.7, "learning_rate": 8.93515803731448e-05, "loss": 2.9794, "step": 85300 }, { "epoch": 0.7, "learning_rate": 8.910380825741871e-05, "loss": 2.973, "step": 85400 }, { "epoch": 0.7, "learning_rate": 8.885603614169262e-05, "loss": 2.9741, "step": 85500 }, { "epoch": 0.71, "learning_rate": 8.86082640259665e-05, "loss": 2.9796, "step": 85600 }, { "epoch": 0.71, "learning_rate": 8.83604919102404e-05, "loss": 2.9687, "step": 85700 }, { "epoch": 0.71, "learning_rate": 8.811271979451432e-05, "loss": 2.9755, "step": 85800 }, { "epoch": 0.71, "learning_rate": 8.786494767878822e-05, "loss": 2.9721, "step": 85900 }, { "epoch": 0.71, "learning_rate": 8.761717556306212e-05, "loss": 2.9734, "step": 86000 }, { "epoch": 0.71, "eval_accuracy": 0.43254782532792235, "eval_loss": 2.998685121536255, "eval_runtime": 36.4931, "eval_samples_per_second": 311.456, "eval_steps_per_second": 2.603, "step": 86000 }, { "epoch": 0.71, "learning_rate": 8.736940344733603e-05, "loss": 2.9778, "step": 86100 }, { "epoch": 0.71, "learning_rate": 8.712163133160994e-05, "loss": 2.9772, "step": 86200 }, { "epoch": 0.71, "learning_rate": 8.687385921588383e-05, "loss": 2.9683, "step": 86300 }, { "epoch": 0.71, "learning_rate": 8.662608710015774e-05, "loss": 2.9746, "step": 86400 }, { "epoch": 0.71, "learning_rate": 8.637831498443165e-05, "loss": 2.979, "step": 86500 }, { "epoch": 0.71, "learning_rate": 8.613054286870556e-05, "loss": 2.9792, "step": 86600 }, { "epoch": 0.71, "learning_rate": 8.588524847413671e-05, "loss": 2.967, "step": 86700 }, { "epoch": 0.72, "learning_rate": 8.563747635841062e-05, "loss": 2.9794, "step": 86800 }, { "epoch": 0.72, "learning_rate": 8.538970424268453e-05, "loss": 2.9738, "step": 86900 }, { "epoch": 0.72, "learning_rate": 8.514193212695841e-05, "loss": 2.9769, "step": 87000 }, { "epoch": 0.72, "eval_accuracy": 0.4327799030400102, "eval_loss": 2.997548818588257, "eval_runtime": 39.3891, "eval_samples_per_second": 288.557, "eval_steps_per_second": 2.412, "step": 87000 }, { "epoch": 0.72, "learning_rate": 8.489416001123232e-05, "loss": 2.9792, "step": 87100 }, { "epoch": 0.72, "learning_rate": 8.464638789550623e-05, "loss": 2.9722, "step": 87200 }, { "epoch": 0.72, "learning_rate": 8.439861577978014e-05, "loss": 2.9805, "step": 87300 }, { "epoch": 0.72, "learning_rate": 8.415084366405404e-05, "loss": 2.9765, "step": 87400 }, { "epoch": 0.72, "learning_rate": 8.390307154832794e-05, "loss": 2.9794, "step": 87500 }, { "epoch": 0.72, "learning_rate": 8.365529943260185e-05, "loss": 2.9697, "step": 87600 }, { "epoch": 0.72, "learning_rate": 8.341000503803301e-05, "loss": 2.9733, "step": 87700 }, { "epoch": 0.72, "learning_rate": 8.316223292230692e-05, "loss": 2.9746, "step": 87800 }, { "epoch": 0.72, "learning_rate": 8.291446080658083e-05, "loss": 2.9655, "step": 87900 }, { "epoch": 0.73, "learning_rate": 8.266668869085474e-05, "loss": 2.9676, "step": 88000 }, { "epoch": 0.73, "eval_accuracy": 0.432616409487614, "eval_loss": 2.9959168434143066, "eval_runtime": 36.4885, "eval_samples_per_second": 311.495, "eval_steps_per_second": 2.604, "step": 88000 }, { "epoch": 0.73, "learning_rate": 8.241891657512863e-05, "loss": 2.9681, "step": 88100 }, { "epoch": 0.73, "learning_rate": 8.21736221805598e-05, "loss": 2.9675, "step": 88200 }, { "epoch": 0.73, "learning_rate": 8.192585006483371e-05, "loss": 2.9789, "step": 88300 }, { "epoch": 0.73, "learning_rate": 8.167807794910759e-05, "loss": 2.9689, "step": 88400 }, { "epoch": 0.73, "learning_rate": 8.14303058333815e-05, "loss": 2.9683, "step": 88500 }, { "epoch": 0.73, "learning_rate": 8.118253371765541e-05, "loss": 2.9729, "step": 88600 }, { "epoch": 0.73, "learning_rate": 8.09347616019293e-05, "loss": 2.9676, "step": 88700 }, { "epoch": 0.73, "learning_rate": 8.068698948620321e-05, "loss": 2.9708, "step": 88800 }, { "epoch": 0.73, "learning_rate": 8.043921737047712e-05, "loss": 2.9669, "step": 88900 }, { "epoch": 0.73, "learning_rate": 8.019144525475103e-05, "loss": 2.9677, "step": 89000 }, { "epoch": 0.73, "eval_accuracy": 0.43304869752445824, "eval_loss": 2.9942848682403564, "eval_runtime": 36.9235, "eval_samples_per_second": 307.826, "eval_steps_per_second": 2.573, "step": 89000 }, { "epoch": 0.73, "learning_rate": 7.994367313902492e-05, "loss": 2.9678, "step": 89100 }, { "epoch": 0.74, "learning_rate": 7.969590102329883e-05, "loss": 2.9667, "step": 89200 }, { "epoch": 0.74, "learning_rate": 7.945060662873e-05, "loss": 2.9667, "step": 89300 }, { "epoch": 0.74, "learning_rate": 7.92028345130039e-05, "loss": 2.9772, "step": 89400 }, { "epoch": 0.74, "learning_rate": 7.895506239727781e-05, "loss": 2.9686, "step": 89500 }, { "epoch": 0.74, "learning_rate": 7.870729028155172e-05, "loss": 2.9651, "step": 89600 }, { "epoch": 0.74, "learning_rate": 7.845951816582562e-05, "loss": 2.9738, "step": 89700 }, { "epoch": 0.74, "learning_rate": 7.82117460500995e-05, "loss": 2.9674, "step": 89800 }, { "epoch": 0.74, "learning_rate": 7.796397393437342e-05, "loss": 2.9731, "step": 89900 }, { "epoch": 0.74, "learning_rate": 7.771620181864732e-05, "loss": 2.9739, "step": 90000 }, { "epoch": 0.74, "eval_accuracy": 0.4330424626008499, "eval_loss": 2.993264675140381, "eval_runtime": 36.5912, "eval_samples_per_second": 310.621, "eval_steps_per_second": 2.596, "step": 90000 }, { "epoch": 0.74, "learning_rate": 7.746842970292122e-05, "loss": 2.966, "step": 90100 }, { "epoch": 0.74, "learning_rate": 7.722065758719513e-05, "loss": 2.9695, "step": 90200 }, { "epoch": 0.74, "learning_rate": 7.697288547146904e-05, "loss": 2.9719, "step": 90300 }, { "epoch": 0.75, "learning_rate": 7.672511335574295e-05, "loss": 2.9691, "step": 90400 }, { "epoch": 0.75, "learning_rate": 7.647734124001684e-05, "loss": 2.9707, "step": 90500 }, { "epoch": 0.75, "learning_rate": 7.622956912429075e-05, "loss": 2.9685, "step": 90600 }, { "epoch": 0.75, "learning_rate": 7.598179700856466e-05, "loss": 2.9652, "step": 90700 }, { "epoch": 0.75, "learning_rate": 7.573402489283854e-05, "loss": 2.9687, "step": 90800 }, { "epoch": 0.75, "learning_rate": 7.548625277711245e-05, "loss": 2.9723, "step": 90900 }, { "epoch": 0.75, "learning_rate": 7.523848066138636e-05, "loss": 2.9691, "step": 91000 }, { "epoch": 0.75, "eval_accuracy": 0.4334020098622636, "eval_loss": 2.9914395809173584, "eval_runtime": 37.2256, "eval_samples_per_second": 305.327, "eval_steps_per_second": 2.552, "step": 91000 }, { "epoch": 0.75, "learning_rate": 7.499070854566027e-05, "loss": 2.9663, "step": 91100 }, { "epoch": 0.75, "learning_rate": 7.474293642993418e-05, "loss": 2.9696, "step": 91200 }, { "epoch": 0.75, "learning_rate": 7.449764203536533e-05, "loss": 2.969, "step": 91300 }, { "epoch": 0.75, "learning_rate": 7.42523476407965e-05, "loss": 2.9687, "step": 91400 }, { "epoch": 0.75, "learning_rate": 7.400705324622766e-05, "loss": 2.9709, "step": 91500 }, { "epoch": 0.76, "learning_rate": 7.375928113050156e-05, "loss": 2.9647, "step": 91600 }, { "epoch": 0.76, "learning_rate": 7.351150901477547e-05, "loss": 2.9715, "step": 91700 }, { "epoch": 0.76, "learning_rate": 7.326373689904938e-05, "loss": 2.9707, "step": 91800 }, { "epoch": 0.76, "learning_rate": 7.301596478332328e-05, "loss": 2.9665, "step": 91900 }, { "epoch": 0.76, "learning_rate": 7.276819266759719e-05, "loss": 2.969, "step": 92000 }, { "epoch": 0.76, "eval_accuracy": 0.4335689672611089, "eval_loss": 2.990133285522461, "eval_runtime": 36.2216, "eval_samples_per_second": 313.79, "eval_steps_per_second": 2.623, "step": 92000 }, { "epoch": 0.76, "learning_rate": 7.252042055187108e-05, "loss": 2.9663, "step": 92100 }, { "epoch": 0.76, "learning_rate": 7.227264843614499e-05, "loss": 2.9625, "step": 92200 }, { "epoch": 0.76, "learning_rate": 7.20248763204189e-05, "loss": 2.9639, "step": 92300 }, { "epoch": 0.76, "learning_rate": 7.17771042046928e-05, "loss": 2.9647, "step": 92400 }, { "epoch": 0.76, "learning_rate": 7.15293320889667e-05, "loss": 2.9621, "step": 92500 }, { "epoch": 0.76, "learning_rate": 7.12815599732406e-05, "loss": 2.9648, "step": 92600 }, { "epoch": 0.76, "learning_rate": 7.103378785751451e-05, "loss": 2.9671, "step": 92700 }, { "epoch": 0.77, "learning_rate": 7.078601574178842e-05, "loss": 2.9601, "step": 92800 }, { "epoch": 0.77, "learning_rate": 7.053824362606231e-05, "loss": 2.9639, "step": 92900 }, { "epoch": 0.77, "learning_rate": 7.029047151033622e-05, "loss": 2.9602, "step": 93000 }, { "epoch": 0.77, "eval_accuracy": 0.43370405727262273, "eval_loss": 2.9889016151428223, "eval_runtime": 37.7589, "eval_samples_per_second": 301.015, "eval_steps_per_second": 2.516, "step": 93000 }, { "epoch": 0.77, "learning_rate": 7.004269939461012e-05, "loss": 2.9696, "step": 93100 }, { "epoch": 0.77, "learning_rate": 6.979492727888402e-05, "loss": 2.966, "step": 93200 }, { "epoch": 0.77, "learning_rate": 6.954715516315793e-05, "loss": 2.9673, "step": 93300 }, { "epoch": 0.77, "learning_rate": 6.929938304743184e-05, "loss": 2.9631, "step": 93400 }, { "epoch": 0.77, "learning_rate": 6.905161093170574e-05, "loss": 2.9601, "step": 93500 }, { "epoch": 0.77, "learning_rate": 6.880383881597965e-05, "loss": 2.959, "step": 93600 }, { "epoch": 0.77, "learning_rate": 6.855854442141082e-05, "loss": 2.9669, "step": 93700 }, { "epoch": 0.77, "learning_rate": 6.831077230568471e-05, "loss": 2.9667, "step": 93800 }, { "epoch": 0.77, "learning_rate": 6.806300018995862e-05, "loss": 2.9612, "step": 93900 }, { "epoch": 0.78, "learning_rate": 6.781522807423251e-05, "loss": 2.965, "step": 94000 }, { "epoch": 0.78, "eval_accuracy": 0.43392020129104486, "eval_loss": 2.987159490585327, "eval_runtime": 37.9274, "eval_samples_per_second": 299.678, "eval_steps_per_second": 2.505, "step": 94000 }, { "epoch": 0.78, "learning_rate": 6.756745595850642e-05, "loss": 2.9569, "step": 94100 }, { "epoch": 0.78, "learning_rate": 6.731968384278033e-05, "loss": 2.9578, "step": 94200 }, { "epoch": 0.78, "learning_rate": 6.707191172705423e-05, "loss": 2.9567, "step": 94300 }, { "epoch": 0.78, "learning_rate": 6.682413961132814e-05, "loss": 2.9798, "step": 94400 }, { "epoch": 0.78, "learning_rate": 6.657636749560203e-05, "loss": 2.9641, "step": 94500 }, { "epoch": 0.78, "learning_rate": 6.632859537987594e-05, "loss": 2.959, "step": 94600 }, { "epoch": 0.78, "learning_rate": 6.608082326414985e-05, "loss": 2.9565, "step": 94700 }, { "epoch": 0.78, "learning_rate": 6.583305114842376e-05, "loss": 2.9648, "step": 94800 }, { "epoch": 0.78, "learning_rate": 6.558775675385491e-05, "loss": 2.9584, "step": 94900 }, { "epoch": 0.78, "learning_rate": 6.533998463812882e-05, "loss": 2.9627, "step": 95000 }, { "epoch": 0.78, "eval_accuracy": 0.4341079417685846, "eval_loss": 2.985278606414795, "eval_runtime": 36.3639, "eval_samples_per_second": 312.562, "eval_steps_per_second": 2.612, "step": 95000 }, { "epoch": 0.78, "learning_rate": 6.509221252240273e-05, "loss": 2.9647, "step": 95100 }, { "epoch": 0.78, "learning_rate": 6.484444040667663e-05, "loss": 2.9584, "step": 95200 }, { "epoch": 0.79, "learning_rate": 6.459666829095054e-05, "loss": 2.9659, "step": 95300 }, { "epoch": 0.79, "learning_rate": 6.434889617522443e-05, "loss": 2.9635, "step": 95400 }, { "epoch": 0.79, "learning_rate": 6.410112405949834e-05, "loss": 2.9615, "step": 95500 }, { "epoch": 0.79, "learning_rate": 6.385335194377225e-05, "loss": 2.9669, "step": 95600 }, { "epoch": 0.79, "learning_rate": 6.360557982804614e-05, "loss": 2.9563, "step": 95700 }, { "epoch": 0.79, "learning_rate": 6.335780771232005e-05, "loss": 2.9603, "step": 95800 }, { "epoch": 0.79, "learning_rate": 6.311003559659395e-05, "loss": 2.9576, "step": 95900 }, { "epoch": 0.79, "learning_rate": 6.286226348086786e-05, "loss": 2.9542, "step": 96000 }, { "epoch": 0.79, "eval_accuracy": 0.4340469780710809, "eval_loss": 2.984398603439331, "eval_runtime": 36.0861, "eval_samples_per_second": 314.969, "eval_steps_per_second": 2.633, "step": 96000 }, { "epoch": 0.79, "learning_rate": 6.261449136514177e-05, "loss": 2.9567, "step": 96100 }, { "epoch": 0.79, "learning_rate": 6.236671924941566e-05, "loss": 2.9652, "step": 96200 }, { "epoch": 0.79, "learning_rate": 6.211894713368957e-05, "loss": 2.9593, "step": 96300 }, { "epoch": 0.79, "learning_rate": 6.187117501796346e-05, "loss": 2.9532, "step": 96400 }, { "epoch": 0.8, "learning_rate": 6.162340290223737e-05, "loss": 2.9562, "step": 96500 }, { "epoch": 0.8, "learning_rate": 6.137563078651128e-05, "loss": 2.9667, "step": 96600 }, { "epoch": 0.8, "learning_rate": 6.112785867078519e-05, "loss": 2.9647, "step": 96700 }, { "epoch": 0.8, "learning_rate": 6.0880086555059086e-05, "loss": 2.9587, "step": 96800 }, { "epoch": 0.8, "learning_rate": 6.0632314439332995e-05, "loss": 2.961, "step": 96900 }, { "epoch": 0.8, "learning_rate": 6.03845423236069e-05, "loss": 2.9552, "step": 97000 }, { "epoch": 0.8, "eval_accuracy": 0.43437950733019187, "eval_loss": 2.9822094440460205, "eval_runtime": 36.4432, "eval_samples_per_second": 311.883, "eval_steps_per_second": 2.607, "step": 97000 }, { "epoch": 0.8, "learning_rate": 6.013924792903806e-05, "loss": 2.9531, "step": 97100 }, { "epoch": 0.8, "learning_rate": 5.989147581331197e-05, "loss": 2.9497, "step": 97200 }, { "epoch": 0.8, "learning_rate": 5.964618141874313e-05, "loss": 2.9569, "step": 97300 }, { "epoch": 0.8, "learning_rate": 5.939840930301703e-05, "loss": 2.9532, "step": 97400 }, { "epoch": 0.8, "learning_rate": 5.915063718729094e-05, "loss": 2.9589, "step": 97500 }, { "epoch": 0.8, "learning_rate": 5.890286507156484e-05, "loss": 2.9626, "step": 97600 }, { "epoch": 0.81, "learning_rate": 5.8655092955838746e-05, "loss": 2.9566, "step": 97700 }, { "epoch": 0.81, "learning_rate": 5.840732084011265e-05, "loss": 2.9493, "step": 97800 }, { "epoch": 0.81, "learning_rate": 5.8159548724386557e-05, "loss": 2.9595, "step": 97900 }, { "epoch": 0.81, "learning_rate": 5.791177660866046e-05, "loss": 2.9576, "step": 98000 }, { "epoch": 0.81, "eval_accuracy": 0.43472242812865003, "eval_loss": 2.98115611076355, "eval_runtime": 36.3414, "eval_samples_per_second": 312.757, "eval_steps_per_second": 2.614, "step": 98000 }, { "epoch": 0.81, "learning_rate": 5.7664004492934354e-05, "loss": 2.9558, "step": 98100 }, { "epoch": 0.81, "learning_rate": 5.741623237720826e-05, "loss": 2.9584, "step": 98200 }, { "epoch": 0.81, "learning_rate": 5.7168460261482165e-05, "loss": 2.9556, "step": 98300 }, { "epoch": 0.81, "learning_rate": 5.6920688145756073e-05, "loss": 2.9576, "step": 98400 }, { "epoch": 0.81, "learning_rate": 5.6672916030029976e-05, "loss": 2.9546, "step": 98500 }, { "epoch": 0.81, "learning_rate": 5.6425143914303884e-05, "loss": 2.9607, "step": 98600 }, { "epoch": 0.81, "learning_rate": 5.6177371798577786e-05, "loss": 2.9464, "step": 98700 }, { "epoch": 0.81, "learning_rate": 5.593207740400895e-05, "loss": 2.9614, "step": 98800 }, { "epoch": 0.82, "learning_rate": 5.568430528828286e-05, "loss": 2.9592, "step": 98900 }, { "epoch": 0.82, "learning_rate": 5.543653317255675e-05, "loss": 2.9579, "step": 99000 }, { "epoch": 0.82, "eval_accuracy": 0.4348104098284565, "eval_loss": 2.9802134037017822, "eval_runtime": 37.635, "eval_samples_per_second": 302.006, "eval_steps_per_second": 2.524, "step": 99000 }, { "epoch": 0.82, "learning_rate": 5.518876105683066e-05, "loss": 2.9605, "step": 99100 }, { "epoch": 0.82, "learning_rate": 5.4940988941104564e-05, "loss": 2.9602, "step": 99200 }, { "epoch": 0.82, "learning_rate": 5.4693216825378466e-05, "loss": 2.9587, "step": 99300 }, { "epoch": 0.82, "learning_rate": 5.4445444709652375e-05, "loss": 2.9472, "step": 99400 }, { "epoch": 0.82, "learning_rate": 5.419767259392627e-05, "loss": 2.9589, "step": 99500 }, { "epoch": 0.82, "learning_rate": 5.394990047820018e-05, "loss": 2.9673, "step": 99600 }, { "epoch": 0.82, "learning_rate": 5.370212836247408e-05, "loss": 2.9532, "step": 99700 }, { "epoch": 0.82, "learning_rate": 5.345435624674799e-05, "loss": 2.9508, "step": 99800 }, { "epoch": 0.82, "learning_rate": 5.320658413102189e-05, "loss": 2.9546, "step": 99900 }, { "epoch": 0.82, "learning_rate": 5.2961289736453054e-05, "loss": 2.9508, "step": 100000 }, { "epoch": 0.82, "eval_accuracy": 0.4348886927582055, "eval_loss": 2.9783637523651123, "eval_runtime": 36.4236, "eval_samples_per_second": 312.05, "eval_steps_per_second": 2.608, "step": 100000 }, { "epoch": 0.83, "learning_rate": 5.271599534188422e-05, "loss": 2.9495, "step": 100100 }, { "epoch": 0.83, "learning_rate": 5.2468223226158125e-05, "loss": 2.9514, "step": 100200 }, { "epoch": 0.83, "learning_rate": 5.222045111043202e-05, "loss": 2.9546, "step": 100300 }, { "epoch": 0.83, "learning_rate": 5.197267899470593e-05, "loss": 2.9544, "step": 100400 }, { "epoch": 0.83, "learning_rate": 5.172490687897983e-05, "loss": 2.9568, "step": 100500 }, { "epoch": 0.83, "learning_rate": 5.147713476325374e-05, "loss": 2.9591, "step": 100600 }, { "epoch": 0.83, "learning_rate": 5.122936264752764e-05, "loss": 2.9574, "step": 100700 }, { "epoch": 0.83, "learning_rate": 5.098159053180155e-05, "loss": 2.9591, "step": 100800 }, { "epoch": 0.83, "learning_rate": 5.073381841607545e-05, "loss": 2.9438, "step": 100900 }, { "epoch": 0.83, "learning_rate": 5.048604630034935e-05, "loss": 2.9551, "step": 101000 }, { "epoch": 0.83, "eval_accuracy": 0.43532513741078865, "eval_loss": 2.977102041244507, "eval_runtime": 36.649, "eval_samples_per_second": 310.131, "eval_steps_per_second": 2.592, "step": 101000 }, { "epoch": 0.83, "learning_rate": 5.023827418462326e-05, "loss": 2.9481, "step": 101100 }, { "epoch": 0.83, "learning_rate": 4.999050206889716e-05, "loss": 2.9504, "step": 101200 }, { "epoch": 0.84, "learning_rate": 4.974272995317107e-05, "loss": 2.9566, "step": 101300 }, { "epoch": 0.84, "learning_rate": 4.949495783744497e-05, "loss": 2.9571, "step": 101400 }, { "epoch": 0.84, "learning_rate": 4.924718572171888e-05, "loss": 2.9469, "step": 101500 }, { "epoch": 0.84, "learning_rate": 4.8999413605992774e-05, "loss": 2.9507, "step": 101600 }, { "epoch": 0.84, "learning_rate": 4.875164149026668e-05, "loss": 2.9506, "step": 101700 }, { "epoch": 0.84, "learning_rate": 4.8503869374540585e-05, "loss": 2.944, "step": 101800 }, { "epoch": 0.84, "learning_rate": 4.825609725881449e-05, "loss": 2.9513, "step": 101900 }, { "epoch": 0.84, "learning_rate": 4.8008325143088396e-05, "loss": 2.9535, "step": 102000 }, { "epoch": 0.84, "eval_accuracy": 0.4356812208257533, "eval_loss": 2.975886106491089, "eval_runtime": 37.6554, "eval_samples_per_second": 301.843, "eval_steps_per_second": 2.523, "step": 102000 }, { "epoch": 0.84, "learning_rate": 4.77605530273623e-05, "loss": 2.9564, "step": 102100 }, { "epoch": 0.84, "learning_rate": 4.751525863279347e-05, "loss": 2.9465, "step": 102200 }, { "epoch": 0.84, "learning_rate": 4.726748651706736e-05, "loss": 2.9458, "step": 102300 }, { "epoch": 0.84, "learning_rate": 4.7019714401341264e-05, "loss": 2.9552, "step": 102400 }, { "epoch": 0.85, "learning_rate": 4.677194228561517e-05, "loss": 2.9505, "step": 102500 }, { "epoch": 0.85, "learning_rate": 4.6524170169889075e-05, "loss": 2.9547, "step": 102600 }, { "epoch": 0.85, "learning_rate": 4.6276398054162984e-05, "loss": 2.9493, "step": 102700 }, { "epoch": 0.85, "learning_rate": 4.6028625938436886e-05, "loss": 2.9543, "step": 102800 }, { "epoch": 0.85, "learning_rate": 4.5780853822710795e-05, "loss": 2.9447, "step": 102900 }, { "epoch": 0.85, "learning_rate": 4.553308170698469e-05, "loss": 2.9479, "step": 103000 }, { "epoch": 0.85, "eval_accuracy": 0.43567152205569587, "eval_loss": 2.9743244647979736, "eval_runtime": 36.3875, "eval_samples_per_second": 312.36, "eval_steps_per_second": 2.611, "step": 103000 }, { "epoch": 0.85, "learning_rate": 4.528530959125859e-05, "loss": 2.9493, "step": 103100 }, { "epoch": 0.85, "learning_rate": 4.50375374755325e-05, "loss": 2.952, "step": 103200 }, { "epoch": 0.85, "learning_rate": 4.47897653598064e-05, "loss": 2.954, "step": 103300 }, { "epoch": 0.85, "learning_rate": 4.454199324408031e-05, "loss": 2.9462, "step": 103400 }, { "epoch": 0.85, "learning_rate": 4.429422112835421e-05, "loss": 2.9468, "step": 103500 }, { "epoch": 0.85, "learning_rate": 4.4046449012628116e-05, "loss": 2.9514, "step": 103600 }, { "epoch": 0.86, "learning_rate": 4.379867689690202e-05, "loss": 2.9504, "step": 103700 }, { "epoch": 0.86, "learning_rate": 4.355090478117592e-05, "loss": 2.9568, "step": 103800 }, { "epoch": 0.86, "learning_rate": 4.330313266544983e-05, "loss": 2.9525, "step": 103900 }, { "epoch": 0.86, "learning_rate": 4.305536054972373e-05, "loss": 2.9542, "step": 104000 }, { "epoch": 0.86, "eval_accuracy": 0.43588905161269764, "eval_loss": 2.973242998123169, "eval_runtime": 36.37, "eval_samples_per_second": 312.51, "eval_steps_per_second": 2.612, "step": 104000 }, { "epoch": 0.86, "learning_rate": 4.280758843399764e-05, "loss": 2.9467, "step": 104100 }, { "epoch": 0.86, "learning_rate": 4.25622940394288e-05, "loss": 2.9516, "step": 104200 }, { "epoch": 0.86, "learning_rate": 4.23145219237027e-05, "loss": 2.9421, "step": 104300 }, { "epoch": 0.86, "learning_rate": 4.2066749807976606e-05, "loss": 2.9451, "step": 104400 }, { "epoch": 0.86, "learning_rate": 4.182145541340777e-05, "loss": 2.9481, "step": 104500 }, { "epoch": 0.86, "learning_rate": 4.157368329768168e-05, "loss": 2.9514, "step": 104600 }, { "epoch": 0.86, "learning_rate": 4.132591118195558e-05, "loss": 2.9486, "step": 104700 }, { "epoch": 0.86, "learning_rate": 4.107813906622948e-05, "loss": 2.9429, "step": 104800 }, { "epoch": 0.86, "learning_rate": 4.083036695050339e-05, "loss": 2.948, "step": 104900 }, { "epoch": 0.87, "learning_rate": 4.0582594834777285e-05, "loss": 2.9481, "step": 105000 }, { "epoch": 0.87, "eval_accuracy": 0.4360442319336161, "eval_loss": 2.971482992172241, "eval_runtime": 36.0911, "eval_samples_per_second": 314.925, "eval_steps_per_second": 2.632, "step": 105000 }, { "epoch": 0.87, "learning_rate": 4.0334822719051194e-05, "loss": 2.9432, "step": 105100 }, { "epoch": 0.87, "learning_rate": 4.0087050603325096e-05, "loss": 2.9543, "step": 105200 }, { "epoch": 0.87, "learning_rate": 3.9839278487599005e-05, "loss": 2.9504, "step": 105300 }, { "epoch": 0.87, "learning_rate": 3.959150637187291e-05, "loss": 2.9536, "step": 105400 }, { "epoch": 0.87, "learning_rate": 3.9343734256146816e-05, "loss": 2.945, "step": 105500 }, { "epoch": 0.87, "learning_rate": 3.909596214042071e-05, "loss": 2.9532, "step": 105600 }, { "epoch": 0.87, "learning_rate": 3.884819002469461e-05, "loss": 2.9493, "step": 105700 }, { "epoch": 0.87, "learning_rate": 3.860041790896852e-05, "loss": 2.948, "step": 105800 }, { "epoch": 0.87, "learning_rate": 3.8352645793242424e-05, "loss": 2.9508, "step": 105900 }, { "epoch": 0.87, "learning_rate": 3.810735139867359e-05, "loss": 2.941, "step": 106000 }, { "epoch": 0.87, "eval_accuracy": 0.4362319724111558, "eval_loss": 2.969744920730591, "eval_runtime": 36.2819, "eval_samples_per_second": 313.269, "eval_steps_per_second": 2.618, "step": 106000 }, { "epoch": 0.87, "learning_rate": 3.7859579282947495e-05, "loss": 2.944, "step": 106100 }, { "epoch": 0.88, "learning_rate": 3.76118071672214e-05, "loss": 2.9433, "step": 106200 }, { "epoch": 0.88, "learning_rate": 3.7364035051495306e-05, "loss": 2.9442, "step": 106300 }, { "epoch": 0.88, "learning_rate": 3.711626293576921e-05, "loss": 2.9525, "step": 106400 }, { "epoch": 0.88, "learning_rate": 3.686849082004311e-05, "loss": 2.9423, "step": 106500 }, { "epoch": 0.88, "learning_rate": 3.662071870431701e-05, "loss": 2.9505, "step": 106600 }, { "epoch": 0.88, "learning_rate": 3.6372946588590914e-05, "loss": 2.948, "step": 106700 }, { "epoch": 0.88, "learning_rate": 3.612517447286482e-05, "loss": 2.9446, "step": 106800 }, { "epoch": 0.88, "learning_rate": 3.5877402357138725e-05, "loss": 2.9429, "step": 106900 }, { "epoch": 0.88, "learning_rate": 3.562963024141263e-05, "loss": 2.9435, "step": 107000 }, { "epoch": 0.88, "eval_accuracy": 0.4365194716664288, "eval_loss": 2.9684131145477295, "eval_runtime": 36.3894, "eval_samples_per_second": 312.343, "eval_steps_per_second": 2.611, "step": 107000 }, { "epoch": 0.88, "learning_rate": 3.5381858125686536e-05, "loss": 2.9433, "step": 107100 }, { "epoch": 0.88, "learning_rate": 3.513408600996043e-05, "loss": 2.9447, "step": 107200 }, { "epoch": 0.88, "learning_rate": 3.488631389423434e-05, "loss": 2.9476, "step": 107300 }, { "epoch": 0.89, "learning_rate": 3.463854177850824e-05, "loss": 2.9508, "step": 107400 }, { "epoch": 0.89, "learning_rate": 3.4390769662782144e-05, "loss": 2.9423, "step": 107500 }, { "epoch": 0.89, "learning_rate": 3.414299754705605e-05, "loss": 2.9491, "step": 107600 }, { "epoch": 0.89, "learning_rate": 3.3897703152487215e-05, "loss": 2.943, "step": 107700 }, { "epoch": 0.89, "learning_rate": 3.364993103676112e-05, "loss": 2.9444, "step": 107800 }, { "epoch": 0.89, "learning_rate": 3.3402158921035026e-05, "loss": 2.9475, "step": 107900 }, { "epoch": 0.89, "learning_rate": 3.315438680530893e-05, "loss": 2.9403, "step": 108000 }, { "epoch": 0.89, "eval_accuracy": 0.4367591698407046, "eval_loss": 2.9674224853515625, "eval_runtime": 36.6777, "eval_samples_per_second": 309.888, "eval_steps_per_second": 2.59, "step": 108000 }, { "epoch": 0.89, "learning_rate": 3.290661468958283e-05, "loss": 2.9395, "step": 108100 }, { "epoch": 0.89, "learning_rate": 3.265884257385674e-05, "loss": 2.9413, "step": 108200 }, { "epoch": 0.89, "learning_rate": 3.241107045813064e-05, "loss": 2.9495, "step": 108300 }, { "epoch": 0.89, "learning_rate": 3.216329834240454e-05, "loss": 2.9461, "step": 108400 }, { "epoch": 0.89, "learning_rate": 3.1915526226678445e-05, "loss": 2.939, "step": 108500 }, { "epoch": 0.9, "learning_rate": 3.166775411095235e-05, "loss": 2.9436, "step": 108600 }, { "epoch": 0.9, "learning_rate": 3.1419981995226256e-05, "loss": 2.9414, "step": 108700 }, { "epoch": 0.9, "learning_rate": 3.117220987950016e-05, "loss": 2.9388, "step": 108800 }, { "epoch": 0.9, "learning_rate": 3.092691548493133e-05, "loss": 2.9425, "step": 108900 }, { "epoch": 0.9, "learning_rate": 3.067914336920522e-05, "loss": 2.9453, "step": 109000 }, { "epoch": 0.9, "eval_accuracy": 0.43670790491325834, "eval_loss": 2.9661126136779785, "eval_runtime": 37.1173, "eval_samples_per_second": 306.218, "eval_steps_per_second": 2.559, "step": 109000 }, { "epoch": 0.9, "learning_rate": 3.0433848974636392e-05, "loss": 2.9453, "step": 109100 }, { "epoch": 0.9, "learning_rate": 3.0186076858910297e-05, "loss": 2.9398, "step": 109200 }, { "epoch": 0.9, "learning_rate": 2.99383047431842e-05, "loss": 2.9452, "step": 109300 }, { "epoch": 0.9, "learning_rate": 2.9690532627458105e-05, "loss": 2.9423, "step": 109400 }, { "epoch": 0.9, "learning_rate": 2.9442760511732007e-05, "loss": 2.945, "step": 109500 }, { "epoch": 0.9, "learning_rate": 2.919498839600591e-05, "loss": 2.941, "step": 109600 }, { "epoch": 0.9, "learning_rate": 2.8947216280279814e-05, "loss": 2.9404, "step": 109700 }, { "epoch": 0.91, "learning_rate": 2.869944416455372e-05, "loss": 2.9441, "step": 109800 }, { "epoch": 0.91, "learning_rate": 2.845167204882762e-05, "loss": 2.9427, "step": 109900 }, { "epoch": 0.91, "learning_rate": 2.8203899933101527e-05, "loss": 2.9396, "step": 110000 }, { "epoch": 0.91, "eval_accuracy": 0.43717206033743405, "eval_loss": 2.964357852935791, "eval_runtime": 36.5503, "eval_samples_per_second": 310.969, "eval_steps_per_second": 2.599, "step": 110000 }, { "epoch": 0.91, "learning_rate": 2.7956127817375432e-05, "loss": 2.9405, "step": 110100 }, { "epoch": 0.91, "learning_rate": 2.7708355701649334e-05, "loss": 2.9487, "step": 110200 }, { "epoch": 0.91, "learning_rate": 2.7460583585923236e-05, "loss": 2.9415, "step": 110300 }, { "epoch": 0.91, "learning_rate": 2.7212811470197142e-05, "loss": 2.9365, "step": 110400 }, { "epoch": 0.91, "learning_rate": 2.6965039354471044e-05, "loss": 2.9468, "step": 110500 }, { "epoch": 0.91, "learning_rate": 2.671726723874495e-05, "loss": 2.9311, "step": 110600 }, { "epoch": 0.91, "learning_rate": 2.6469495123018855e-05, "loss": 2.9402, "step": 110700 }, { "epoch": 0.91, "learning_rate": 2.6221723007292757e-05, "loss": 2.9361, "step": 110800 }, { "epoch": 0.91, "learning_rate": 2.5973950891566662e-05, "loss": 2.9328, "step": 110900 }, { "epoch": 0.92, "learning_rate": 2.5726178775840564e-05, "loss": 2.9375, "step": 111000 }, { "epoch": 0.92, "eval_accuracy": 0.43724757219002386, "eval_loss": 2.9633212089538574, "eval_runtime": 36.4613, "eval_samples_per_second": 311.728, "eval_steps_per_second": 2.606, "step": 111000 }, { "epoch": 0.92, "learning_rate": 2.5478406660114466e-05, "loss": 2.9379, "step": 111100 }, { "epoch": 0.92, "learning_rate": 2.523063454438837e-05, "loss": 2.933, "step": 111200 }, { "epoch": 0.92, "learning_rate": 2.4982862428662277e-05, "loss": 2.9385, "step": 111300 }, { "epoch": 0.92, "learning_rate": 2.473509031293618e-05, "loss": 2.9362, "step": 111400 }, { "epoch": 0.92, "learning_rate": 2.4487318197210084e-05, "loss": 2.9342, "step": 111500 }, { "epoch": 0.92, "learning_rate": 2.423954608148399e-05, "loss": 2.9392, "step": 111600 }, { "epoch": 0.92, "learning_rate": 2.3991773965757892e-05, "loss": 2.9382, "step": 111700 }, { "epoch": 0.92, "learning_rate": 2.3746479571189054e-05, "loss": 2.9376, "step": 111800 }, { "epoch": 0.92, "learning_rate": 2.349870745546296e-05, "loss": 2.9352, "step": 111900 }, { "epoch": 0.92, "learning_rate": 2.3250935339736865e-05, "loss": 2.9284, "step": 112000 }, { "epoch": 0.92, "eval_accuracy": 0.43742907774395523, "eval_loss": 2.96207857131958, "eval_runtime": 36.2575, "eval_samples_per_second": 313.48, "eval_steps_per_second": 2.62, "step": 112000 }, { "epoch": 0.92, "learning_rate": 2.300316322401077e-05, "loss": 2.9384, "step": 112100 }, { "epoch": 0.93, "learning_rate": 2.2755391108284673e-05, "loss": 2.933, "step": 112200 }, { "epoch": 0.93, "learning_rate": 2.2507618992558575e-05, "loss": 2.935, "step": 112300 }, { "epoch": 0.93, "learning_rate": 2.2259846876832477e-05, "loss": 2.938, "step": 112400 }, { "epoch": 0.93, "learning_rate": 2.2012074761106382e-05, "loss": 2.9392, "step": 112500 }, { "epoch": 0.93, "learning_rate": 2.1764302645380288e-05, "loss": 2.9409, "step": 112600 }, { "epoch": 0.93, "learning_rate": 2.151653052965419e-05, "loss": 2.9445, "step": 112700 }, { "epoch": 0.93, "learning_rate": 2.1268758413928095e-05, "loss": 2.9378, "step": 112800 }, { "epoch": 0.93, "learning_rate": 2.1020986298202e-05, "loss": 2.9357, "step": 112900 }, { "epoch": 0.93, "learning_rate": 2.0773214182475906e-05, "loss": 2.9418, "step": 113000 }, { "epoch": 0.93, "eval_accuracy": 0.4375648605247589, "eval_loss": 2.9606027603149414, "eval_runtime": 38.0645, "eval_samples_per_second": 298.598, "eval_steps_per_second": 2.496, "step": 113000 }, { "epoch": 0.93, "learning_rate": 2.0525442066749805e-05, "loss": 2.9426, "step": 113100 }, { "epoch": 0.93, "learning_rate": 2.028014767218097e-05, "loss": 2.9436, "step": 113200 }, { "epoch": 0.93, "learning_rate": 2.0032375556454876e-05, "loss": 2.9355, "step": 113300 }, { "epoch": 0.94, "learning_rate": 1.978460344072878e-05, "loss": 2.9296, "step": 113400 }, { "epoch": 0.94, "learning_rate": 1.9536831325002683e-05, "loss": 2.9288, "step": 113500 }, { "epoch": 0.94, "learning_rate": 1.9289059209276585e-05, "loss": 2.9385, "step": 113600 }, { "epoch": 0.94, "learning_rate": 1.904128709355049e-05, "loss": 2.9348, "step": 113700 }, { "epoch": 0.94, "learning_rate": 1.8793514977824393e-05, "loss": 2.9296, "step": 113800 }, { "epoch": 0.94, "learning_rate": 1.8545742862098298e-05, "loss": 2.9354, "step": 113900 }, { "epoch": 0.94, "learning_rate": 1.8297970746372204e-05, "loss": 2.934, "step": 114000 }, { "epoch": 0.94, "eval_accuracy": 0.4376666976103616, "eval_loss": 2.9594342708587646, "eval_runtime": 36.6834, "eval_samples_per_second": 309.841, "eval_steps_per_second": 2.59, "step": 114000 }, { "epoch": 0.94, "learning_rate": 1.8050198630646106e-05, "loss": 2.9292, "step": 114100 }, { "epoch": 0.94, "learning_rate": 1.7802426514920008e-05, "loss": 2.9371, "step": 114200 }, { "epoch": 0.94, "learning_rate": 1.7557132120351174e-05, "loss": 2.9319, "step": 114300 }, { "epoch": 0.94, "learning_rate": 1.730936000462508e-05, "loss": 2.9402, "step": 114400 }, { "epoch": 0.94, "learning_rate": 1.706158788889898e-05, "loss": 2.9303, "step": 114500 }, { "epoch": 0.94, "learning_rate": 1.6813815773172883e-05, "loss": 2.9395, "step": 114600 }, { "epoch": 0.95, "learning_rate": 1.656604365744679e-05, "loss": 2.9359, "step": 114700 }, { "epoch": 0.95, "learning_rate": 1.6318271541720694e-05, "loss": 2.9392, "step": 114800 }, { "epoch": 0.95, "learning_rate": 1.60704994259946e-05, "loss": 2.9353, "step": 114900 }, { "epoch": 0.95, "learning_rate": 1.58227273102685e-05, "loss": 2.9374, "step": 115000 }, { "epoch": 0.95, "eval_accuracy": 0.4379791365600679, "eval_loss": 2.958286762237549, "eval_runtime": 36.2601, "eval_samples_per_second": 313.458, "eval_steps_per_second": 2.62, "step": 115000 }, { "epoch": 0.95, "learning_rate": 1.5574955194542403e-05, "loss": 2.9304, "step": 115100 }, { "epoch": 0.95, "learning_rate": 1.532718307881631e-05, "loss": 2.9345, "step": 115200 }, { "epoch": 0.95, "learning_rate": 1.5079410963090212e-05, "loss": 2.9344, "step": 115300 }, { "epoch": 0.95, "learning_rate": 1.4834116568521378e-05, "loss": 2.9346, "step": 115400 }, { "epoch": 0.95, "learning_rate": 1.458634445279528e-05, "loss": 2.936, "step": 115500 }, { "epoch": 0.95, "learning_rate": 1.4338572337069184e-05, "loss": 2.9414, "step": 115600 }, { "epoch": 0.95, "learning_rate": 1.409080022134309e-05, "loss": 2.938, "step": 115700 }, { "epoch": 0.95, "learning_rate": 1.3843028105616993e-05, "loss": 2.9345, "step": 115800 }, { "epoch": 0.96, "learning_rate": 1.3595255989890895e-05, "loss": 2.9356, "step": 115900 }, { "epoch": 0.96, "learning_rate": 1.33474838741648e-05, "loss": 2.9302, "step": 116000 }, { "epoch": 0.96, "eval_accuracy": 0.43818142519269376, "eval_loss": 2.9568593502044678, "eval_runtime": 36.5356, "eval_samples_per_second": 311.094, "eval_steps_per_second": 2.6, "step": 116000 }, { "epoch": 0.96, "learning_rate": 1.3102189479595965e-05, "loss": 2.9352, "step": 116100 }, { "epoch": 0.96, "learning_rate": 1.2854417363869869e-05, "loss": 2.9384, "step": 116200 }, { "epoch": 0.96, "learning_rate": 1.2606645248143774e-05, "loss": 2.9256, "step": 116300 }, { "epoch": 0.96, "learning_rate": 1.2358873132417676e-05, "loss": 2.9308, "step": 116400 }, { "epoch": 0.96, "learning_rate": 1.211110101669158e-05, "loss": 2.9365, "step": 116500 }, { "epoch": 0.96, "learning_rate": 1.1863328900965485e-05, "loss": 2.9291, "step": 116600 }, { "epoch": 0.96, "learning_rate": 1.1615556785239389e-05, "loss": 2.9293, "step": 116700 }, { "epoch": 0.96, "learning_rate": 1.1367784669513291e-05, "loss": 2.9411, "step": 116800 }, { "epoch": 0.96, "learning_rate": 1.1120012553787196e-05, "loss": 2.9315, "step": 116900 }, { "epoch": 0.96, "learning_rate": 1.08722404380611e-05, "loss": 2.9273, "step": 117000 }, { "epoch": 0.96, "eval_accuracy": 0.4382222985807928, "eval_loss": 2.9559996128082275, "eval_runtime": 36.528, "eval_samples_per_second": 311.158, "eval_steps_per_second": 2.601, "step": 117000 }, { "epoch": 0.97, "learning_rate": 1.0624468322335004e-05, "loss": 2.9355, "step": 117100 }, { "epoch": 0.97, "learning_rate": 1.0376696206608908e-05, "loss": 2.9227, "step": 117200 }, { "epoch": 0.97, "learning_rate": 1.0128924090882811e-05, "loss": 2.9323, "step": 117300 }, { "epoch": 0.97, "learning_rate": 9.881151975156715e-06, "loss": 2.9335, "step": 117400 }, { "epoch": 0.97, "learning_rate": 9.63337985943062e-06, "loss": 2.9313, "step": 117500 }, { "epoch": 0.97, "learning_rate": 9.385607743704522e-06, "loss": 2.932, "step": 117600 }, { "epoch": 0.97, "learning_rate": 9.137835627978426e-06, "loss": 2.934, "step": 117700 }, { "epoch": 0.97, "learning_rate": 8.89006351225233e-06, "loss": 2.9361, "step": 117800 }, { "epoch": 0.97, "learning_rate": 8.642291396526234e-06, "loss": 2.9318, "step": 117900 }, { "epoch": 0.97, "learning_rate": 8.394519280800137e-06, "loss": 2.9338, "step": 118000 }, { "epoch": 0.97, "eval_accuracy": 0.4383601596694659, "eval_loss": 2.9548416137695312, "eval_runtime": 36.5258, "eval_samples_per_second": 311.177, "eval_steps_per_second": 2.601, "step": 118000 }, { "epoch": 0.97, "learning_rate": 8.146747165074043e-06, "loss": 2.9222, "step": 118100 }, { "epoch": 0.97, "learning_rate": 7.898975049347945e-06, "loss": 2.9346, "step": 118200 }, { "epoch": 0.98, "learning_rate": 7.65120293362185e-06, "loss": 2.9239, "step": 118300 }, { "epoch": 0.98, "learning_rate": 7.403430817895753e-06, "loss": 2.9302, "step": 118400 }, { "epoch": 0.98, "learning_rate": 7.155658702169658e-06, "loss": 2.9323, "step": 118500 }, { "epoch": 0.98, "learning_rate": 6.9078865864435605e-06, "loss": 2.9315, "step": 118600 }, { "epoch": 0.98, "learning_rate": 6.660114470717465e-06, "loss": 2.9308, "step": 118700 }, { "epoch": 0.98, "learning_rate": 6.412342354991369e-06, "loss": 2.9289, "step": 118800 }, { "epoch": 0.98, "learning_rate": 6.1645702392652725e-06, "loss": 2.9301, "step": 118900 }, { "epoch": 0.98, "learning_rate": 5.916798123539176e-06, "loss": 2.9304, "step": 119000 }, { "epoch": 0.98, "eval_accuracy": 0.43854651460842603, "eval_loss": 2.953854560852051, "eval_runtime": 37.1399, "eval_samples_per_second": 306.032, "eval_steps_per_second": 2.558, "step": 119000 }, { "epoch": 0.98, "learning_rate": 5.669026007813081e-06, "loss": 2.9354, "step": 119100 }, { "epoch": 0.98, "learning_rate": 5.423731613244245e-06, "loss": 2.9288, "step": 119200 }, { "epoch": 0.98, "learning_rate": 5.175959497518149e-06, "loss": 2.9323, "step": 119300 }, { "epoch": 0.98, "learning_rate": 4.928187381792053e-06, "loss": 2.9298, "step": 119400 }, { "epoch": 0.99, "learning_rate": 4.680415266065956e-06, "loss": 2.9226, "step": 119500 }, { "epoch": 0.99, "learning_rate": 4.435120871497121e-06, "loss": 2.9276, "step": 119600 }, { "epoch": 0.99, "learning_rate": 4.187348755771026e-06, "loss": 2.926, "step": 119700 }, { "epoch": 0.99, "learning_rate": 3.9395766400449295e-06, "loss": 2.9256, "step": 119800 }, { "epoch": 0.99, "learning_rate": 3.6918045243188332e-06, "loss": 2.9231, "step": 119900 }, { "epoch": 0.99, "learning_rate": 3.4440324085927365e-06, "loss": 2.9361, "step": 120000 }, { "epoch": 0.99, "eval_accuracy": 0.43850217737387787, "eval_loss": 2.9531476497650146, "eval_runtime": 36.7198, "eval_samples_per_second": 309.534, "eval_steps_per_second": 2.587, "step": 120000 }, { "epoch": 0.99, "learning_rate": 3.1962602928666402e-06, "loss": 2.9236, "step": 120100 }, { "epoch": 0.99, "learning_rate": 2.948488177140544e-06, "loss": 2.9354, "step": 120200 }, { "epoch": 0.99, "learning_rate": 2.700716061414448e-06, "loss": 2.9264, "step": 120300 }, { "epoch": 0.99, "learning_rate": 2.452943945688352e-06, "loss": 2.925, "step": 120400 }, { "epoch": 0.99, "learning_rate": 2.205171829962256e-06, "loss": 2.932, "step": 120500 }, { "epoch": 0.99, "learning_rate": 1.9573997142361597e-06, "loss": 2.9291, "step": 120600 }, { "epoch": 1.0, "learning_rate": 1.7096275985100634e-06, "loss": 2.9221, "step": 120700 }, { "epoch": 1.0, "learning_rate": 1.4643332039412285e-06, "loss": 2.9315, "step": 120800 }, { "epoch": 1.0, "learning_rate": 1.2165610882151322e-06, "loss": 2.9287, "step": 120900 }, { "epoch": 1.0, "learning_rate": 9.68788972489036e-07, "loss": 2.927, "step": 121000 }, { "epoch": 1.0, "eval_accuracy": 0.43872455631590834, "eval_loss": 2.9526402950286865, "eval_runtime": 36.2748, "eval_samples_per_second": 313.331, "eval_steps_per_second": 2.619, "step": 121000 }, { "epoch": 1.0, "learning_rate": 7.210168567629398e-07, "loss": 2.9372, "step": 121100 }, { "epoch": 1.0, "learning_rate": 4.732447410368437e-07, "loss": 2.9285, "step": 121200 }, { "epoch": 1.0, "step": 121279, "total_flos": 1.477015264316925e+20, "train_loss": 3.0377933233989878, "train_runtime": 170019.0203, "train_samples_per_second": 171.199, "train_steps_per_second": 0.713 } ], "logging_steps": 100, "max_steps": 121279, "num_train_epochs": 1, "save_steps": 12128, "total_flos": 1.477015264316925e+20, "trial_name": null, "trial_params": null }