{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.02, "grad_norm": 1.0315879583358765, "learning_rate": 9.6e-05, "loss": 0.8325, "step": 25 }, { "epoch": 0.04, "grad_norm": 0.20474183559417725, "learning_rate": 0.00019600000000000002, "loss": 0.4556, "step": 50 }, { "epoch": 0.06, "grad_norm": 0.22688248753547668, "learning_rate": 0.000296, "loss": 0.4015, "step": 75 }, { "epoch": 0.08, "grad_norm": 0.19172126054763794, "learning_rate": 0.00039600000000000003, "loss": 0.3645, "step": 100 }, { "epoch": 0.1, "grad_norm": 0.15229463577270508, "learning_rate": 0.000496, "loss": 0.3602, "step": 125 }, { "epoch": 0.12, "grad_norm": 0.20351602137088776, "learning_rate": 0.000596, "loss": 0.3475, "step": 150 }, { "epoch": 0.14, "grad_norm": 0.19812420010566711, "learning_rate": 0.000696, "loss": 0.345, "step": 175 }, { "epoch": 0.16, "grad_norm": 0.18347425758838654, "learning_rate": 0.000796, "loss": 0.3327, "step": 200 }, { "epoch": 0.18, "grad_norm": 0.2595237195491791, "learning_rate": 0.000896, "loss": 0.3302, "step": 225 }, { "epoch": 0.2, "grad_norm": 0.2259773463010788, "learning_rate": 0.000996, "loss": 0.3392, "step": 250 }, { "epoch": 0.22, "grad_norm": 0.2506314516067505, "learning_rate": 0.0009997192908557321, "loss": 0.328, "step": 275 }, { "epoch": 0.24, "grad_norm": 0.22288699448108673, "learning_rate": 0.000998830238119205, "loss": 0.3248, "step": 300 }, { "epoch": 0.26, "grad_norm": 0.29342198371887207, "learning_rate": 0.000997333437576437, "loss": 0.3206, "step": 325 }, { "epoch": 0.28, "grad_norm": 0.23638910055160522, "learning_rate": 0.0009952307128483257, "loss": 0.3119, "step": 350 }, { "epoch": 0.3, "grad_norm": 0.20430760085582733, "learning_rate": 0.0009925246257810518, "loss": 0.3029, "step": 375 }, { "epoch": 0.32, "grad_norm": 0.2517499327659607, "learning_rate": 0.0009892184733248665, "loss": 0.3218, "step": 400 }, { "epoch": 0.34, "grad_norm": 0.2284773737192154, "learning_rate": 0.0009853162835172637, "loss": 0.2969, "step": 425 }, { "epoch": 0.36, "grad_norm": 0.26538360118865967, "learning_rate": 0.0009808228105754376, "loss": 0.3134, "step": 450 }, { "epoch": 0.38, "grad_norm": 0.24021762609481812, "learning_rate": 0.0009757435291040016, "loss": 0.3076, "step": 475 }, { "epoch": 0.4, "grad_norm": 0.27008429169654846, "learning_rate": 0.0009700846274250251, "loss": 0.2964, "step": 500 }, { "epoch": 0.4, "eval_loss": 0.2879575788974762, "eval_runtime": 18.9021, "eval_samples_per_second": 54.121, "eval_steps_per_second": 0.846, "step": 500 }, { "epoch": 0.42, "grad_norm": 0.2501317858695984, "learning_rate": 0.000963853000038517, "loss": 0.2981, "step": 525 }, { "epoch": 0.44, "grad_norm": 0.19082719087600708, "learning_rate": 0.0009570562392225395, "loss": 0.2809, "step": 550 }, { "epoch": 0.46, "grad_norm": 0.24488045275211334, "learning_rate": 0.0009497026257831855, "loss": 0.291, "step": 575 }, { "epoch": 0.48, "grad_norm": 0.21422015130519867, "learning_rate": 0.0009418011189656941, "loss": 0.2851, "step": 600 }, { "epoch": 0.5, "grad_norm": 0.19914306700229645, "learning_rate": 0.0009333613455389882, "loss": 0.2841, "step": 625 }, { "epoch": 0.52, "grad_norm": 0.2367725670337677, "learning_rate": 0.000924393588066941, "loss": 0.2879, "step": 650 }, { "epoch": 0.54, "grad_norm": 0.24288254976272583, "learning_rate": 0.0009149087723806549, "loss": 0.2986, "step": 675 }, { "epoch": 0.56, "grad_norm": 0.22942474484443665, "learning_rate": 0.0009049184542670199, "loss": 0.2842, "step": 700 }, { "epoch": 0.58, "grad_norm": 0.23413586616516113, "learning_rate": 0.0008944348053897671, "loss": 0.2925, "step": 725 }, { "epoch": 0.6, "grad_norm": 0.22139714658260345, "learning_rate": 0.0008834705984601709, "loss": 0.2873, "step": 750 }, { "epoch": 0.62, "grad_norm": 0.2082182914018631, "learning_rate": 0.0008720391916754683, "loss": 0.2712, "step": 775 }, { "epoch": 0.64, "grad_norm": 0.24570266902446747, "learning_rate": 0.0008601545124439535, "loss": 0.2813, "step": 800 }, { "epoch": 0.66, "grad_norm": 0.20145930349826813, "learning_rate": 0.0008478310404165754, "loss": 0.2678, "step": 825 }, { "epoch": 0.68, "grad_norm": 0.1796109676361084, "learning_rate": 0.0008350837898457143, "loss": 0.2732, "step": 850 }, { "epoch": 0.7, "grad_norm": 0.22070881724357605, "learning_rate": 0.0008219282912926269, "loss": 0.2744, "step": 875 }, { "epoch": 0.72, "grad_norm": 0.18907737731933594, "learning_rate": 0.0008083805727058513, "loss": 0.2834, "step": 900 }, { "epoch": 0.74, "grad_norm": 0.1631525754928589, "learning_rate": 0.0007944571398936193, "loss": 0.2615, "step": 925 }, { "epoch": 0.76, "grad_norm": 0.1881309598684311, "learning_rate": 0.0007801749564140723, "loss": 0.2732, "step": 950 }, { "epoch": 0.78, "grad_norm": 0.19089096784591675, "learning_rate": 0.0007655514229077783, "loss": 0.2705, "step": 975 }, { "epoch": 0.8, "grad_norm": 0.17820408940315247, "learning_rate": 0.0007506043558977322, "loss": 0.2716, "step": 1000 }, { "epoch": 0.8, "eval_loss": 0.2520262598991394, "eval_runtime": 18.7237, "eval_samples_per_second": 54.637, "eval_steps_per_second": 0.855, "step": 1000 }, { "epoch": 0.82, "grad_norm": 0.1798166185617447, "learning_rate": 0.0007353519660826664, "loss": 0.2546, "step": 1025 }, { "epoch": 0.84, "grad_norm": 0.21515925228595734, "learning_rate": 0.00071981283615012, "loss": 0.2647, "step": 1050 }, { "epoch": 0.86, "grad_norm": 0.1794830709695816, "learning_rate": 0.0007040058981362964, "loss": 0.2577, "step": 1075 }, { "epoch": 0.88, "grad_norm": 0.18966132402420044, "learning_rate": 0.0006879504103602934, "loss": 0.2594, "step": 1100 }, { "epoch": 0.9, "grad_norm": 0.21163836121559143, "learning_rate": 0.0006716659339608077, "loss": 0.2725, "step": 1125 }, { "epoch": 0.92, "grad_norm": 0.17575646936893463, "learning_rate": 0.0006551723090639006, "loss": 0.2483, "step": 1150 }, { "epoch": 0.94, "grad_norm": 0.17414060235023499, "learning_rate": 0.0006384896306108612, "loss": 0.2555, "step": 1175 }, { "epoch": 0.96, "grad_norm": 0.16540151834487915, "learning_rate": 0.0006216382238756146, "loss": 0.2427, "step": 1200 }, { "epoch": 0.98, "grad_norm": 0.16461016237735748, "learning_rate": 0.0006046386197015076, "loss": 0.251, "step": 1225 }, { "epoch": 1.0, "grad_norm": 0.16811122000217438, "learning_rate": 0.0005875115294876381, "loss": 0.2451, "step": 1250 }, { "epoch": 1.02, "grad_norm": 0.17021119594573975, "learning_rate": 0.0005702778199552054, "loss": 0.2227, "step": 1275 }, { "epoch": 1.04, "grad_norm": 0.16884687542915344, "learning_rate": 0.000552958487724626, "loss": 0.2164, "step": 1300 }, { "epoch": 1.06, "grad_norm": 0.17195427417755127, "learning_rate": 0.0005355746337343836, "loss": 0.2406, "step": 1325 }, { "epoch": 1.08, "grad_norm": 0.15978871285915375, "learning_rate": 0.0005181474375327879, "loss": 0.2138, "step": 1350 }, { "epoch": 1.1, "grad_norm": 0.1938290148973465, "learning_rate": 0.0005006981314739573, "loss": 0.2322, "step": 1375 }, { "epoch": 1.12, "grad_norm": 0.1653754562139511, "learning_rate": 0.00048324797484946424, "loss": 0.2285, "step": 1400 }, { "epoch": 1.1400000000000001, "grad_norm": 0.16597716510295868, "learning_rate": 0.0004658182279871657, "loss": 0.2195, "step": 1425 }, { "epoch": 1.16, "grad_norm": 0.1771618276834488, "learning_rate": 0.00044843012634876645, "loss": 0.2311, "step": 1450 }, { "epoch": 1.18, "grad_norm": 0.1888245791196823, "learning_rate": 0.000431104854657681, "loss": 0.2224, "step": 1475 }, { "epoch": 1.2, "grad_norm": 0.19004148244857788, "learning_rate": 0.0004138635210887117, "loss": 0.2261, "step": 1500 }, { "epoch": 1.2, "eval_loss": 0.23447048664093018, "eval_runtime": 18.7321, "eval_samples_per_second": 54.612, "eval_steps_per_second": 0.854, "step": 1500 }, { "epoch": 1.22, "grad_norm": 0.15263979136943817, "learning_rate": 0.0003967271315509884, "loss": 0.2227, "step": 1525 }, { "epoch": 1.24, "grad_norm": 0.16230420768260956, "learning_rate": 0.0003797165640955041, "loss": 0.218, "step": 1550 }, { "epoch": 1.26, "grad_norm": 0.1780393123626709, "learning_rate": 0.0003628525434784268, "loss": 0.2228, "step": 1575 }, { "epoch": 1.28, "grad_norm": 0.1682538390159607, "learning_rate": 0.0003461556159111748, "loss": 0.2274, "step": 1600 }, { "epoch": 1.3, "grad_norm": 0.17813384532928467, "learning_rate": 0.0003296461240280242, "loss": 0.2095, "step": 1625 }, { "epoch": 1.32, "grad_norm": 0.17569252848625183, "learning_rate": 0.00031334418210174266, "loss": 0.2325, "step": 1650 }, { "epoch": 1.34, "grad_norm": 0.14148516952991486, "learning_rate": 0.0002972696515374455, "loss": 0.2152, "step": 1675 }, { "epoch": 1.3599999999999999, "grad_norm": 0.14034417271614075, "learning_rate": 0.00028144211667453366, "loss": 0.2189, "step": 1700 }, { "epoch": 1.38, "grad_norm": 0.16050834953784943, "learning_rate": 0.00026588086092619277, "loss": 0.2252, "step": 1725 }, { "epoch": 1.4, "grad_norm": 0.1896321177482605, "learning_rate": 0.00025060484328552466, "loss": 0.2219, "step": 1750 }, { "epoch": 1.42, "grad_norm": 0.16782040894031525, "learning_rate": 0.00023563267522693415, "loss": 0.2139, "step": 1775 }, { "epoch": 1.44, "grad_norm": 0.1926344931125641, "learning_rate": 0.0002209825980309151, "loss": 0.2211, "step": 1800 }, { "epoch": 1.46, "grad_norm": 0.16422989964485168, "learning_rate": 0.00020667246055985938, "loss": 0.2215, "step": 1825 }, { "epoch": 1.48, "grad_norm": 0.16321636736392975, "learning_rate": 0.00019271969751196778, "loss": 0.2066, "step": 1850 }, { "epoch": 1.5, "grad_norm": 0.15386155247688293, "learning_rate": 0.00017914130817975592, "loss": 0.2179, "step": 1875 }, { "epoch": 1.52, "grad_norm": 0.1478998363018036, "learning_rate": 0.00016595383573903412, "loss": 0.2117, "step": 1900 }, { "epoch": 1.54, "grad_norm": 0.16174481809139252, "learning_rate": 0.0001531733470935976, "loss": 0.2149, "step": 1925 }, { "epoch": 1.56, "grad_norm": 0.13468210399150848, "learning_rate": 0.00014081541330017704, "loss": 0.2037, "step": 1950 }, { "epoch": 1.58, "grad_norm": 0.1549725979566574, "learning_rate": 0.00012889509059750602, "loss": 0.2096, "step": 1975 }, { "epoch": 1.6, "grad_norm": 0.15235114097595215, "learning_rate": 0.00011742690206261292, "loss": 0.215, "step": 2000 }, { "epoch": 1.6, "eval_loss": 0.22170303761959076, "eval_runtime": 18.729, "eval_samples_per_second": 54.621, "eval_steps_per_second": 0.854, "step": 2000 }, { "epoch": 1.62, "grad_norm": 0.16818706691265106, "learning_rate": 0.0001064248199166884, "loss": 0.2024, "step": 2025 }, { "epoch": 1.6400000000000001, "grad_norm": 0.1615205854177475, "learning_rate": 9.590224850208645e-05, "loss": 0.2116, "step": 2050 }, { "epoch": 1.6600000000000001, "grad_norm": 0.15136006474494934, "learning_rate": 8.587200795119792e-05, "loss": 0.2159, "step": 2075 }, { "epoch": 1.6800000000000002, "grad_norm": 0.17629875242710114, "learning_rate": 7.634631856709389e-05, "loss": 0.2173, "step": 2100 }, { "epoch": 1.7, "grad_norm": 0.17757442593574524, "learning_rate": 6.733678593496901e-05, "loss": 0.2165, "step": 2125 }, { "epoch": 1.72, "grad_norm": 0.17096401751041412, "learning_rate": 5.885438678252342e-05, "loss": 0.2104, "step": 2150 }, { "epoch": 1.74, "grad_norm": 0.1552375704050064, "learning_rate": 5.0909455606510726e-05, "loss": 0.2065, "step": 2175 }, { "epoch": 1.76, "grad_norm": 0.2247905433177948, "learning_rate": 4.3511672081746386e-05, "loss": 0.211, "step": 2200 }, { "epoch": 1.78, "grad_norm": 0.15804603695869446, "learning_rate": 3.667004926791395e-05, "loss": 0.2135, "step": 2225 }, { "epoch": 1.8, "grad_norm": 0.15289010107517242, "learning_rate": 3.0392922628540875e-05, "loss": 0.2079, "step": 2250 }, { "epoch": 1.8199999999999998, "grad_norm": 0.148426815867424, "learning_rate": 2.468793987551998e-05, "loss": 0.2048, "step": 2275 }, { "epoch": 1.8399999999999999, "grad_norm": 0.1801871359348297, "learning_rate": 1.9562051651550784e-05, "loss": 0.1998, "step": 2300 }, { "epoch": 1.8599999999999999, "grad_norm": 0.15381543338298798, "learning_rate": 1.5021503061851349e-05, "loss": 0.2127, "step": 2325 }, { "epoch": 1.88, "grad_norm": 0.1602175533771515, "learning_rate": 1.1071826065460589e-05, "loss": 0.1991, "step": 2350 }, { "epoch": 1.9, "grad_norm": 0.17019180953502655, "learning_rate": 7.717832735397334e-06, "loss": 0.2036, "step": 2375 }, { "epoch": 1.92, "grad_norm": 0.19569824635982513, "learning_rate": 4.963609395891299e-06, "loss": 0.2034, "step": 2400 }, { "epoch": 1.94, "grad_norm": 0.15439940989017487, "learning_rate": 2.81251164382601e-06, "loss": 0.2063, "step": 2425 }, { "epoch": 1.96, "grad_norm": 0.1678876131772995, "learning_rate": 1.267160260461253e-06, "loss": 0.1939, "step": 2450 }, { "epoch": 1.98, "grad_norm": 0.14893606305122375, "learning_rate": 3.2943801841439634e-07, "loss": 0.203, "step": 2475 }, { "epoch": 2.0, "grad_norm": 0.13442523777484894, "learning_rate": 4.873877924582715e-10, "loss": 0.2036, "step": 2500 }, { "epoch": 2.0, "eval_loss": 0.21816210448741913, "eval_runtime": 18.7265, "eval_samples_per_second": 54.629, "eval_steps_per_second": 0.854, "step": 2500 }, { "epoch": 2.0, "step": 2500, "total_flos": 1.62588235137024e+18, "train_loss": 0.26149349727630616, "train_runtime": 2422.0842, "train_samples_per_second": 33.029, "train_steps_per_second": 1.032 } ], "logging_steps": 25, "max_steps": 2500, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.62588235137024e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }