{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 367, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.027285129604365622, "grad_norm": 2.3641693958998617, "learning_rate": 2.432432432432433e-06, "loss": 0.2044, "step": 10 }, { "epoch": 0.054570259208731244, "grad_norm": 1.4337953155689682, "learning_rate": 5.135135135135135e-06, "loss": 0.1107, "step": 20 }, { "epoch": 0.08185538881309687, "grad_norm": 0.76068821729334, "learning_rate": 7.837837837837838e-06, "loss": 0.0463, "step": 30 }, { "epoch": 0.10914051841746249, "grad_norm": 0.6475659229874231, "learning_rate": 9.99909372761763e-06, "loss": 0.0478, "step": 40 }, { "epoch": 0.1364256480218281, "grad_norm": 0.9686226673295716, "learning_rate": 9.96740867674275e-06, "loss": 0.0585, "step": 50 }, { "epoch": 0.16371077762619374, "grad_norm": 0.6271034418616805, "learning_rate": 9.890738003669029e-06, "loss": 0.0495, "step": 60 }, { "epoch": 0.19099590723055934, "grad_norm": 0.8008519457996116, "learning_rate": 9.769776049884564e-06, "loss": 0.0441, "step": 70 }, { "epoch": 0.21828103683492497, "grad_norm": 0.8781262839923529, "learning_rate": 9.60561826557425e-06, "loss": 0.0494, "step": 80 }, { "epoch": 0.24556616643929058, "grad_norm": 0.705533350341626, "learning_rate": 9.399751289053267e-06, "loss": 0.0342, "step": 90 }, { "epoch": 0.2728512960436562, "grad_norm": 0.8249561019911485, "learning_rate": 9.154039483540273e-06, "loss": 0.0465, "step": 100 }, { "epoch": 0.30013642564802184, "grad_norm": 0.7887431237474309, "learning_rate": 8.870708053195414e-06, "loss": 0.0375, "step": 110 }, { "epoch": 0.3274215552523875, "grad_norm": 0.339672156088185, "learning_rate": 8.552322891326846e-06, "loss": 0.0316, "step": 120 }, { "epoch": 0.35470668485675305, "grad_norm": 0.6938753582388089, "learning_rate": 8.201767343263612e-06, "loss": 0.0414, "step": 130 }, { "epoch": 0.3819918144611187, "grad_norm": 0.9113487394533759, "learning_rate": 7.822216094333847e-06, "loss": 0.0427, "step": 140 }, { "epoch": 0.4092769440654843, "grad_norm": 0.9002304401720267, "learning_rate": 7.4171064194228196e-06, "loss": 0.0508, "step": 150 }, { "epoch": 0.43656207366984995, "grad_norm": 0.46493723524279185, "learning_rate": 6.990107054479313e-06, "loss": 0.0353, "step": 160 }, { "epoch": 0.4638472032742155, "grad_norm": 0.5562245099793953, "learning_rate": 6.545084971874738e-06, "loss": 0.0509, "step": 170 }, { "epoch": 0.49113233287858116, "grad_norm": 0.7505136690416153, "learning_rate": 6.08607036050254e-06, "loss": 0.046, "step": 180 }, { "epoch": 0.5184174624829468, "grad_norm": 0.5036723905882816, "learning_rate": 5.617220127763474e-06, "loss": 0.0426, "step": 190 }, { "epoch": 0.5457025920873124, "grad_norm": 0.5892958090446017, "learning_rate": 5.142780253968481e-06, "loss": 0.0382, "step": 200 }, { "epoch": 0.572987721691678, "grad_norm": 0.43754682825635627, "learning_rate": 4.667047340083481e-06, "loss": 0.031, "step": 210 }, { "epoch": 0.6002728512960437, "grad_norm": 0.611056086304531, "learning_rate": 4.194329697045681e-06, "loss": 0.0372, "step": 220 }, { "epoch": 0.6275579809004093, "grad_norm": 0.4841019826566699, "learning_rate": 3.7289083290325668e-06, "loss": 0.0361, "step": 230 }, { "epoch": 0.654843110504775, "grad_norm": 0.44138946851724803, "learning_rate": 3.274998164025148e-06, "loss": 0.0335, "step": 240 }, { "epoch": 0.6821282401091405, "grad_norm": 0.6831939247936597, "learning_rate": 2.8367098827674575e-06, "loss": 0.0346, "step": 250 }, { "epoch": 0.7094133697135061, "grad_norm": 0.9062857258621547, "learning_rate": 2.418012691805191e-06, "loss": 0.0321, "step": 260 }, { "epoch": 0.7366984993178718, "grad_norm": 0.546275501732938, "learning_rate": 2.0226983777365604e-06, "loss": 0.0331, "step": 270 }, { "epoch": 0.7639836289222374, "grad_norm": 0.3539679256575418, "learning_rate": 1.6543469682057105e-06, "loss": 0.0357, "step": 280 }, { "epoch": 0.791268758526603, "grad_norm": 0.5908726655712924, "learning_rate": 1.3162943106179748e-06, "loss": 0.0333, "step": 290 }, { "epoch": 0.8185538881309686, "grad_norm": 0.5732288853145052, "learning_rate": 1.0116018621892237e-06, "loss": 0.0396, "step": 300 }, { "epoch": 0.8458390177353342, "grad_norm": 0.5804321672176804, "learning_rate": 7.430289649152156e-07, "loss": 0.0292, "step": 310 }, { "epoch": 0.8731241473396999, "grad_norm": 0.4784774157551655, "learning_rate": 5.130078565432089e-07, "loss": 0.0276, "step": 320 }, { "epoch": 0.9004092769440655, "grad_norm": 0.4332276557892971, "learning_rate": 3.2362164385026704e-07, "loss": 0.0378, "step": 330 }, { "epoch": 0.927694406548431, "grad_norm": 0.8116661660491021, "learning_rate": 1.765854377057219e-07, "loss": 0.0261, "step": 340 }, { "epoch": 0.9549795361527967, "grad_norm": 0.7533605649573977, "learning_rate": 7.32308207615351e-08, "loss": 0.0333, "step": 350 }, { "epoch": 0.9822646657571623, "grad_norm": 0.518345083092868, "learning_rate": 1.449378843361271e-08, "loss": 0.0308, "step": 360 } ], "logging_steps": 10, "max_steps": 367, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 11191955800064.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }