{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 4.761904761904762, "eval_steps": 10, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.23809523809523808, "grad_norm": 107.2943344116211, "learning_rate": 7e-06, "loss": 4.5568, "step": 10 }, { "epoch": 0.23809523809523808, "eval_loss": 4.329397678375244, "eval_runtime": 2.6051, "eval_samples_per_second": 982.322, "eval_steps_per_second": 15.355, "step": 10 }, { "epoch": 0.47619047619047616, "grad_norm": 82.67948150634766, "learning_rate": 1.7e-05, "loss": 3.937, "step": 20 }, { "epoch": 0.47619047619047616, "eval_loss": 3.166707754135132, "eval_runtime": 2.6689, "eval_samples_per_second": 958.832, "eval_steps_per_second": 14.988, "step": 20 }, { "epoch": 0.7142857142857143, "grad_norm": 81.49203491210938, "learning_rate": 1.9222222222222225e-05, "loss": 2.7135, "step": 30 }, { "epoch": 0.7142857142857143, "eval_loss": 1.7821017503738403, "eval_runtime": 3.0515, "eval_samples_per_second": 838.597, "eval_steps_per_second": 13.108, "step": 30 }, { "epoch": 0.9523809523809523, "grad_norm": 67.98638153076172, "learning_rate": 1.8111111111111112e-05, "loss": 1.6075, "step": 40 }, { "epoch": 0.9523809523809523, "eval_loss": 0.9218789339065552, "eval_runtime": 2.6581, "eval_samples_per_second": 962.702, "eval_steps_per_second": 15.048, "step": 40 }, { "epoch": 1.1904761904761905, "grad_norm": 41.94712448120117, "learning_rate": 1.7e-05, "loss": 0.7666, "step": 50 }, { "epoch": 1.1904761904761905, "eval_loss": 0.5395554304122925, "eval_runtime": 2.7233, "eval_samples_per_second": 939.664, "eval_steps_per_second": 14.688, "step": 50 }, { "epoch": 1.4285714285714286, "grad_norm": 45.76442337036133, "learning_rate": 1.588888888888889e-05, "loss": 0.4077, "step": 60 }, { "epoch": 1.4285714285714286, "eval_loss": 0.395812064409256, "eval_runtime": 2.6568, "eval_samples_per_second": 963.196, "eval_steps_per_second": 15.056, "step": 60 }, { "epoch": 1.6666666666666665, "grad_norm": 36.85358428955078, "learning_rate": 1.477777777777778e-05, "loss": 0.2843, "step": 70 }, { "epoch": 1.6666666666666665, "eval_loss": 0.3538413345813751, "eval_runtime": 2.958, "eval_samples_per_second": 865.099, "eval_steps_per_second": 13.522, "step": 70 }, { "epoch": 1.9047619047619047, "grad_norm": 17.167036056518555, "learning_rate": 1.3666666666666667e-05, "loss": 0.2165, "step": 80 }, { "epoch": 1.9047619047619047, "eval_loss": 0.3396788537502289, "eval_runtime": 3.0374, "eval_samples_per_second": 842.51, "eval_steps_per_second": 13.169, "step": 80 }, { "epoch": 2.142857142857143, "grad_norm": 28.207691192626953, "learning_rate": 1.2555555555555557e-05, "loss": 0.1979, "step": 90 }, { "epoch": 2.142857142857143, "eval_loss": 0.32209891080856323, "eval_runtime": 2.6763, "eval_samples_per_second": 956.167, "eval_steps_per_second": 14.946, "step": 90 }, { "epoch": 2.380952380952381, "grad_norm": 20.979476928710938, "learning_rate": 1.1444444444444444e-05, "loss": 0.1476, "step": 100 }, { "epoch": 2.380952380952381, "eval_loss": 0.30111950635910034, "eval_runtime": 2.7597, "eval_samples_per_second": 927.283, "eval_steps_per_second": 14.494, "step": 100 }, { "epoch": 2.619047619047619, "grad_norm": 14.82142448425293, "learning_rate": 1.0333333333333335e-05, "loss": 0.1061, "step": 110 }, { "epoch": 2.619047619047619, "eval_loss": 0.30764198303222656, "eval_runtime": 2.701, "eval_samples_per_second": 947.422, "eval_steps_per_second": 14.809, "step": 110 }, { "epoch": 2.857142857142857, "grad_norm": 19.38449478149414, "learning_rate": 9.222222222222224e-06, "loss": 0.0907, "step": 120 }, { "epoch": 2.857142857142857, "eval_loss": 0.31123870611190796, "eval_runtime": 2.7275, "eval_samples_per_second": 938.22, "eval_steps_per_second": 14.665, "step": 120 }, { "epoch": 3.0952380952380953, "grad_norm": 15.523360252380371, "learning_rate": 8.111111111111112e-06, "loss": 0.0573, "step": 130 }, { "epoch": 3.0952380952380953, "eval_loss": 0.306456595659256, "eval_runtime": 3.037, "eval_samples_per_second": 842.596, "eval_steps_per_second": 13.171, "step": 130 }, { "epoch": 3.3333333333333335, "grad_norm": 9.912215232849121, "learning_rate": 7e-06, "loss": 0.0638, "step": 140 }, { "epoch": 3.3333333333333335, "eval_loss": 0.3030768036842346, "eval_runtime": 2.784, "eval_samples_per_second": 919.181, "eval_steps_per_second": 14.368, "step": 140 }, { "epoch": 3.571428571428571, "grad_norm": 32.101985931396484, "learning_rate": 5.88888888888889e-06, "loss": 0.0716, "step": 150 }, { "epoch": 3.571428571428571, "eval_loss": 0.2990337312221527, "eval_runtime": 2.8128, "eval_samples_per_second": 909.761, "eval_steps_per_second": 14.221, "step": 150 }, { "epoch": 3.8095238095238093, "grad_norm": 17.976055145263672, "learning_rate": 4.777777777777778e-06, "loss": 0.0841, "step": 160 }, { "epoch": 3.8095238095238093, "eval_loss": 0.3003748655319214, "eval_runtime": 2.7098, "eval_samples_per_second": 944.346, "eval_steps_per_second": 14.761, "step": 160 }, { "epoch": 4.0476190476190474, "grad_norm": 16.045225143432617, "learning_rate": 3.6666666666666666e-06, "loss": 0.096, "step": 170 }, { "epoch": 4.0476190476190474, "eval_loss": 0.3065829873085022, "eval_runtime": 2.7428, "eval_samples_per_second": 933.004, "eval_steps_per_second": 14.584, "step": 170 }, { "epoch": 4.285714285714286, "grad_norm": 24.23316764831543, "learning_rate": 2.5555555555555557e-06, "loss": 0.0728, "step": 180 }, { "epoch": 4.285714285714286, "eval_loss": 0.3128698170185089, "eval_runtime": 3.0252, "eval_samples_per_second": 845.905, "eval_steps_per_second": 13.222, "step": 180 }, { "epoch": 4.523809523809524, "grad_norm": 10.853073120117188, "learning_rate": 1.4444444444444445e-06, "loss": 0.0594, "step": 190 }, { "epoch": 4.523809523809524, "eval_loss": 0.3163248300552368, "eval_runtime": 2.763, "eval_samples_per_second": 926.153, "eval_steps_per_second": 14.477, "step": 190 }, { "epoch": 4.761904761904762, "grad_norm": 18.616168975830078, "learning_rate": 3.3333333333333335e-07, "loss": 0.071, "step": 200 }, { "epoch": 4.761904761904762, "eval_loss": 0.3181891143321991, "eval_runtime": 3.0801, "eval_samples_per_second": 830.819, "eval_steps_per_second": 12.987, "step": 200 } ], "logging_steps": 10, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 2500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }