{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.3529411764705883, "eval_steps": 10, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11764705882352941, "grad_norm": 169.93887329101562, "learning_rate": 6e-06, "loss": 4.8387, "step": 10 }, { "epoch": 0.11764705882352941, "eval_loss": 4.6919026374816895, "eval_runtime": 15.5369, "eval_samples_per_second": 350.456, "eval_steps_per_second": 5.535, "step": 10 }, { "epoch": 0.23529411764705882, "grad_norm": 103.59686279296875, "learning_rate": 1.6000000000000003e-05, "loss": 4.3444, "step": 20 }, { "epoch": 0.23529411764705882, "eval_loss": 3.9225239753723145, "eval_runtime": 16.2767, "eval_samples_per_second": 334.526, "eval_steps_per_second": 5.284, "step": 20 }, { "epoch": 0.35294117647058826, "grad_norm": 96.45588684082031, "learning_rate": 1.9333333333333333e-05, "loss": 3.5537, "step": 30 }, { "epoch": 0.35294117647058826, "eval_loss": 3.1923811435699463, "eval_runtime": 17.7651, "eval_samples_per_second": 306.5, "eval_steps_per_second": 4.841, "step": 30 }, { "epoch": 0.47058823529411764, "grad_norm": 116.93628692626953, "learning_rate": 1.8222222222222224e-05, "loss": 2.4677, "step": 40 }, { "epoch": 0.47058823529411764, "eval_loss": 2.3630847930908203, "eval_runtime": 19.3532, "eval_samples_per_second": 281.349, "eval_steps_per_second": 4.444, "step": 40 }, { "epoch": 0.5882352941176471, "grad_norm": 123.0512924194336, "learning_rate": 1.7111111111111112e-05, "loss": 1.5596, "step": 50 }, { "epoch": 0.5882352941176471, "eval_loss": 1.779540777206421, "eval_runtime": 17.9711, "eval_samples_per_second": 302.987, "eval_steps_per_second": 4.785, "step": 50 }, { "epoch": 0.7058823529411765, "grad_norm": 116.81210327148438, "learning_rate": 1.6000000000000003e-05, "loss": 0.9314, "step": 60 }, { "epoch": 0.7058823529411765, "eval_loss": 1.3096877336502075, "eval_runtime": 17.7299, "eval_samples_per_second": 307.109, "eval_steps_per_second": 4.851, "step": 60 }, { "epoch": 0.8235294117647058, "grad_norm": 78.21733093261719, "learning_rate": 1.488888888888889e-05, "loss": 0.4562, "step": 70 }, { "epoch": 0.8235294117647058, "eval_loss": 1.0128556489944458, "eval_runtime": 18.4122, "eval_samples_per_second": 295.729, "eval_steps_per_second": 4.671, "step": 70 }, { "epoch": 0.9411764705882353, "grad_norm": 65.50968933105469, "learning_rate": 1.377777777777778e-05, "loss": 0.3587, "step": 80 }, { "epoch": 0.9411764705882353, "eval_loss": 0.9688291549682617, "eval_runtime": 18.5333, "eval_samples_per_second": 293.795, "eval_steps_per_second": 4.64, "step": 80 }, { "epoch": 1.0588235294117647, "grad_norm": 61.12400817871094, "learning_rate": 1.2666666666666667e-05, "loss": 0.3667, "step": 90 }, { "epoch": 1.0588235294117647, "eval_loss": 0.9096461534500122, "eval_runtime": 18.1026, "eval_samples_per_second": 300.785, "eval_steps_per_second": 4.751, "step": 90 }, { "epoch": 1.1764705882352942, "grad_norm": 50.41886901855469, "learning_rate": 1.1555555555555556e-05, "loss": 0.3104, "step": 100 }, { "epoch": 1.1764705882352942, "eval_loss": 0.9075976610183716, "eval_runtime": 18.2417, "eval_samples_per_second": 298.492, "eval_steps_per_second": 4.714, "step": 100 }, { "epoch": 1.2941176470588236, "grad_norm": 49.723411560058594, "learning_rate": 1.0444444444444445e-05, "loss": 0.3057, "step": 110 }, { "epoch": 1.2941176470588236, "eval_loss": 0.8963654637336731, "eval_runtime": 18.3711, "eval_samples_per_second": 296.389, "eval_steps_per_second": 4.681, "step": 110 }, { "epoch": 1.4117647058823528, "grad_norm": 66.96435546875, "learning_rate": 9.333333333333334e-06, "loss": 0.2852, "step": 120 }, { "epoch": 1.4117647058823528, "eval_loss": 0.8938003778457642, "eval_runtime": 18.4942, "eval_samples_per_second": 294.417, "eval_steps_per_second": 4.65, "step": 120 }, { "epoch": 1.5294117647058822, "grad_norm": 67.33085632324219, "learning_rate": 8.222222222222222e-06, "loss": 0.2527, "step": 130 }, { "epoch": 1.5294117647058822, "eval_loss": 0.9134606122970581, "eval_runtime": 18.4294, "eval_samples_per_second": 295.452, "eval_steps_per_second": 4.666, "step": 130 }, { "epoch": 1.6470588235294117, "grad_norm": 60.442684173583984, "learning_rate": 7.111111111111112e-06, "loss": 0.3592, "step": 140 }, { "epoch": 1.6470588235294117, "eval_loss": 0.8834967017173767, "eval_runtime": 18.3629, "eval_samples_per_second": 296.522, "eval_steps_per_second": 4.683, "step": 140 }, { "epoch": 1.7647058823529411, "grad_norm": 59.17316818237305, "learning_rate": 6e-06, "loss": 0.1998, "step": 150 }, { "epoch": 1.7647058823529411, "eval_loss": 0.862636923789978, "eval_runtime": 18.2959, "eval_samples_per_second": 297.608, "eval_steps_per_second": 4.701, "step": 150 }, { "epoch": 1.8823529411764706, "grad_norm": 33.52590560913086, "learning_rate": 4.888888888888889e-06, "loss": 0.2258, "step": 160 }, { "epoch": 1.8823529411764706, "eval_loss": 0.8706350326538086, "eval_runtime": 18.3223, "eval_samples_per_second": 297.179, "eval_steps_per_second": 4.694, "step": 160 }, { "epoch": 2.0, "grad_norm": 48.038719177246094, "learning_rate": 3.777777777777778e-06, "loss": 0.1933, "step": 170 }, { "epoch": 2.0, "eval_loss": 0.875869870185852, "eval_runtime": 18.2474, "eval_samples_per_second": 298.399, "eval_steps_per_second": 4.713, "step": 170 }, { "epoch": 2.1176470588235294, "grad_norm": 42.39509201049805, "learning_rate": 2.666666666666667e-06, "loss": 0.1987, "step": 180 }, { "epoch": 2.1176470588235294, "eval_loss": 0.875755250453949, "eval_runtime": 18.2272, "eval_samples_per_second": 298.729, "eval_steps_per_second": 4.718, "step": 180 }, { "epoch": 2.235294117647059, "grad_norm": 24.651744842529297, "learning_rate": 1.5555555555555558e-06, "loss": 0.1684, "step": 190 }, { "epoch": 2.235294117647059, "eval_loss": 0.8625762462615967, "eval_runtime": 18.1576, "eval_samples_per_second": 299.874, "eval_steps_per_second": 4.736, "step": 190 }, { "epoch": 2.3529411764705883, "grad_norm": 27.64773941040039, "learning_rate": 4.444444444444445e-07, "loss": 0.1612, "step": 200 }, { "epoch": 2.3529411764705883, "eval_loss": 0.8577004075050354, "eval_runtime": 18.1098, "eval_samples_per_second": 300.666, "eval_steps_per_second": 4.749, "step": 200 } ], "logging_steps": 10, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }