{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.3529411764705883, "eval_steps": 10, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11764705882352941, "grad_norm": 99.34697723388672, "learning_rate": 7e-06, "loss": 4.6344, "step": 10 }, { "epoch": 0.11764705882352941, "eval_loss": 4.356701850891113, "eval_runtime": 7.4823, "eval_samples_per_second": 727.722, "eval_steps_per_second": 11.494, "step": 10 }, { "epoch": 0.23529411764705882, "grad_norm": 68.6648941040039, "learning_rate": 1.7e-05, "loss": 4.1147, "step": 20 }, { "epoch": 0.23529411764705882, "eval_loss": 3.802903652191162, "eval_runtime": 7.4967, "eval_samples_per_second": 726.322, "eval_steps_per_second": 11.472, "step": 20 }, { "epoch": 0.35294117647058826, "grad_norm": 73.34857940673828, "learning_rate": 1.9222222222222225e-05, "loss": 3.3201, "step": 30 }, { "epoch": 0.35294117647058826, "eval_loss": 2.9514710903167725, "eval_runtime": 7.5247, "eval_samples_per_second": 723.619, "eval_steps_per_second": 11.429, "step": 30 }, { "epoch": 0.47058823529411764, "grad_norm": 61.95991134643555, "learning_rate": 1.8111111111111112e-05, "loss": 2.1842, "step": 40 }, { "epoch": 0.47058823529411764, "eval_loss": 2.131948947906494, "eval_runtime": 7.7586, "eval_samples_per_second": 701.798, "eval_steps_per_second": 11.084, "step": 40 }, { "epoch": 0.5882352941176471, "grad_norm": 48.887203216552734, "learning_rate": 1.7e-05, "loss": 1.2624, "step": 50 }, { "epoch": 0.5882352941176471, "eval_loss": 1.3057191371917725, "eval_runtime": 7.9165, "eval_samples_per_second": 687.801, "eval_steps_per_second": 10.863, "step": 50 }, { "epoch": 0.7058823529411765, "grad_norm": 35.66941833496094, "learning_rate": 1.588888888888889e-05, "loss": 0.6907, "step": 60 }, { "epoch": 0.7058823529411765, "eval_loss": 0.9652644991874695, "eval_runtime": 7.9447, "eval_samples_per_second": 685.363, "eval_steps_per_second": 10.825, "step": 60 }, { "epoch": 0.8235294117647058, "grad_norm": 29.34256935119629, "learning_rate": 1.477777777777778e-05, "loss": 0.3527, "step": 70 }, { "epoch": 0.8235294117647058, "eval_loss": 0.8976885080337524, "eval_runtime": 8.0315, "eval_samples_per_second": 677.953, "eval_steps_per_second": 10.708, "step": 70 }, { "epoch": 0.9411764705882353, "grad_norm": 32.28174591064453, "learning_rate": 1.3666666666666667e-05, "loss": 0.2828, "step": 80 }, { "epoch": 0.9411764705882353, "eval_loss": 0.8685462474822998, "eval_runtime": 8.14, "eval_samples_per_second": 668.919, "eval_steps_per_second": 10.565, "step": 80 }, { "epoch": 1.0588235294117647, "grad_norm": 38.87677764892578, "learning_rate": 1.2555555555555557e-05, "loss": 0.2758, "step": 90 }, { "epoch": 1.0588235294117647, "eval_loss": 0.8556678295135498, "eval_runtime": 8.2601, "eval_samples_per_second": 659.194, "eval_steps_per_second": 10.412, "step": 90 }, { "epoch": 1.1764705882352942, "grad_norm": 33.91325378417969, "learning_rate": 1.1444444444444444e-05, "loss": 0.2576, "step": 100 }, { "epoch": 1.1764705882352942, "eval_loss": 0.8554092049598694, "eval_runtime": 8.4912, "eval_samples_per_second": 641.252, "eval_steps_per_second": 10.128, "step": 100 }, { "epoch": 1.2941176470588236, "grad_norm": 27.338443756103516, "learning_rate": 1.0333333333333335e-05, "loss": 0.2877, "step": 110 }, { "epoch": 1.2941176470588236, "eval_loss": 0.8585284948348999, "eval_runtime": 8.7035, "eval_samples_per_second": 625.611, "eval_steps_per_second": 9.881, "step": 110 }, { "epoch": 1.4117647058823528, "grad_norm": 33.70604705810547, "learning_rate": 9.222222222222224e-06, "loss": 0.2109, "step": 120 }, { "epoch": 1.4117647058823528, "eval_loss": 0.8407207727432251, "eval_runtime": 8.9789, "eval_samples_per_second": 606.424, "eval_steps_per_second": 9.578, "step": 120 }, { "epoch": 1.5294117647058822, "grad_norm": 38.09708786010742, "learning_rate": 8.111111111111112e-06, "loss": 0.206, "step": 130 }, { "epoch": 1.5294117647058822, "eval_loss": 0.8386306166648865, "eval_runtime": 9.2342, "eval_samples_per_second": 589.658, "eval_steps_per_second": 9.313, "step": 130 }, { "epoch": 1.6470588235294117, "grad_norm": 47.18006896972656, "learning_rate": 7e-06, "loss": 0.2828, "step": 140 }, { "epoch": 1.6470588235294117, "eval_loss": 0.8333184719085693, "eval_runtime": 9.1122, "eval_samples_per_second": 597.55, "eval_steps_per_second": 9.438, "step": 140 }, { "epoch": 1.7647058823529411, "grad_norm": 24.672765731811523, "learning_rate": 5.88888888888889e-06, "loss": 0.2045, "step": 150 }, { "epoch": 1.7647058823529411, "eval_loss": 0.8294004201889038, "eval_runtime": 8.9028, "eval_samples_per_second": 611.608, "eval_steps_per_second": 9.66, "step": 150 }, { "epoch": 1.8823529411764706, "grad_norm": 21.150001525878906, "learning_rate": 4.777777777777778e-06, "loss": 0.1932, "step": 160 }, { "epoch": 1.8823529411764706, "eval_loss": 0.828009843826294, "eval_runtime": 8.7354, "eval_samples_per_second": 623.324, "eval_steps_per_second": 9.845, "step": 160 }, { "epoch": 2.0, "grad_norm": 55.02903366088867, "learning_rate": 3.6666666666666666e-06, "loss": 0.1763, "step": 170 }, { "epoch": 2.0, "eval_loss": 0.824001133441925, "eval_runtime": 8.6395, "eval_samples_per_second": 630.244, "eval_steps_per_second": 9.954, "step": 170 }, { "epoch": 2.1176470588235294, "grad_norm": 15.967198371887207, "learning_rate": 2.5555555555555557e-06, "loss": 0.1603, "step": 180 }, { "epoch": 2.1176470588235294, "eval_loss": 0.8176314830780029, "eval_runtime": 8.7379, "eval_samples_per_second": 623.15, "eval_steps_per_second": 9.842, "step": 180 }, { "epoch": 2.235294117647059, "grad_norm": 9.89770221710205, "learning_rate": 1.4444444444444445e-06, "loss": 0.1445, "step": 190 }, { "epoch": 2.235294117647059, "eval_loss": 0.8145530223846436, "eval_runtime": 8.8692, "eval_samples_per_second": 613.923, "eval_steps_per_second": 9.696, "step": 190 }, { "epoch": 2.3529411764705883, "grad_norm": 31.038183212280273, "learning_rate": 3.3333333333333335e-07, "loss": 0.1534, "step": 200 }, { "epoch": 2.3529411764705883, "eval_loss": 0.8133436441421509, "eval_runtime": 9.0044, "eval_samples_per_second": 604.702, "eval_steps_per_second": 9.551, "step": 200 } ], "logging_steps": 10, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }