{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.3529411764705883, "eval_steps": 10, "global_step": 200, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.11764705882352941, "grad_norm": 123.9454574584961, "learning_rate": 7e-06, "loss": 4.7042, "step": 10 }, { "epoch": 0.11764705882352941, "eval_loss": 4.442946434020996, "eval_runtime": 9.6408, "eval_samples_per_second": 564.786, "eval_steps_per_second": 8.92, "step": 10 }, { "epoch": 0.23529411764705882, "grad_norm": 92.6566390991211, "learning_rate": 1.7e-05, "loss": 4.168, "step": 20 }, { "epoch": 0.23529411764705882, "eval_loss": 3.8164174556732178, "eval_runtime": 9.6135, "eval_samples_per_second": 566.388, "eval_steps_per_second": 8.946, "step": 20 }, { "epoch": 0.35294117647058826, "grad_norm": 79.20145416259766, "learning_rate": 1.9222222222222225e-05, "loss": 3.4261, "step": 30 }, { "epoch": 0.35294117647058826, "eval_loss": 3.0521063804626465, "eval_runtime": 9.6635, "eval_samples_per_second": 563.46, "eval_steps_per_second": 8.899, "step": 30 }, { "epoch": 0.47058823529411764, "grad_norm": 74.91513061523438, "learning_rate": 1.8111111111111112e-05, "loss": 2.4283, "step": 40 }, { "epoch": 0.47058823529411764, "eval_loss": 2.2814009189605713, "eval_runtime": 9.6413, "eval_samples_per_second": 564.758, "eval_steps_per_second": 8.92, "step": 40 }, { "epoch": 0.5882352941176471, "grad_norm": 54.24647903442383, "learning_rate": 1.7e-05, "loss": 1.5579, "step": 50 }, { "epoch": 0.5882352941176471, "eval_loss": 1.4840773344039917, "eval_runtime": 9.7205, "eval_samples_per_second": 560.159, "eval_steps_per_second": 8.847, "step": 50 }, { "epoch": 0.7058823529411765, "grad_norm": 41.453800201416016, "learning_rate": 1.588888888888889e-05, "loss": 0.8196, "step": 60 }, { "epoch": 0.7058823529411765, "eval_loss": 1.028959035873413, "eval_runtime": 9.7872, "eval_samples_per_second": 556.341, "eval_steps_per_second": 8.787, "step": 60 }, { "epoch": 0.8235294117647058, "grad_norm": 31.6336612701416, "learning_rate": 1.477777777777778e-05, "loss": 0.4275, "step": 70 }, { "epoch": 0.8235294117647058, "eval_loss": 0.9401000142097473, "eval_runtime": 9.8905, "eval_samples_per_second": 550.53, "eval_steps_per_second": 8.695, "step": 70 }, { "epoch": 0.9411764705882353, "grad_norm": 31.739768981933594, "learning_rate": 1.3666666666666667e-05, "loss": 0.3215, "step": 80 }, { "epoch": 0.9411764705882353, "eval_loss": 0.876151978969574, "eval_runtime": 9.9605, "eval_samples_per_second": 546.659, "eval_steps_per_second": 8.634, "step": 80 }, { "epoch": 1.0588235294117647, "grad_norm": 32.853572845458984, "learning_rate": 1.2555555555555557e-05, "loss": 0.3068, "step": 90 }, { "epoch": 1.0588235294117647, "eval_loss": 0.872206449508667, "eval_runtime": 10.0229, "eval_samples_per_second": 543.258, "eval_steps_per_second": 8.58, "step": 90 }, { "epoch": 1.1764705882352942, "grad_norm": 41.155303955078125, "learning_rate": 1.1444444444444444e-05, "loss": 0.2794, "step": 100 }, { "epoch": 1.1764705882352942, "eval_loss": 0.8592934608459473, "eval_runtime": 10.1063, "eval_samples_per_second": 538.774, "eval_steps_per_second": 8.51, "step": 100 }, { "epoch": 1.2941176470588236, "grad_norm": 40.7620735168457, "learning_rate": 1.0333333333333335e-05, "loss": 0.2882, "step": 110 }, { "epoch": 1.2941176470588236, "eval_loss": 0.8575130701065063, "eval_runtime": 10.1379, "eval_samples_per_second": 537.092, "eval_steps_per_second": 8.483, "step": 110 }, { "epoch": 1.4117647058823528, "grad_norm": 36.15983963012695, "learning_rate": 9.222222222222224e-06, "loss": 0.2279, "step": 120 }, { "epoch": 1.4117647058823528, "eval_loss": 0.834524929523468, "eval_runtime": 10.3032, "eval_samples_per_second": 528.475, "eval_steps_per_second": 8.347, "step": 120 }, { "epoch": 1.5294117647058822, "grad_norm": 39.788455963134766, "learning_rate": 8.111111111111112e-06, "loss": 0.2172, "step": 130 }, { "epoch": 1.5294117647058822, "eval_loss": 0.8397492170333862, "eval_runtime": 10.2209, "eval_samples_per_second": 532.731, "eval_steps_per_second": 8.414, "step": 130 }, { "epoch": 1.6470588235294117, "grad_norm": 39.91408157348633, "learning_rate": 7e-06, "loss": 0.3203, "step": 140 }, { "epoch": 1.6470588235294117, "eval_loss": 0.8325490355491638, "eval_runtime": 10.216, "eval_samples_per_second": 532.989, "eval_steps_per_second": 8.418, "step": 140 }, { "epoch": 1.7647058823529411, "grad_norm": 32.257728576660156, "learning_rate": 5.88888888888889e-06, "loss": 0.2022, "step": 150 }, { "epoch": 1.7647058823529411, "eval_loss": 0.8284817337989807, "eval_runtime": 10.1662, "eval_samples_per_second": 535.596, "eval_steps_per_second": 8.459, "step": 150 }, { "epoch": 1.8823529411764706, "grad_norm": 19.03813934326172, "learning_rate": 4.777777777777778e-06, "loss": 0.2078, "step": 160 }, { "epoch": 1.8823529411764706, "eval_loss": 0.8244841694831848, "eval_runtime": 10.1785, "eval_samples_per_second": 534.951, "eval_steps_per_second": 8.449, "step": 160 }, { "epoch": 2.0, "grad_norm": 28.51860237121582, "learning_rate": 3.6666666666666666e-06, "loss": 0.1584, "step": 170 }, { "epoch": 2.0, "eval_loss": 0.8192431926727295, "eval_runtime": 10.15, "eval_samples_per_second": 536.454, "eval_steps_per_second": 8.473, "step": 170 }, { "epoch": 2.1176470588235294, "grad_norm": 21.15154266357422, "learning_rate": 2.5555555555555557e-06, "loss": 0.1569, "step": 180 }, { "epoch": 2.1176470588235294, "eval_loss": 0.8104143142700195, "eval_runtime": 10.1376, "eval_samples_per_second": 537.111, "eval_steps_per_second": 8.483, "step": 180 }, { "epoch": 2.235294117647059, "grad_norm": 27.973337173461914, "learning_rate": 1.4444444444444445e-06, "loss": 0.1517, "step": 190 }, { "epoch": 2.235294117647059, "eval_loss": 0.8075307607650757, "eval_runtime": 10.2062, "eval_samples_per_second": 533.499, "eval_steps_per_second": 8.426, "step": 190 }, { "epoch": 2.3529411764705883, "grad_norm": 42.8331298828125, "learning_rate": 3.3333333333333335e-07, "loss": 0.1646, "step": 200 }, { "epoch": 2.3529411764705883, "eval_loss": 0.807045042514801, "eval_runtime": 10.3431, "eval_samples_per_second": 526.439, "eval_steps_per_second": 8.315, "step": 200 } ], "logging_steps": 10, "max_steps": 200, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 2500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 64, "trial_name": null, "trial_params": null }