| { |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.5037783375314862, |
| "eval_steps": 500, |
| "global_step": 450, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.00559753708368318, |
| "grad_norm": 0.89453125, |
| "learning_rate": 0.0001, |
| "loss": 3.791, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.01119507416736636, |
| "grad_norm": 0.55078125, |
| "learning_rate": 0.0001, |
| "loss": 2.5709, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.016792611251049538, |
| "grad_norm": 0.52734375, |
| "learning_rate": 0.0001, |
| "loss": 2.0527, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.02239014833473272, |
| "grad_norm": 0.265625, |
| "learning_rate": 0.0001, |
| "loss": 1.854, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.027987685418415897, |
| "grad_norm": 0.21484375, |
| "learning_rate": 0.0001, |
| "loss": 1.7825, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.033585222502099076, |
| "grad_norm": 0.2060546875, |
| "learning_rate": 0.0001, |
| "loss": 1.6752, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.039182759585782254, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.0001, |
| "loss": 1.6574, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.04478029666946544, |
| "grad_norm": 0.234375, |
| "learning_rate": 0.0001, |
| "loss": 1.6349, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.05037783375314862, |
| "grad_norm": 0.25390625, |
| "learning_rate": 0.0001, |
| "loss": 1.6088, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.055975370836831795, |
| "grad_norm": 0.2353515625, |
| "learning_rate": 0.0001, |
| "loss": 1.5264, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.06157290792051497, |
| "grad_norm": 0.21484375, |
| "learning_rate": 0.0001, |
| "loss": 1.5305, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.06717044500419815, |
| "grad_norm": 0.19140625, |
| "learning_rate": 0.0001, |
| "loss": 1.5403, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.07276798208788134, |
| "grad_norm": 0.20703125, |
| "learning_rate": 0.0001, |
| "loss": 1.5827, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.07836551917156451, |
| "grad_norm": 0.1943359375, |
| "learning_rate": 0.0001, |
| "loss": 1.5901, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.08396305625524769, |
| "grad_norm": 0.193359375, |
| "learning_rate": 0.0001, |
| "loss": 1.609, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.08956059333893088, |
| "grad_norm": 0.2060546875, |
| "learning_rate": 0.0001, |
| "loss": 1.5113, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.09515813042261405, |
| "grad_norm": 0.23828125, |
| "learning_rate": 0.0001, |
| "loss": 1.5215, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.10075566750629723, |
| "grad_norm": 0.287109375, |
| "learning_rate": 0.0001, |
| "loss": 1.5761, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.1063532045899804, |
| "grad_norm": 0.2216796875, |
| "learning_rate": 0.0001, |
| "loss": 1.5121, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.11195074167366359, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.0001, |
| "loss": 1.4994, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.11754827875734676, |
| "grad_norm": 0.20703125, |
| "learning_rate": 0.0001, |
| "loss": 1.5273, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.12314581584102995, |
| "grad_norm": 0.193359375, |
| "learning_rate": 0.0001, |
| "loss": 1.5585, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.12874335292471312, |
| "grad_norm": 0.177734375, |
| "learning_rate": 0.0001, |
| "loss": 1.5368, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.1343408900083963, |
| "grad_norm": 0.189453125, |
| "learning_rate": 0.0001, |
| "loss": 1.4332, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.1399384270920795, |
| "grad_norm": 0.2138671875, |
| "learning_rate": 0.0001, |
| "loss": 1.4477, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.14553596417576267, |
| "grad_norm": 0.2060546875, |
| "learning_rate": 0.0001, |
| "loss": 1.4687, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.15113350125944586, |
| "grad_norm": 0.19921875, |
| "learning_rate": 0.0001, |
| "loss": 1.5834, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.15673103834312901, |
| "grad_norm": 0.2099609375, |
| "learning_rate": 0.0001, |
| "loss": 1.4358, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.1623285754268122, |
| "grad_norm": 0.201171875, |
| "learning_rate": 0.0001, |
| "loss": 1.5451, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.16792611251049538, |
| "grad_norm": 0.328125, |
| "learning_rate": 0.0001, |
| "loss": 1.3199, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.17352364959417857, |
| "grad_norm": 0.40625, |
| "learning_rate": 0.0001, |
| "loss": 1.5433, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.17912118667786175, |
| "grad_norm": 0.27734375, |
| "learning_rate": 0.0001, |
| "loss": 1.524, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.1847187237615449, |
| "grad_norm": 0.1904296875, |
| "learning_rate": 0.0001, |
| "loss": 1.5498, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.1903162608452281, |
| "grad_norm": 0.2265625, |
| "learning_rate": 0.0001, |
| "loss": 1.4975, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.19591379792891128, |
| "grad_norm": 0.1982421875, |
| "learning_rate": 0.0001, |
| "loss": 1.5066, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.20151133501259447, |
| "grad_norm": 0.1875, |
| "learning_rate": 0.0001, |
| "loss": 1.4614, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.20710887209627762, |
| "grad_norm": 0.1943359375, |
| "learning_rate": 0.0001, |
| "loss": 1.5087, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.2127064091799608, |
| "grad_norm": 0.251953125, |
| "learning_rate": 0.0001, |
| "loss": 1.4268, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.218303946263644, |
| "grad_norm": 0.205078125, |
| "learning_rate": 0.0001, |
| "loss": 1.4212, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.22390148334732718, |
| "grad_norm": 0.2734375, |
| "learning_rate": 0.0001, |
| "loss": 1.3205, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.22949902043101036, |
| "grad_norm": 0.19921875, |
| "learning_rate": 0.0001, |
| "loss": 1.4604, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.23509655751469352, |
| "grad_norm": 0.248046875, |
| "learning_rate": 0.0001, |
| "loss": 1.5345, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.2406940945983767, |
| "grad_norm": 0.2294921875, |
| "learning_rate": 0.0001, |
| "loss": 1.4651, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.2462916316820599, |
| "grad_norm": 0.26171875, |
| "learning_rate": 0.0001, |
| "loss": 1.5094, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.2518891687657431, |
| "grad_norm": 0.3203125, |
| "learning_rate": 0.0001, |
| "loss": 1.508, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.25748670584942623, |
| "grad_norm": 0.201171875, |
| "learning_rate": 0.0001, |
| "loss": 1.5213, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.26308424293310945, |
| "grad_norm": 0.2021484375, |
| "learning_rate": 0.0001, |
| "loss": 1.5179, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.2686817800167926, |
| "grad_norm": 0.19921875, |
| "learning_rate": 0.0001, |
| "loss": 1.5109, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.2742793171004758, |
| "grad_norm": 0.203125, |
| "learning_rate": 0.0001, |
| "loss": 1.4169, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.279876854184159, |
| "grad_norm": 0.255859375, |
| "learning_rate": 0.0001, |
| "loss": 1.3441, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.28547439126784213, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.0001, |
| "loss": 1.4849, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.29107192835152534, |
| "grad_norm": 0.2099609375, |
| "learning_rate": 0.0001, |
| "loss": 1.5213, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.2966694654352085, |
| "grad_norm": 0.212890625, |
| "learning_rate": 0.0001, |
| "loss": 1.5736, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.3022670025188917, |
| "grad_norm": 0.208984375, |
| "learning_rate": 0.0001, |
| "loss": 1.4562, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.30786453960257487, |
| "grad_norm": 0.1953125, |
| "learning_rate": 0.0001, |
| "loss": 1.57, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.31346207668625803, |
| "grad_norm": 0.20703125, |
| "learning_rate": 0.0001, |
| "loss": 1.4832, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.31905961376994124, |
| "grad_norm": 0.197265625, |
| "learning_rate": 0.0001, |
| "loss": 1.4355, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.3246571508536244, |
| "grad_norm": 0.2177734375, |
| "learning_rate": 0.0001, |
| "loss": 1.5262, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.3302546879373076, |
| "grad_norm": 0.244140625, |
| "learning_rate": 0.0001, |
| "loss": 1.4061, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.33585222502099077, |
| "grad_norm": 0.427734375, |
| "learning_rate": 0.0001, |
| "loss": 1.311, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.3414497621046739, |
| "grad_norm": 0.333984375, |
| "learning_rate": 0.0001, |
| "loss": 1.4673, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.34704729918835714, |
| "grad_norm": 0.22265625, |
| "learning_rate": 0.0001, |
| "loss": 1.5237, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.3526448362720403, |
| "grad_norm": 0.2314453125, |
| "learning_rate": 0.0001, |
| "loss": 1.4541, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.3582423733557235, |
| "grad_norm": 0.197265625, |
| "learning_rate": 0.0001, |
| "loss": 1.4201, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.36383991043940667, |
| "grad_norm": 0.2119140625, |
| "learning_rate": 0.0001, |
| "loss": 1.4503, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.3694374475230898, |
| "grad_norm": 0.1767578125, |
| "learning_rate": 0.0001, |
| "loss": 1.4432, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.37503498460677304, |
| "grad_norm": 0.2177734375, |
| "learning_rate": 0.0001, |
| "loss": 1.4763, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.3806325216904562, |
| "grad_norm": 0.23828125, |
| "learning_rate": 0.0001, |
| "loss": 1.4658, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.38623005877413935, |
| "grad_norm": 0.19921875, |
| "learning_rate": 0.0001, |
| "loss": 1.3971, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.39182759585782256, |
| "grad_norm": 0.263671875, |
| "learning_rate": 0.0001, |
| "loss": 1.3114, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.3974251329415057, |
| "grad_norm": 0.2080078125, |
| "learning_rate": 0.0001, |
| "loss": 1.3536, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.40302267002518893, |
| "grad_norm": 0.212890625, |
| "learning_rate": 0.0001, |
| "loss": 1.5357, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.4086202071088721, |
| "grad_norm": 0.2890625, |
| "learning_rate": 0.0001, |
| "loss": 1.4427, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.41421774419255525, |
| "grad_norm": 0.2119140625, |
| "learning_rate": 0.0001, |
| "loss": 1.527, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.41981528127623846, |
| "grad_norm": 0.212890625, |
| "learning_rate": 0.0001, |
| "loss": 1.4619, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.4254128183599216, |
| "grad_norm": 0.1767578125, |
| "learning_rate": 0.0001, |
| "loss": 1.3959, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.43101035544360483, |
| "grad_norm": 0.1884765625, |
| "learning_rate": 0.0001, |
| "loss": 1.388, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.436607892527288, |
| "grad_norm": 0.1826171875, |
| "learning_rate": 0.0001, |
| "loss": 1.436, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.44220542961097115, |
| "grad_norm": 0.2041015625, |
| "learning_rate": 0.0001, |
| "loss": 1.366, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.44780296669465436, |
| "grad_norm": 0.296875, |
| "learning_rate": 0.0001, |
| "loss": 1.2186, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.4534005037783375, |
| "grad_norm": 0.23046875, |
| "learning_rate": 0.0001, |
| "loss": 1.47, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.45899804086202073, |
| "grad_norm": 0.294921875, |
| "learning_rate": 0.0001, |
| "loss": 1.4639, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.4645955779457039, |
| "grad_norm": 0.2236328125, |
| "learning_rate": 0.0001, |
| "loss": 1.4478, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.47019311502938704, |
| "grad_norm": 0.2001953125, |
| "learning_rate": 0.0001, |
| "loss": 1.4302, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.47579065211307026, |
| "grad_norm": 0.1943359375, |
| "learning_rate": 0.0001, |
| "loss": 1.4157, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.4813881891967534, |
| "grad_norm": 0.2451171875, |
| "learning_rate": 0.0001, |
| "loss": 1.5067, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.4869857262804366, |
| "grad_norm": 0.2216796875, |
| "learning_rate": 0.0001, |
| "loss": 1.5442, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.4925832633641198, |
| "grad_norm": 0.25, |
| "learning_rate": 0.0001, |
| "loss": 1.3914, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.49818080044780294, |
| "grad_norm": 0.2177734375, |
| "learning_rate": 0.0001, |
| "loss": 1.3557, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.5037783375314862, |
| "grad_norm": 0.2890625, |
| "learning_rate": 0.0001, |
| "loss": 1.306, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.5037783375314862, |
| "step": 450, |
| "total_flos": 4.334613097500672e+17, |
| "train_loss": 1.5296179887983534, |
| "train_runtime": 15251.7008, |
| "train_samples_per_second": 1.888, |
| "train_steps_per_second": 0.03 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 450, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 90, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.334613097500672e+17, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|