| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.0, |
| "eval_steps": 500, |
| "global_step": 40, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.3193361535668373, |
| "epoch": 0.051118210862619806, |
| "grad_norm": 0.8119011521339417, |
| "learning_rate": 0.0, |
| "loss": 1.692, |
| "mean_token_accuracy": 0.654717817902565, |
| "num_tokens": 133947.0, |
| "step": 1 |
| }, |
| { |
| "entropy": 1.3207192197442055, |
| "epoch": 0.10223642172523961, |
| "grad_norm": 0.8001739382743835, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 1.6938, |
| "mean_token_accuracy": 0.6540814265608788, |
| "num_tokens": 267949.0, |
| "step": 2 |
| }, |
| { |
| "entropy": 1.3139144703745842, |
| "epoch": 0.15335463258785942, |
| "grad_norm": 0.8021382689476013, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 1.6875, |
| "mean_token_accuracy": 0.6542951986193657, |
| "num_tokens": 402435.0, |
| "step": 3 |
| }, |
| { |
| "entropy": 1.3238477781414986, |
| "epoch": 0.20447284345047922, |
| "grad_norm": 0.8046473264694214, |
| "learning_rate": 6e-06, |
| "loss": 1.6979, |
| "mean_token_accuracy": 0.6523041352629662, |
| "num_tokens": 536339.0, |
| "step": 4 |
| }, |
| { |
| "entropy": 1.3151762038469315, |
| "epoch": 0.25559105431309903, |
| "grad_norm": 0.7999162077903748, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 1.6884, |
| "mean_token_accuracy": 0.6541883014142513, |
| "num_tokens": 670396.0, |
| "step": 5 |
| }, |
| { |
| "entropy": 1.3171968907117844, |
| "epoch": 0.30670926517571884, |
| "grad_norm": 0.8070191740989685, |
| "learning_rate": 1e-05, |
| "loss": 1.6881, |
| "mean_token_accuracy": 0.6536356993019581, |
| "num_tokens": 804575.0, |
| "step": 6 |
| }, |
| { |
| "entropy": 1.3177252262830734, |
| "epoch": 0.35782747603833864, |
| "grad_norm": 0.8115559220314026, |
| "learning_rate": 1.2e-05, |
| "loss": 1.6852, |
| "mean_token_accuracy": 0.6539545804262161, |
| "num_tokens": 938351.0, |
| "step": 7 |
| }, |
| { |
| "entropy": 1.3142458871006966, |
| "epoch": 0.40894568690095845, |
| "grad_norm": 0.809145987033844, |
| "learning_rate": 1.4e-05, |
| "loss": 1.6739, |
| "mean_token_accuracy": 0.6544475704431534, |
| "num_tokens": 1072655.0, |
| "step": 8 |
| }, |
| { |
| "entropy": 1.3176514655351639, |
| "epoch": 0.46006389776357826, |
| "grad_norm": 0.8079097270965576, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 1.6733, |
| "mean_token_accuracy": 0.6541223935782909, |
| "num_tokens": 1206472.0, |
| "step": 9 |
| }, |
| { |
| "entropy": 1.3175865784287453, |
| "epoch": 0.5111821086261981, |
| "grad_norm": 0.8009534478187561, |
| "learning_rate": 1.8e-05, |
| "loss": 1.657, |
| "mean_token_accuracy": 0.655005007982254, |
| "num_tokens": 1340462.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.3152535259723663, |
| "epoch": 0.5623003194888179, |
| "grad_norm": 0.7782304883003235, |
| "learning_rate": 2e-05, |
| "loss": 1.6391, |
| "mean_token_accuracy": 0.6572432741522789, |
| "num_tokens": 1474818.0, |
| "step": 11 |
| }, |
| { |
| "entropy": 1.31825902312994, |
| "epoch": 0.6134185303514377, |
| "grad_norm": 0.7459490299224854, |
| "learning_rate": 1.977777777777778e-05, |
| "loss": 1.6261, |
| "mean_token_accuracy": 0.6601505167782307, |
| "num_tokens": 1608731.0, |
| "step": 12 |
| }, |
| { |
| "entropy": 1.322536252439022, |
| "epoch": 0.6645367412140575, |
| "grad_norm": 0.7278594970703125, |
| "learning_rate": 1.9555555555555557e-05, |
| "loss": 1.6056, |
| "mean_token_accuracy": 0.6621886678040028, |
| "num_tokens": 1742229.0, |
| "step": 13 |
| }, |
| { |
| "entropy": 1.3205925300717354, |
| "epoch": 0.7156549520766773, |
| "grad_norm": 0.6732656359672546, |
| "learning_rate": 1.9333333333333333e-05, |
| "loss": 1.587, |
| "mean_token_accuracy": 0.6625417172908783, |
| "num_tokens": 1875890.0, |
| "step": 14 |
| }, |
| { |
| "entropy": 1.3151337951421738, |
| "epoch": 0.7667731629392971, |
| "grad_norm": 0.6385083794593811, |
| "learning_rate": 1.9111111111111113e-05, |
| "loss": 1.5551, |
| "mean_token_accuracy": 0.6706915572285652, |
| "num_tokens": 2009719.0, |
| "step": 15 |
| }, |
| { |
| "entropy": 1.3125323951244354, |
| "epoch": 0.8178913738019169, |
| "grad_norm": 0.6251479387283325, |
| "learning_rate": 1.888888888888889e-05, |
| "loss": 1.5312, |
| "mean_token_accuracy": 0.6725872829556465, |
| "num_tokens": 2143710.0, |
| "step": 16 |
| }, |
| { |
| "entropy": 1.3094838485121727, |
| "epoch": 0.8690095846645367, |
| "grad_norm": 0.6229560375213623, |
| "learning_rate": 1.866666666666667e-05, |
| "loss": 1.5063, |
| "mean_token_accuracy": 0.6759799160063267, |
| "num_tokens": 2277556.0, |
| "step": 17 |
| }, |
| { |
| "entropy": 1.3096359893679619, |
| "epoch": 0.9201277955271565, |
| "grad_norm": 0.6263618469238281, |
| "learning_rate": 1.8444444444444448e-05, |
| "loss": 1.4846, |
| "mean_token_accuracy": 0.6817599721252918, |
| "num_tokens": 2411448.0, |
| "step": 18 |
| }, |
| { |
| "entropy": 1.2960194796323776, |
| "epoch": 0.9712460063897763, |
| "grad_norm": 0.6250044703483582, |
| "learning_rate": 1.8222222222222224e-05, |
| "loss": 1.4525, |
| "mean_token_accuracy": 0.6899448521435261, |
| "num_tokens": 2545738.0, |
| "step": 19 |
| }, |
| { |
| "entropy": 1.2956058846579657, |
| "epoch": 1.0, |
| "grad_norm": 0.6314756870269775, |
| "learning_rate": 1.8e-05, |
| "loss": 1.4391, |
| "mean_token_accuracy": 0.6940541995896233, |
| "num_tokens": 2616926.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.2914148643612862, |
| "epoch": 1.0511182108626198, |
| "grad_norm": 0.631912887096405, |
| "learning_rate": 1.7777777777777777e-05, |
| "loss": 1.4127, |
| "mean_token_accuracy": 0.6994642727077007, |
| "num_tokens": 2750943.0, |
| "step": 21 |
| }, |
| { |
| "entropy": 1.2763815149664879, |
| "epoch": 1.1022364217252396, |
| "grad_norm": 0.6287756562232971, |
| "learning_rate": 1.7555555555555556e-05, |
| "loss": 1.3806, |
| "mean_token_accuracy": 0.7029485926032066, |
| "num_tokens": 2885149.0, |
| "step": 22 |
| }, |
| { |
| "entropy": 1.2685835510492325, |
| "epoch": 1.1533546325878594, |
| "grad_norm": 0.6242936253547668, |
| "learning_rate": 1.7333333333333336e-05, |
| "loss": 1.3606, |
| "mean_token_accuracy": 0.7043894305825233, |
| "num_tokens": 3019433.0, |
| "step": 23 |
| }, |
| { |
| "entropy": 1.263872005045414, |
| "epoch": 1.2044728434504792, |
| "grad_norm": 0.6192987561225891, |
| "learning_rate": 1.7111111111111112e-05, |
| "loss": 1.337, |
| "mean_token_accuracy": 0.709864255040884, |
| "num_tokens": 3153309.0, |
| "step": 24 |
| }, |
| { |
| "entropy": 1.2505493015050888, |
| "epoch": 1.255591054313099, |
| "grad_norm": 0.6113152503967285, |
| "learning_rate": 1.688888888888889e-05, |
| "loss": 1.3113, |
| "mean_token_accuracy": 0.7119522020220757, |
| "num_tokens": 3287322.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 1.242555320262909, |
| "epoch": 1.3067092651757188, |
| "grad_norm": 0.597195029258728, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 1.2892, |
| "mean_token_accuracy": 0.7140648253262043, |
| "num_tokens": 3421258.0, |
| "step": 26 |
| }, |
| { |
| "entropy": 1.229688823223114, |
| "epoch": 1.3578274760383386, |
| "grad_norm": 0.586483359336853, |
| "learning_rate": 1.6444444444444444e-05, |
| "loss": 1.264, |
| "mean_token_accuracy": 0.7178861573338509, |
| "num_tokens": 3555933.0, |
| "step": 27 |
| }, |
| { |
| "entropy": 1.2219947651028633, |
| "epoch": 1.4089456869009584, |
| "grad_norm": 0.5772027373313904, |
| "learning_rate": 1.6222222222222223e-05, |
| "loss": 1.2416, |
| "mean_token_accuracy": 0.7191276662051678, |
| "num_tokens": 3689685.0, |
| "step": 28 |
| }, |
| { |
| "entropy": 1.2158958613872528, |
| "epoch": 1.4600638977635783, |
| "grad_norm": 0.5700508952140808, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 1.2201, |
| "mean_token_accuracy": 0.7224024310708046, |
| "num_tokens": 3823781.0, |
| "step": 29 |
| }, |
| { |
| "entropy": 1.2152001112699509, |
| "epoch": 1.511182108626198, |
| "grad_norm": 0.5722755789756775, |
| "learning_rate": 1.577777777777778e-05, |
| "loss": 1.2091, |
| "mean_token_accuracy": 0.722760371863842, |
| "num_tokens": 3956942.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 1.1953495219349861, |
| "epoch": 1.5623003194888179, |
| "grad_norm": 0.5692858695983887, |
| "learning_rate": 1.555555555555556e-05, |
| "loss": 1.1714, |
| "mean_token_accuracy": 0.7302578240633011, |
| "num_tokens": 4091313.0, |
| "step": 31 |
| }, |
| { |
| "entropy": 1.1933885142207146, |
| "epoch": 1.6134185303514377, |
| "grad_norm": 0.5751745700836182, |
| "learning_rate": 1.5333333333333334e-05, |
| "loss": 1.1615, |
| "mean_token_accuracy": 0.7318685166537762, |
| "num_tokens": 4225481.0, |
| "step": 32 |
| }, |
| { |
| "entropy": 1.1871799379587173, |
| "epoch": 1.6645367412140575, |
| "grad_norm": 0.5843569040298462, |
| "learning_rate": 1.5111111111111112e-05, |
| "loss": 1.1378, |
| "mean_token_accuracy": 0.7380619496107101, |
| "num_tokens": 4359158.0, |
| "step": 33 |
| }, |
| { |
| "entropy": 1.1805067732930183, |
| "epoch": 1.7156549520766773, |
| "grad_norm": 0.5917448997497559, |
| "learning_rate": 1.488888888888889e-05, |
| "loss": 1.1218, |
| "mean_token_accuracy": 0.7423498816788197, |
| "num_tokens": 4493077.0, |
| "step": 34 |
| }, |
| { |
| "entropy": 1.170977495610714, |
| "epoch": 1.766773162939297, |
| "grad_norm": 0.59839928150177, |
| "learning_rate": 1.4666666666666666e-05, |
| "loss": 1.096, |
| "mean_token_accuracy": 0.7493030689656734, |
| "num_tokens": 4627251.0, |
| "step": 35 |
| }, |
| { |
| "entropy": 1.1678270995616913, |
| "epoch": 1.817891373801917, |
| "grad_norm": 0.6055343151092529, |
| "learning_rate": 1.4444444444444446e-05, |
| "loss": 1.0802, |
| "mean_token_accuracy": 0.7507753595709801, |
| "num_tokens": 4761020.0, |
| "step": 36 |
| }, |
| { |
| "entropy": 1.158110834658146, |
| "epoch": 1.8690095846645367, |
| "grad_norm": 0.6079038381576538, |
| "learning_rate": 1.4222222222222224e-05, |
| "loss": 1.0628, |
| "mean_token_accuracy": 0.755638737231493, |
| "num_tokens": 4895183.0, |
| "step": 37 |
| }, |
| { |
| "entropy": 1.154124453663826, |
| "epoch": 1.9201277955271565, |
| "grad_norm": 0.6134995222091675, |
| "learning_rate": 1.4e-05, |
| "loss": 1.043, |
| "mean_token_accuracy": 0.7614487372338772, |
| "num_tokens": 5028509.0, |
| "step": 38 |
| }, |
| { |
| "entropy": 1.1415963619947433, |
| "epoch": 1.9712460063897763, |
| "grad_norm": 0.608974277973175, |
| "learning_rate": 1.377777777777778e-05, |
| "loss": 1.0203, |
| "mean_token_accuracy": 0.7697952277958393, |
| "num_tokens": 5162590.0, |
| "step": 39 |
| }, |
| { |
| "entropy": 1.1324340105056763, |
| "epoch": 2.0, |
| "grad_norm": 0.6109018325805664, |
| "learning_rate": 1.3555555555555557e-05, |
| "loss": 0.9992, |
| "mean_token_accuracy": 0.7730923626157973, |
| "num_tokens": 5233852.0, |
| "step": 40 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 100, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 10, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 4.566288687449702e+16, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|