| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 4.0, |
| "eval_steps": 500, |
| "global_step": 80, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "entropy": 1.3193361535668373, |
| "epoch": 0.051118210862619806, |
| "grad_norm": 0.8119011521339417, |
| "learning_rate": 0.0, |
| "loss": 1.692, |
| "mean_token_accuracy": 0.654717817902565, |
| "num_tokens": 133947.0, |
| "step": 1 |
| }, |
| { |
| "entropy": 1.3207192197442055, |
| "epoch": 0.10223642172523961, |
| "grad_norm": 0.8001739382743835, |
| "learning_rate": 2.0000000000000003e-06, |
| "loss": 1.6938, |
| "mean_token_accuracy": 0.6540814265608788, |
| "num_tokens": 267949.0, |
| "step": 2 |
| }, |
| { |
| "entropy": 1.3139144703745842, |
| "epoch": 0.15335463258785942, |
| "grad_norm": 0.8021382689476013, |
| "learning_rate": 4.000000000000001e-06, |
| "loss": 1.6875, |
| "mean_token_accuracy": 0.6542951986193657, |
| "num_tokens": 402435.0, |
| "step": 3 |
| }, |
| { |
| "entropy": 1.3238477781414986, |
| "epoch": 0.20447284345047922, |
| "grad_norm": 0.8046473264694214, |
| "learning_rate": 6e-06, |
| "loss": 1.6979, |
| "mean_token_accuracy": 0.6523041352629662, |
| "num_tokens": 536339.0, |
| "step": 4 |
| }, |
| { |
| "entropy": 1.3151762038469315, |
| "epoch": 0.25559105431309903, |
| "grad_norm": 0.7999162077903748, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 1.6884, |
| "mean_token_accuracy": 0.6541883014142513, |
| "num_tokens": 670396.0, |
| "step": 5 |
| }, |
| { |
| "entropy": 1.3171968907117844, |
| "epoch": 0.30670926517571884, |
| "grad_norm": 0.8070191740989685, |
| "learning_rate": 1e-05, |
| "loss": 1.6881, |
| "mean_token_accuracy": 0.6536356993019581, |
| "num_tokens": 804575.0, |
| "step": 6 |
| }, |
| { |
| "entropy": 1.3177252262830734, |
| "epoch": 0.35782747603833864, |
| "grad_norm": 0.8115559220314026, |
| "learning_rate": 1.2e-05, |
| "loss": 1.6852, |
| "mean_token_accuracy": 0.6539545804262161, |
| "num_tokens": 938351.0, |
| "step": 7 |
| }, |
| { |
| "entropy": 1.3142458871006966, |
| "epoch": 0.40894568690095845, |
| "grad_norm": 0.809145987033844, |
| "learning_rate": 1.4e-05, |
| "loss": 1.6739, |
| "mean_token_accuracy": 0.6544475704431534, |
| "num_tokens": 1072655.0, |
| "step": 8 |
| }, |
| { |
| "entropy": 1.3176514655351639, |
| "epoch": 0.46006389776357826, |
| "grad_norm": 0.8079097270965576, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 1.6733, |
| "mean_token_accuracy": 0.6541223935782909, |
| "num_tokens": 1206472.0, |
| "step": 9 |
| }, |
| { |
| "entropy": 1.3175865784287453, |
| "epoch": 0.5111821086261981, |
| "grad_norm": 0.8009534478187561, |
| "learning_rate": 1.8e-05, |
| "loss": 1.657, |
| "mean_token_accuracy": 0.655005007982254, |
| "num_tokens": 1340462.0, |
| "step": 10 |
| }, |
| { |
| "entropy": 1.3152535259723663, |
| "epoch": 0.5623003194888179, |
| "grad_norm": 0.7782304883003235, |
| "learning_rate": 2e-05, |
| "loss": 1.6391, |
| "mean_token_accuracy": 0.6572432741522789, |
| "num_tokens": 1474818.0, |
| "step": 11 |
| }, |
| { |
| "entropy": 1.31825902312994, |
| "epoch": 0.6134185303514377, |
| "grad_norm": 0.7459490299224854, |
| "learning_rate": 1.977777777777778e-05, |
| "loss": 1.6261, |
| "mean_token_accuracy": 0.6601505167782307, |
| "num_tokens": 1608731.0, |
| "step": 12 |
| }, |
| { |
| "entropy": 1.322536252439022, |
| "epoch": 0.6645367412140575, |
| "grad_norm": 0.7278594970703125, |
| "learning_rate": 1.9555555555555557e-05, |
| "loss": 1.6056, |
| "mean_token_accuracy": 0.6621886678040028, |
| "num_tokens": 1742229.0, |
| "step": 13 |
| }, |
| { |
| "entropy": 1.3205925300717354, |
| "epoch": 0.7156549520766773, |
| "grad_norm": 0.6732656359672546, |
| "learning_rate": 1.9333333333333333e-05, |
| "loss": 1.587, |
| "mean_token_accuracy": 0.6625417172908783, |
| "num_tokens": 1875890.0, |
| "step": 14 |
| }, |
| { |
| "entropy": 1.3151337951421738, |
| "epoch": 0.7667731629392971, |
| "grad_norm": 0.6385083794593811, |
| "learning_rate": 1.9111111111111113e-05, |
| "loss": 1.5551, |
| "mean_token_accuracy": 0.6706915572285652, |
| "num_tokens": 2009719.0, |
| "step": 15 |
| }, |
| { |
| "entropy": 1.3125323951244354, |
| "epoch": 0.8178913738019169, |
| "grad_norm": 0.6251479387283325, |
| "learning_rate": 1.888888888888889e-05, |
| "loss": 1.5312, |
| "mean_token_accuracy": 0.6725872829556465, |
| "num_tokens": 2143710.0, |
| "step": 16 |
| }, |
| { |
| "entropy": 1.3094838485121727, |
| "epoch": 0.8690095846645367, |
| "grad_norm": 0.6229560375213623, |
| "learning_rate": 1.866666666666667e-05, |
| "loss": 1.5063, |
| "mean_token_accuracy": 0.6759799160063267, |
| "num_tokens": 2277556.0, |
| "step": 17 |
| }, |
| { |
| "entropy": 1.3096359893679619, |
| "epoch": 0.9201277955271565, |
| "grad_norm": 0.6263618469238281, |
| "learning_rate": 1.8444444444444448e-05, |
| "loss": 1.4846, |
| "mean_token_accuracy": 0.6817599721252918, |
| "num_tokens": 2411448.0, |
| "step": 18 |
| }, |
| { |
| "entropy": 1.2960194796323776, |
| "epoch": 0.9712460063897763, |
| "grad_norm": 0.6250044703483582, |
| "learning_rate": 1.8222222222222224e-05, |
| "loss": 1.4525, |
| "mean_token_accuracy": 0.6899448521435261, |
| "num_tokens": 2545738.0, |
| "step": 19 |
| }, |
| { |
| "entropy": 1.2956058846579657, |
| "epoch": 1.0, |
| "grad_norm": 0.6314756870269775, |
| "learning_rate": 1.8e-05, |
| "loss": 1.4391, |
| "mean_token_accuracy": 0.6940541995896233, |
| "num_tokens": 2616926.0, |
| "step": 20 |
| }, |
| { |
| "entropy": 1.2914148643612862, |
| "epoch": 1.0511182108626198, |
| "grad_norm": 0.631912887096405, |
| "learning_rate": 1.7777777777777777e-05, |
| "loss": 1.4127, |
| "mean_token_accuracy": 0.6994642727077007, |
| "num_tokens": 2750943.0, |
| "step": 21 |
| }, |
| { |
| "entropy": 1.2763815149664879, |
| "epoch": 1.1022364217252396, |
| "grad_norm": 0.6287756562232971, |
| "learning_rate": 1.7555555555555556e-05, |
| "loss": 1.3806, |
| "mean_token_accuracy": 0.7029485926032066, |
| "num_tokens": 2885149.0, |
| "step": 22 |
| }, |
| { |
| "entropy": 1.2685835510492325, |
| "epoch": 1.1533546325878594, |
| "grad_norm": 0.6242936253547668, |
| "learning_rate": 1.7333333333333336e-05, |
| "loss": 1.3606, |
| "mean_token_accuracy": 0.7043894305825233, |
| "num_tokens": 3019433.0, |
| "step": 23 |
| }, |
| { |
| "entropy": 1.263872005045414, |
| "epoch": 1.2044728434504792, |
| "grad_norm": 0.6192987561225891, |
| "learning_rate": 1.7111111111111112e-05, |
| "loss": 1.337, |
| "mean_token_accuracy": 0.709864255040884, |
| "num_tokens": 3153309.0, |
| "step": 24 |
| }, |
| { |
| "entropy": 1.2505493015050888, |
| "epoch": 1.255591054313099, |
| "grad_norm": 0.6113152503967285, |
| "learning_rate": 1.688888888888889e-05, |
| "loss": 1.3113, |
| "mean_token_accuracy": 0.7119522020220757, |
| "num_tokens": 3287322.0, |
| "step": 25 |
| }, |
| { |
| "entropy": 1.242555320262909, |
| "epoch": 1.3067092651757188, |
| "grad_norm": 0.597195029258728, |
| "learning_rate": 1.6666666666666667e-05, |
| "loss": 1.2892, |
| "mean_token_accuracy": 0.7140648253262043, |
| "num_tokens": 3421258.0, |
| "step": 26 |
| }, |
| { |
| "entropy": 1.229688823223114, |
| "epoch": 1.3578274760383386, |
| "grad_norm": 0.586483359336853, |
| "learning_rate": 1.6444444444444444e-05, |
| "loss": 1.264, |
| "mean_token_accuracy": 0.7178861573338509, |
| "num_tokens": 3555933.0, |
| "step": 27 |
| }, |
| { |
| "entropy": 1.2219947651028633, |
| "epoch": 1.4089456869009584, |
| "grad_norm": 0.5772027373313904, |
| "learning_rate": 1.6222222222222223e-05, |
| "loss": 1.2416, |
| "mean_token_accuracy": 0.7191276662051678, |
| "num_tokens": 3689685.0, |
| "step": 28 |
| }, |
| { |
| "entropy": 1.2158958613872528, |
| "epoch": 1.4600638977635783, |
| "grad_norm": 0.5700508952140808, |
| "learning_rate": 1.6000000000000003e-05, |
| "loss": 1.2201, |
| "mean_token_accuracy": 0.7224024310708046, |
| "num_tokens": 3823781.0, |
| "step": 29 |
| }, |
| { |
| "entropy": 1.2152001112699509, |
| "epoch": 1.511182108626198, |
| "grad_norm": 0.5722755789756775, |
| "learning_rate": 1.577777777777778e-05, |
| "loss": 1.2091, |
| "mean_token_accuracy": 0.722760371863842, |
| "num_tokens": 3956942.0, |
| "step": 30 |
| }, |
| { |
| "entropy": 1.1953495219349861, |
| "epoch": 1.5623003194888179, |
| "grad_norm": 0.5692858695983887, |
| "learning_rate": 1.555555555555556e-05, |
| "loss": 1.1714, |
| "mean_token_accuracy": 0.7302578240633011, |
| "num_tokens": 4091313.0, |
| "step": 31 |
| }, |
| { |
| "entropy": 1.1933885142207146, |
| "epoch": 1.6134185303514377, |
| "grad_norm": 0.5751745700836182, |
| "learning_rate": 1.5333333333333334e-05, |
| "loss": 1.1615, |
| "mean_token_accuracy": 0.7318685166537762, |
| "num_tokens": 4225481.0, |
| "step": 32 |
| }, |
| { |
| "entropy": 1.1871799379587173, |
| "epoch": 1.6645367412140575, |
| "grad_norm": 0.5843569040298462, |
| "learning_rate": 1.5111111111111112e-05, |
| "loss": 1.1378, |
| "mean_token_accuracy": 0.7380619496107101, |
| "num_tokens": 4359158.0, |
| "step": 33 |
| }, |
| { |
| "entropy": 1.1805067732930183, |
| "epoch": 1.7156549520766773, |
| "grad_norm": 0.5917448997497559, |
| "learning_rate": 1.488888888888889e-05, |
| "loss": 1.1218, |
| "mean_token_accuracy": 0.7423498816788197, |
| "num_tokens": 4493077.0, |
| "step": 34 |
| }, |
| { |
| "entropy": 1.170977495610714, |
| "epoch": 1.766773162939297, |
| "grad_norm": 0.59839928150177, |
| "learning_rate": 1.4666666666666666e-05, |
| "loss": 1.096, |
| "mean_token_accuracy": 0.7493030689656734, |
| "num_tokens": 4627251.0, |
| "step": 35 |
| }, |
| { |
| "entropy": 1.1678270995616913, |
| "epoch": 1.817891373801917, |
| "grad_norm": 0.6055343151092529, |
| "learning_rate": 1.4444444444444446e-05, |
| "loss": 1.0802, |
| "mean_token_accuracy": 0.7507753595709801, |
| "num_tokens": 4761020.0, |
| "step": 36 |
| }, |
| { |
| "entropy": 1.158110834658146, |
| "epoch": 1.8690095846645367, |
| "grad_norm": 0.6079038381576538, |
| "learning_rate": 1.4222222222222224e-05, |
| "loss": 1.0628, |
| "mean_token_accuracy": 0.755638737231493, |
| "num_tokens": 4895183.0, |
| "step": 37 |
| }, |
| { |
| "entropy": 1.154124453663826, |
| "epoch": 1.9201277955271565, |
| "grad_norm": 0.6134995222091675, |
| "learning_rate": 1.4e-05, |
| "loss": 1.043, |
| "mean_token_accuracy": 0.7614487372338772, |
| "num_tokens": 5028509.0, |
| "step": 38 |
| }, |
| { |
| "entropy": 1.1415963619947433, |
| "epoch": 1.9712460063897763, |
| "grad_norm": 0.608974277973175, |
| "learning_rate": 1.377777777777778e-05, |
| "loss": 1.0203, |
| "mean_token_accuracy": 0.7697952277958393, |
| "num_tokens": 5162590.0, |
| "step": 39 |
| }, |
| { |
| "entropy": 1.1324340105056763, |
| "epoch": 2.0, |
| "grad_norm": 0.6109018325805664, |
| "learning_rate": 1.3555555555555557e-05, |
| "loss": 0.9992, |
| "mean_token_accuracy": 0.7730923626157973, |
| "num_tokens": 5233852.0, |
| "step": 40 |
| }, |
| { |
| "entropy": 1.1270885691046715, |
| "epoch": 2.0511182108626196, |
| "grad_norm": 0.6178512573242188, |
| "learning_rate": 1.3333333333333333e-05, |
| "loss": 0.982, |
| "mean_token_accuracy": 0.7760093286633492, |
| "num_tokens": 5367686.0, |
| "step": 41 |
| }, |
| { |
| "entropy": 1.1163304820656776, |
| "epoch": 2.1022364217252396, |
| "grad_norm": 0.6236265301704407, |
| "learning_rate": 1.3111111111111113e-05, |
| "loss": 0.9606, |
| "mean_token_accuracy": 0.7832612432539463, |
| "num_tokens": 5501101.0, |
| "step": 42 |
| }, |
| { |
| "entropy": 1.1050259843468666, |
| "epoch": 2.1533546325878596, |
| "grad_norm": 0.6330907940864563, |
| "learning_rate": 1.288888888888889e-05, |
| "loss": 0.9442, |
| "mean_token_accuracy": 0.7889900915324688, |
| "num_tokens": 5634943.0, |
| "step": 43 |
| }, |
| { |
| "entropy": 1.0991561263799667, |
| "epoch": 2.2044728434504792, |
| "grad_norm": 0.6459551453590393, |
| "learning_rate": 1.2666666666666667e-05, |
| "loss": 0.9323, |
| "mean_token_accuracy": 0.7888863421976566, |
| "num_tokens": 5768035.0, |
| "step": 44 |
| }, |
| { |
| "entropy": 1.080874651670456, |
| "epoch": 2.255591054313099, |
| "grad_norm": 0.6497413516044617, |
| "learning_rate": 1.2444444444444446e-05, |
| "loss": 0.9089, |
| "mean_token_accuracy": 0.7905767410993576, |
| "num_tokens": 5901673.0, |
| "step": 45 |
| }, |
| { |
| "entropy": 1.0577788427472115, |
| "epoch": 2.306709265175719, |
| "grad_norm": 0.6514442563056946, |
| "learning_rate": 1.2222222222222224e-05, |
| "loss": 0.8845, |
| "mean_token_accuracy": 0.7969931028783321, |
| "num_tokens": 6036094.0, |
| "step": 46 |
| }, |
| { |
| "entropy": 1.0467759743332863, |
| "epoch": 2.357827476038339, |
| "grad_norm": 0.6560313105583191, |
| "learning_rate": 1.2e-05, |
| "loss": 0.8659, |
| "mean_token_accuracy": 0.802506472915411, |
| "num_tokens": 6170134.0, |
| "step": 47 |
| }, |
| { |
| "entropy": 1.0260686576366425, |
| "epoch": 2.4089456869009584, |
| "grad_norm": 0.6551167368888855, |
| "learning_rate": 1.177777777777778e-05, |
| "loss": 0.8486, |
| "mean_token_accuracy": 0.8058239929378033, |
| "num_tokens": 6304397.0, |
| "step": 48 |
| }, |
| { |
| "entropy": 1.0083096772432327, |
| "epoch": 2.460063897763578, |
| "grad_norm": 0.6541892290115356, |
| "learning_rate": 1.1555555555555556e-05, |
| "loss": 0.8356, |
| "mean_token_accuracy": 0.8059861660003662, |
| "num_tokens": 6438340.0, |
| "step": 49 |
| }, |
| { |
| "entropy": 0.9794742912054062, |
| "epoch": 2.511182108626198, |
| "grad_norm": 0.6508305668830872, |
| "learning_rate": 1.1333333333333334e-05, |
| "loss": 0.8151, |
| "mean_token_accuracy": 0.8144995309412479, |
| "num_tokens": 6573003.0, |
| "step": 50 |
| }, |
| { |
| "entropy": 0.9591087996959686, |
| "epoch": 2.562300319488818, |
| "grad_norm": 0.6544970273971558, |
| "learning_rate": 1.1111111111111113e-05, |
| "loss": 0.7982, |
| "mean_token_accuracy": 0.821755301207304, |
| "num_tokens": 6707094.0, |
| "step": 51 |
| }, |
| { |
| "entropy": 0.9311400800943375, |
| "epoch": 2.6134185303514377, |
| "grad_norm": 0.661201536655426, |
| "learning_rate": 1.088888888888889e-05, |
| "loss": 0.7748, |
| "mean_token_accuracy": 0.8267807699739933, |
| "num_tokens": 6841315.0, |
| "step": 52 |
| }, |
| { |
| "entropy": 0.9006332121789455, |
| "epoch": 2.6645367412140573, |
| "grad_norm": 0.6626005172729492, |
| "learning_rate": 1.0666666666666667e-05, |
| "loss": 0.7543, |
| "mean_token_accuracy": 0.8362897895276546, |
| "num_tokens": 6976249.0, |
| "step": 53 |
| }, |
| { |
| "entropy": 0.8812699876725674, |
| "epoch": 2.7156549520766773, |
| "grad_norm": 0.674384355545044, |
| "learning_rate": 1.0444444444444445e-05, |
| "loss": 0.7417, |
| "mean_token_accuracy": 0.8424257524311543, |
| "num_tokens": 7110449.0, |
| "step": 54 |
| }, |
| { |
| "entropy": 0.8507667072117329, |
| "epoch": 2.7667731629392973, |
| "grad_norm": 0.695296049118042, |
| "learning_rate": 1.0222222222222223e-05, |
| "loss": 0.7201, |
| "mean_token_accuracy": 0.8484714813530445, |
| "num_tokens": 7244307.0, |
| "step": 55 |
| }, |
| { |
| "entropy": 0.8221894763410091, |
| "epoch": 2.817891373801917, |
| "grad_norm": 0.7484252452850342, |
| "learning_rate": 1e-05, |
| "loss": 0.7067, |
| "mean_token_accuracy": 0.8503611832857132, |
| "num_tokens": 7378331.0, |
| "step": 56 |
| }, |
| { |
| "entropy": 0.7954868413507938, |
| "epoch": 2.8690095846645365, |
| "grad_norm": 0.8117406368255615, |
| "learning_rate": 9.777777777777779e-06, |
| "loss": 0.6898, |
| "mean_token_accuracy": 0.8546305038034916, |
| "num_tokens": 7511657.0, |
| "step": 57 |
| }, |
| { |
| "entropy": 0.7787146084010601, |
| "epoch": 2.9201277955271565, |
| "grad_norm": 0.7788737416267395, |
| "learning_rate": 9.555555555555556e-06, |
| "loss": 0.6761, |
| "mean_token_accuracy": 0.8574383407831192, |
| "num_tokens": 7645680.0, |
| "step": 58 |
| }, |
| { |
| "entropy": 0.7654511369764805, |
| "epoch": 2.9712460063897765, |
| "grad_norm": 0.6763613820075989, |
| "learning_rate": 9.333333333333334e-06, |
| "loss": 0.6651, |
| "mean_token_accuracy": 0.8578773178160191, |
| "num_tokens": 7779761.0, |
| "step": 59 |
| }, |
| { |
| "entropy": 0.7472146418359544, |
| "epoch": 3.0, |
| "grad_norm": 0.6285676956176758, |
| "learning_rate": 9.111111111111112e-06, |
| "loss": 0.6435, |
| "mean_token_accuracy": 0.86120914750629, |
| "num_tokens": 7850778.0, |
| "step": 60 |
| }, |
| { |
| "entropy": 0.7355797588825226, |
| "epoch": 3.0511182108626196, |
| "grad_norm": 0.6584138870239258, |
| "learning_rate": 8.888888888888888e-06, |
| "loss": 0.6312, |
| "mean_token_accuracy": 0.8601678982377052, |
| "num_tokens": 7984664.0, |
| "step": 61 |
| }, |
| { |
| "entropy": 0.7231738641858101, |
| "epoch": 3.1022364217252396, |
| "grad_norm": 0.6893587112426758, |
| "learning_rate": 8.666666666666668e-06, |
| "loss": 0.621, |
| "mean_token_accuracy": 0.8597363233566284, |
| "num_tokens": 8118372.0, |
| "step": 62 |
| }, |
| { |
| "entropy": 0.6995701305568218, |
| "epoch": 3.1533546325878596, |
| "grad_norm": 0.6631926894187927, |
| "learning_rate": 8.444444444444446e-06, |
| "loss": 0.5998, |
| "mean_token_accuracy": 0.8639725260436535, |
| "num_tokens": 8252383.0, |
| "step": 63 |
| }, |
| { |
| "entropy": 0.6806200109422207, |
| "epoch": 3.2044728434504792, |
| "grad_norm": 0.6041826009750366, |
| "learning_rate": 8.222222222222222e-06, |
| "loss": 0.5892, |
| "mean_token_accuracy": 0.8684131018817425, |
| "num_tokens": 8387058.0, |
| "step": 64 |
| }, |
| { |
| "entropy": 0.6625471338629723, |
| "epoch": 3.255591054313099, |
| "grad_norm": 0.6061173677444458, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.578, |
| "mean_token_accuracy": 0.8719681017100811, |
| "num_tokens": 8521167.0, |
| "step": 65 |
| }, |
| { |
| "entropy": 0.6458840593695641, |
| "epoch": 3.306709265175719, |
| "grad_norm": 0.6620640158653259, |
| "learning_rate": 7.77777777777778e-06, |
| "loss": 0.5631, |
| "mean_token_accuracy": 0.8758602887392044, |
| "num_tokens": 8654886.0, |
| "step": 66 |
| }, |
| { |
| "entropy": 0.63466951623559, |
| "epoch": 3.357827476038339, |
| "grad_norm": 0.6536484956741333, |
| "learning_rate": 7.555555555555556e-06, |
| "loss": 0.5574, |
| "mean_token_accuracy": 0.8795712888240814, |
| "num_tokens": 8788388.0, |
| "step": 67 |
| }, |
| { |
| "entropy": 0.6206231378018856, |
| "epoch": 3.4089456869009584, |
| "grad_norm": 0.5983281135559082, |
| "learning_rate": 7.333333333333333e-06, |
| "loss": 0.5412, |
| "mean_token_accuracy": 0.8814935386180878, |
| "num_tokens": 8921968.0, |
| "step": 68 |
| }, |
| { |
| "entropy": 0.6042437292635441, |
| "epoch": 3.460063897763578, |
| "grad_norm": 0.568672776222229, |
| "learning_rate": 7.111111111111112e-06, |
| "loss": 0.5308, |
| "mean_token_accuracy": 0.8831478171050549, |
| "num_tokens": 9056066.0, |
| "step": 69 |
| }, |
| { |
| "entropy": 0.601006530225277, |
| "epoch": 3.511182108626198, |
| "grad_norm": 0.5580371618270874, |
| "learning_rate": 6.88888888888889e-06, |
| "loss": 0.5254, |
| "mean_token_accuracy": 0.8843187876045704, |
| "num_tokens": 9189930.0, |
| "step": 70 |
| }, |
| { |
| "entropy": 0.5836833901703358, |
| "epoch": 3.562300319488818, |
| "grad_norm": 0.541730523109436, |
| "learning_rate": 6.666666666666667e-06, |
| "loss": 0.5103, |
| "mean_token_accuracy": 0.8875499293208122, |
| "num_tokens": 9323696.0, |
| "step": 71 |
| }, |
| { |
| "entropy": 0.5648492202162743, |
| "epoch": 3.6134185303514377, |
| "grad_norm": 0.5018172860145569, |
| "learning_rate": 6.444444444444445e-06, |
| "loss": 0.4952, |
| "mean_token_accuracy": 0.8905236721038818, |
| "num_tokens": 9457840.0, |
| "step": 72 |
| }, |
| { |
| "entropy": 0.5578960217535496, |
| "epoch": 3.6645367412140573, |
| "grad_norm": 0.4896445572376251, |
| "learning_rate": 6.222222222222223e-06, |
| "loss": 0.4918, |
| "mean_token_accuracy": 0.8900170363485813, |
| "num_tokens": 9591569.0, |
| "step": 73 |
| }, |
| { |
| "entropy": 0.5444952994585037, |
| "epoch": 3.7156549520766773, |
| "grad_norm": 0.4938449263572693, |
| "learning_rate": 6e-06, |
| "loss": 0.4859, |
| "mean_token_accuracy": 0.8906422667205334, |
| "num_tokens": 9725752.0, |
| "step": 74 |
| }, |
| { |
| "entropy": 0.5357677936553955, |
| "epoch": 3.7667731629392973, |
| "grad_norm": 0.4953802227973938, |
| "learning_rate": 5.777777777777778e-06, |
| "loss": 0.4771, |
| "mean_token_accuracy": 0.8924155794084072, |
| "num_tokens": 9859936.0, |
| "step": 75 |
| }, |
| { |
| "entropy": 0.5303498916327953, |
| "epoch": 3.817891373801917, |
| "grad_norm": 0.47086596488952637, |
| "learning_rate": 5.555555555555557e-06, |
| "loss": 0.4745, |
| "mean_token_accuracy": 0.8935006484389305, |
| "num_tokens": 9993839.0, |
| "step": 76 |
| }, |
| { |
| "entropy": 0.5241257101297379, |
| "epoch": 3.8690095846645365, |
| "grad_norm": 0.46224120259284973, |
| "learning_rate": 5.333333333333334e-06, |
| "loss": 0.4695, |
| "mean_token_accuracy": 0.8958121947944164, |
| "num_tokens": 10127352.0, |
| "step": 77 |
| }, |
| { |
| "entropy": 0.5198668912053108, |
| "epoch": 3.9201277955271565, |
| "grad_norm": 0.4502220153808594, |
| "learning_rate": 5.1111111111111115e-06, |
| "loss": 0.458, |
| "mean_token_accuracy": 0.8979315273463726, |
| "num_tokens": 10261669.0, |
| "step": 78 |
| }, |
| { |
| "entropy": 0.5175420753657818, |
| "epoch": 3.9712460063897765, |
| "grad_norm": 0.460803359746933, |
| "learning_rate": 4.888888888888889e-06, |
| "loss": 0.4607, |
| "mean_token_accuracy": 0.8980869241058826, |
| "num_tokens": 10396077.0, |
| "step": 79 |
| }, |
| { |
| "entropy": 0.5124501652187772, |
| "epoch": 4.0, |
| "grad_norm": 0.4524611532688141, |
| "learning_rate": 4.666666666666667e-06, |
| "loss": 0.4524, |
| "mean_token_accuracy": 0.8987092839346992, |
| "num_tokens": 10467704.0, |
| "step": 80 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 100, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 5, |
| "save_steps": 10, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 9.131490936255283e+16, |
| "train_batch_size": 16, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|