| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 1.0, | |
| "eval_steps": 500, | |
| "global_step": 148, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "entropy": 0.8085902333259583, | |
| "epoch": 0.006756756756756757, | |
| "grad_norm": 0.14047187566757202, | |
| "learning_rate": 0.0, | |
| "loss": 1.0639, | |
| "mean_token_accuracy": 0.698029100894928, | |
| "num_tokens": 520290.0, | |
| "step": 1 | |
| }, | |
| { | |
| "entropy": 0.839566707611084, | |
| "epoch": 0.013513513513513514, | |
| "grad_norm": 0.1440640240907669, | |
| "learning_rate": 4e-05, | |
| "loss": 1.0952, | |
| "mean_token_accuracy": 0.687626302242279, | |
| "num_tokens": 1042076.0, | |
| "step": 2 | |
| }, | |
| { | |
| "entropy": 0.8076200485229492, | |
| "epoch": 0.02027027027027027, | |
| "grad_norm": 0.13277006149291992, | |
| "learning_rate": 8e-05, | |
| "loss": 1.0518, | |
| "mean_token_accuracy": 0.703219473361969, | |
| "num_tokens": 1561230.0, | |
| "step": 3 | |
| }, | |
| { | |
| "entropy": 0.8777214884757996, | |
| "epoch": 0.02702702702702703, | |
| "grad_norm": 0.11633922904729843, | |
| "learning_rate": 0.00012, | |
| "loss": 1.0688, | |
| "mean_token_accuracy": 0.6899521946907043, | |
| "num_tokens": 2082559.0, | |
| "step": 4 | |
| }, | |
| { | |
| "entropy": 0.9525601863861084, | |
| "epoch": 0.033783783783783786, | |
| "grad_norm": 0.11181541532278061, | |
| "learning_rate": 0.00016, | |
| "loss": 1.0118, | |
| "mean_token_accuracy": 0.6984938383102417, | |
| "num_tokens": 2604188.0, | |
| "step": 5 | |
| }, | |
| { | |
| "entropy": 1.030306100845337, | |
| "epoch": 0.04054054054054054, | |
| "grad_norm": 0.1037818044424057, | |
| "learning_rate": 0.0002, | |
| "loss": 0.9858, | |
| "mean_token_accuracy": 0.7052151560783386, | |
| "num_tokens": 3125438.0, | |
| "step": 6 | |
| }, | |
| { | |
| "entropy": 1.058103084564209, | |
| "epoch": 0.0472972972972973, | |
| "grad_norm": 0.08032543957233429, | |
| "learning_rate": 0.00019860139860139862, | |
| "loss": 0.9888, | |
| "mean_token_accuracy": 0.700404167175293, | |
| "num_tokens": 3647168.0, | |
| "step": 7 | |
| }, | |
| { | |
| "entropy": 1.0215051174163818, | |
| "epoch": 0.05405405405405406, | |
| "grad_norm": 0.11070238053798676, | |
| "learning_rate": 0.0001972027972027972, | |
| "loss": 0.9369, | |
| "mean_token_accuracy": 0.7151404619216919, | |
| "num_tokens": 4167149.0, | |
| "step": 8 | |
| }, | |
| { | |
| "entropy": 1.0240428447723389, | |
| "epoch": 0.060810810810810814, | |
| "grad_norm": 0.08369743078947067, | |
| "learning_rate": 0.00019580419580419583, | |
| "loss": 0.9682, | |
| "mean_token_accuracy": 0.7065526843070984, | |
| "num_tokens": 4685442.0, | |
| "step": 9 | |
| }, | |
| { | |
| "entropy": 0.9852886199951172, | |
| "epoch": 0.06756756756756757, | |
| "grad_norm": 0.045641954988241196, | |
| "learning_rate": 0.0001944055944055944, | |
| "loss": 0.9657, | |
| "mean_token_accuracy": 0.7040433287620544, | |
| "num_tokens": 5207318.0, | |
| "step": 10 | |
| }, | |
| { | |
| "entropy": 0.9090337157249451, | |
| "epoch": 0.07432432432432433, | |
| "grad_norm": 0.036005664616823196, | |
| "learning_rate": 0.000193006993006993, | |
| "loss": 0.9261, | |
| "mean_token_accuracy": 0.713391900062561, | |
| "num_tokens": 5728363.0, | |
| "step": 11 | |
| }, | |
| { | |
| "entropy": 0.8860379457473755, | |
| "epoch": 0.08108108108108109, | |
| "grad_norm": 0.0332878939807415, | |
| "learning_rate": 0.00019160839160839161, | |
| "loss": 0.9255, | |
| "mean_token_accuracy": 0.7144545912742615, | |
| "num_tokens": 6249685.0, | |
| "step": 12 | |
| }, | |
| { | |
| "entropy": 0.8721570372581482, | |
| "epoch": 0.08783783783783784, | |
| "grad_norm": 0.034468844532966614, | |
| "learning_rate": 0.00019020979020979023, | |
| "loss": 0.9262, | |
| "mean_token_accuracy": 0.7132186889648438, | |
| "num_tokens": 6771073.0, | |
| "step": 13 | |
| }, | |
| { | |
| "entropy": 0.8796703815460205, | |
| "epoch": 0.0945945945945946, | |
| "grad_norm": 0.03204338252544403, | |
| "learning_rate": 0.00018881118881118882, | |
| "loss": 0.9312, | |
| "mean_token_accuracy": 0.7109803557395935, | |
| "num_tokens": 7292911.0, | |
| "step": 14 | |
| }, | |
| { | |
| "entropy": 0.898871660232544, | |
| "epoch": 0.10135135135135136, | |
| "grad_norm": 0.02886817790567875, | |
| "learning_rate": 0.00018741258741258743, | |
| "loss": 0.9331, | |
| "mean_token_accuracy": 0.7116541266441345, | |
| "num_tokens": 7808250.0, | |
| "step": 15 | |
| }, | |
| { | |
| "entropy": 0.8994000554084778, | |
| "epoch": 0.10810810810810811, | |
| "grad_norm": 0.021798841655254364, | |
| "learning_rate": 0.00018601398601398602, | |
| "loss": 0.9064, | |
| "mean_token_accuracy": 0.717405378818512, | |
| "num_tokens": 8327930.0, | |
| "step": 16 | |
| }, | |
| { | |
| "entropy": 0.9358035922050476, | |
| "epoch": 0.11486486486486487, | |
| "grad_norm": 0.020242631435394287, | |
| "learning_rate": 0.00018461538461538463, | |
| "loss": 0.9271, | |
| "mean_token_accuracy": 0.7115734815597534, | |
| "num_tokens": 8843022.0, | |
| "step": 17 | |
| }, | |
| { | |
| "entropy": 0.9195488095283508, | |
| "epoch": 0.12162162162162163, | |
| "grad_norm": 0.02282623015344143, | |
| "learning_rate": 0.00018321678321678322, | |
| "loss": 0.8845, | |
| "mean_token_accuracy": 0.7234280109405518, | |
| "num_tokens": 9362231.0, | |
| "step": 18 | |
| }, | |
| { | |
| "entropy": 0.9643428325653076, | |
| "epoch": 0.12837837837837837, | |
| "grad_norm": 0.02410944364964962, | |
| "learning_rate": 0.00018181818181818183, | |
| "loss": 0.924, | |
| "mean_token_accuracy": 0.7115726470947266, | |
| "num_tokens": 9883300.0, | |
| "step": 19 | |
| }, | |
| { | |
| "entropy": 0.9452087879180908, | |
| "epoch": 0.13513513513513514, | |
| "grad_norm": 0.024604445323348045, | |
| "learning_rate": 0.00018041958041958042, | |
| "loss": 0.9038, | |
| "mean_token_accuracy": 0.7173081636428833, | |
| "num_tokens": 10403398.0, | |
| "step": 20 | |
| }, | |
| { | |
| "entropy": 0.9439239501953125, | |
| "epoch": 0.14189189189189189, | |
| "grad_norm": 0.022483721375465393, | |
| "learning_rate": 0.00017902097902097904, | |
| "loss": 0.9088, | |
| "mean_token_accuracy": 0.7151318192481995, | |
| "num_tokens": 10924591.0, | |
| "step": 21 | |
| }, | |
| { | |
| "entropy": 0.9263131022453308, | |
| "epoch": 0.14864864864864866, | |
| "grad_norm": 0.020426636561751366, | |
| "learning_rate": 0.00017762237762237762, | |
| "loss": 0.9073, | |
| "mean_token_accuracy": 0.7165982127189636, | |
| "num_tokens": 11445479.0, | |
| "step": 22 | |
| }, | |
| { | |
| "entropy": 0.8799407482147217, | |
| "epoch": 0.1554054054054054, | |
| "grad_norm": 0.015485940501093864, | |
| "learning_rate": 0.00017622377622377624, | |
| "loss": 0.8733, | |
| "mean_token_accuracy": 0.7245533466339111, | |
| "num_tokens": 11967017.0, | |
| "step": 23 | |
| }, | |
| { | |
| "entropy": 0.8788759708404541, | |
| "epoch": 0.16216216216216217, | |
| "grad_norm": 0.017046676948666573, | |
| "learning_rate": 0.00017482517482517485, | |
| "loss": 0.8949, | |
| "mean_token_accuracy": 0.7203764319419861, | |
| "num_tokens": 12488697.0, | |
| "step": 24 | |
| }, | |
| { | |
| "entropy": 0.8779318332672119, | |
| "epoch": 0.16891891891891891, | |
| "grad_norm": 0.018766306340694427, | |
| "learning_rate": 0.0001734265734265734, | |
| "loss": 0.8983, | |
| "mean_token_accuracy": 0.7172704935073853, | |
| "num_tokens": 13009235.0, | |
| "step": 25 | |
| }, | |
| { | |
| "entropy": 0.8834930658340454, | |
| "epoch": 0.17567567567567569, | |
| "grad_norm": 0.020971953868865967, | |
| "learning_rate": 0.00017202797202797203, | |
| "loss": 0.9148, | |
| "mean_token_accuracy": 0.714013397693634, | |
| "num_tokens": 13530899.0, | |
| "step": 26 | |
| }, | |
| { | |
| "entropy": 0.8656618595123291, | |
| "epoch": 0.18243243243243243, | |
| "grad_norm": 0.018672997131943703, | |
| "learning_rate": 0.00017062937062937064, | |
| "loss": 0.8888, | |
| "mean_token_accuracy": 0.7208808064460754, | |
| "num_tokens": 14052671.0, | |
| "step": 27 | |
| }, | |
| { | |
| "entropy": 0.9078769087791443, | |
| "epoch": 0.1891891891891892, | |
| "grad_norm": 0.017195429652929306, | |
| "learning_rate": 0.00016923076923076923, | |
| "loss": 0.9242, | |
| "mean_token_accuracy": 0.7125155329704285, | |
| "num_tokens": 14574400.0, | |
| "step": 28 | |
| }, | |
| { | |
| "entropy": 0.9030580520629883, | |
| "epoch": 0.19594594594594594, | |
| "grad_norm": 0.01640770211815834, | |
| "learning_rate": 0.00016783216783216784, | |
| "loss": 0.9033, | |
| "mean_token_accuracy": 0.7164167761802673, | |
| "num_tokens": 15094634.0, | |
| "step": 29 | |
| }, | |
| { | |
| "entropy": 0.9503644108772278, | |
| "epoch": 0.20270270270270271, | |
| "grad_norm": 0.015921002253890038, | |
| "learning_rate": 0.00016643356643356646, | |
| "loss": 0.9454, | |
| "mean_token_accuracy": 0.7049751877784729, | |
| "num_tokens": 15616494.0, | |
| "step": 30 | |
| }, | |
| { | |
| "entropy": 0.9235352873802185, | |
| "epoch": 0.20945945945945946, | |
| "grad_norm": 0.0211192574352026, | |
| "learning_rate": 0.00016503496503496504, | |
| "loss": 0.9083, | |
| "mean_token_accuracy": 0.7148462533950806, | |
| "num_tokens": 16137586.0, | |
| "step": 31 | |
| }, | |
| { | |
| "entropy": 0.9228708744049072, | |
| "epoch": 0.21621621621621623, | |
| "grad_norm": 0.016181398183107376, | |
| "learning_rate": 0.00016363636363636366, | |
| "loss": 0.9076, | |
| "mean_token_accuracy": 0.715546190738678, | |
| "num_tokens": 16658862.0, | |
| "step": 32 | |
| }, | |
| { | |
| "entropy": 0.9239456653594971, | |
| "epoch": 0.22297297297297297, | |
| "grad_norm": 0.018280163407325745, | |
| "learning_rate": 0.00016223776223776225, | |
| "loss": 0.8973, | |
| "mean_token_accuracy": 0.7179359197616577, | |
| "num_tokens": 17181392.0, | |
| "step": 33 | |
| }, | |
| { | |
| "entropy": 0.9081429243087769, | |
| "epoch": 0.22972972972972974, | |
| "grad_norm": 0.01677747257053852, | |
| "learning_rate": 0.00016083916083916083, | |
| "loss": 0.89, | |
| "mean_token_accuracy": 0.7211325764656067, | |
| "num_tokens": 17701533.0, | |
| "step": 34 | |
| }, | |
| { | |
| "entropy": 0.8966501951217651, | |
| "epoch": 0.23648648648648649, | |
| "grad_norm": 0.015331600792706013, | |
| "learning_rate": 0.00015944055944055945, | |
| "loss": 0.8834, | |
| "mean_token_accuracy": 0.7213951349258423, | |
| "num_tokens": 18222829.0, | |
| "step": 35 | |
| }, | |
| { | |
| "entropy": 0.8946911096572876, | |
| "epoch": 0.24324324324324326, | |
| "grad_norm": 0.015129225328564644, | |
| "learning_rate": 0.00015804195804195806, | |
| "loss": 0.8895, | |
| "mean_token_accuracy": 0.7205337882041931, | |
| "num_tokens": 18743612.0, | |
| "step": 36 | |
| }, | |
| { | |
| "entropy": 0.924943208694458, | |
| "epoch": 0.25, | |
| "grad_norm": 0.016918186098337173, | |
| "learning_rate": 0.00015664335664335665, | |
| "loss": 0.934, | |
| "mean_token_accuracy": 0.7084062099456787, | |
| "num_tokens": 19265066.0, | |
| "step": 37 | |
| }, | |
| { | |
| "entropy": 0.8968441486358643, | |
| "epoch": 0.25675675675675674, | |
| "grad_norm": 0.01708938553929329, | |
| "learning_rate": 0.00015524475524475526, | |
| "loss": 0.9031, | |
| "mean_token_accuracy": 0.7160695791244507, | |
| "num_tokens": 19785775.0, | |
| "step": 38 | |
| }, | |
| { | |
| "entropy": 0.9021536111831665, | |
| "epoch": 0.2635135135135135, | |
| "grad_norm": 0.01632198505103588, | |
| "learning_rate": 0.00015384615384615385, | |
| "loss": 0.9106, | |
| "mean_token_accuracy": 0.7141885161399841, | |
| "num_tokens": 20306201.0, | |
| "step": 39 | |
| }, | |
| { | |
| "entropy": 0.9144896864891052, | |
| "epoch": 0.2702702702702703, | |
| "grad_norm": 0.015601382590830326, | |
| "learning_rate": 0.00015244755244755244, | |
| "loss": 0.9177, | |
| "mean_token_accuracy": 0.7121185660362244, | |
| "num_tokens": 20827655.0, | |
| "step": 40 | |
| }, | |
| { | |
| "entropy": 0.905925989151001, | |
| "epoch": 0.27702702702702703, | |
| "grad_norm": 0.015409526415169239, | |
| "learning_rate": 0.00015104895104895105, | |
| "loss": 0.9078, | |
| "mean_token_accuracy": 0.7149533033370972, | |
| "num_tokens": 21347179.0, | |
| "step": 41 | |
| }, | |
| { | |
| "entropy": 0.9454245567321777, | |
| "epoch": 0.28378378378378377, | |
| "grad_norm": 0.01604871265590191, | |
| "learning_rate": 0.00014965034965034964, | |
| "loss": 0.9382, | |
| "mean_token_accuracy": 0.7070503234863281, | |
| "num_tokens": 21867430.0, | |
| "step": 42 | |
| }, | |
| { | |
| "entropy": 0.8873435258865356, | |
| "epoch": 0.2905405405405405, | |
| "grad_norm": 0.01542913168668747, | |
| "learning_rate": 0.00014825174825174825, | |
| "loss": 0.8767, | |
| "mean_token_accuracy": 0.722999632358551, | |
| "num_tokens": 22386589.0, | |
| "step": 43 | |
| }, | |
| { | |
| "entropy": 0.8967331647872925, | |
| "epoch": 0.2972972972972973, | |
| "grad_norm": 0.016037292778491974, | |
| "learning_rate": 0.00014685314685314687, | |
| "loss": 0.8846, | |
| "mean_token_accuracy": 0.7215548157691956, | |
| "num_tokens": 22901934.0, | |
| "step": 44 | |
| }, | |
| { | |
| "entropy": 0.8978803753852844, | |
| "epoch": 0.30405405405405406, | |
| "grad_norm": 0.015742763876914978, | |
| "learning_rate": 0.00014545454545454546, | |
| "loss": 0.8913, | |
| "mean_token_accuracy": 0.7180163264274597, | |
| "num_tokens": 23422539.0, | |
| "step": 45 | |
| }, | |
| { | |
| "entropy": 0.8771539926528931, | |
| "epoch": 0.3108108108108108, | |
| "grad_norm": 0.015965279191732407, | |
| "learning_rate": 0.00014405594405594407, | |
| "loss": 0.8759, | |
| "mean_token_accuracy": 0.7238014936447144, | |
| "num_tokens": 23943911.0, | |
| "step": 46 | |
| }, | |
| { | |
| "entropy": 0.9169310331344604, | |
| "epoch": 0.31756756756756754, | |
| "grad_norm": 0.01562552899122238, | |
| "learning_rate": 0.00014265734265734269, | |
| "loss": 0.9091, | |
| "mean_token_accuracy": 0.7138416171073914, | |
| "num_tokens": 24465171.0, | |
| "step": 47 | |
| }, | |
| { | |
| "entropy": 0.9132385849952698, | |
| "epoch": 0.32432432432432434, | |
| "grad_norm": 0.015293029136955738, | |
| "learning_rate": 0.00014125874125874125, | |
| "loss": 0.9102, | |
| "mean_token_accuracy": 0.7141794562339783, | |
| "num_tokens": 24986963.0, | |
| "step": 48 | |
| }, | |
| { | |
| "entropy": 0.9064264893531799, | |
| "epoch": 0.3310810810810811, | |
| "grad_norm": 0.01573154330253601, | |
| "learning_rate": 0.00013986013986013986, | |
| "loss": 0.9104, | |
| "mean_token_accuracy": 0.7143970727920532, | |
| "num_tokens": 25507940.0, | |
| "step": 49 | |
| }, | |
| { | |
| "entropy": 0.9247837662696838, | |
| "epoch": 0.33783783783783783, | |
| "grad_norm": 0.01566915400326252, | |
| "learning_rate": 0.00013846153846153847, | |
| "loss": 0.9244, | |
| "mean_token_accuracy": 0.7100151181221008, | |
| "num_tokens": 26029633.0, | |
| "step": 50 | |
| }, | |
| { | |
| "entropy": 0.8918017148971558, | |
| "epoch": 0.34459459459459457, | |
| "grad_norm": 0.016166144981980324, | |
| "learning_rate": 0.00013706293706293706, | |
| "loss": 0.8896, | |
| "mean_token_accuracy": 0.7206509709358215, | |
| "num_tokens": 26551330.0, | |
| "step": 51 | |
| }, | |
| { | |
| "entropy": 0.8915703296661377, | |
| "epoch": 0.35135135135135137, | |
| "grad_norm": 0.016258137300610542, | |
| "learning_rate": 0.00013566433566433568, | |
| "loss": 0.8838, | |
| "mean_token_accuracy": 0.7210925817489624, | |
| "num_tokens": 27071628.0, | |
| "step": 52 | |
| }, | |
| { | |
| "entropy": 0.910973310470581, | |
| "epoch": 0.3581081081081081, | |
| "grad_norm": 0.015521145425736904, | |
| "learning_rate": 0.0001342657342657343, | |
| "loss": 0.9066, | |
| "mean_token_accuracy": 0.7141618132591248, | |
| "num_tokens": 27592998.0, | |
| "step": 53 | |
| }, | |
| { | |
| "entropy": 0.9083860516548157, | |
| "epoch": 0.36486486486486486, | |
| "grad_norm": 0.01575257070362568, | |
| "learning_rate": 0.00013286713286713288, | |
| "loss": 0.9014, | |
| "mean_token_accuracy": 0.7151919603347778, | |
| "num_tokens": 28115257.0, | |
| "step": 54 | |
| }, | |
| { | |
| "entropy": 0.9026190638542175, | |
| "epoch": 0.3716216216216216, | |
| "grad_norm": 0.01594236120581627, | |
| "learning_rate": 0.00013146853146853147, | |
| "loss": 0.8943, | |
| "mean_token_accuracy": 0.7175394296646118, | |
| "num_tokens": 28636147.0, | |
| "step": 55 | |
| }, | |
| { | |
| "entropy": 0.8969719409942627, | |
| "epoch": 0.3783783783783784, | |
| "grad_norm": 0.016330119222402573, | |
| "learning_rate": 0.00013006993006993008, | |
| "loss": 0.8973, | |
| "mean_token_accuracy": 0.7174832820892334, | |
| "num_tokens": 29158715.0, | |
| "step": 56 | |
| }, | |
| { | |
| "entropy": 0.9341723322868347, | |
| "epoch": 0.38513513513513514, | |
| "grad_norm": 0.01603098399937153, | |
| "learning_rate": 0.00012867132867132867, | |
| "loss": 0.9331, | |
| "mean_token_accuracy": 0.7069593667984009, | |
| "num_tokens": 29681317.0, | |
| "step": 57 | |
| }, | |
| { | |
| "entropy": 0.8998703360557556, | |
| "epoch": 0.3918918918918919, | |
| "grad_norm": 0.016095977276563644, | |
| "learning_rate": 0.00012727272727272728, | |
| "loss": 0.8929, | |
| "mean_token_accuracy": 0.718587338924408, | |
| "num_tokens": 30202981.0, | |
| "step": 58 | |
| }, | |
| { | |
| "entropy": 0.9057657718658447, | |
| "epoch": 0.39864864864864863, | |
| "grad_norm": 0.016768187284469604, | |
| "learning_rate": 0.00012587412587412587, | |
| "loss": 0.9004, | |
| "mean_token_accuracy": 0.7168737649917603, | |
| "num_tokens": 30717612.0, | |
| "step": 59 | |
| }, | |
| { | |
| "entropy": 0.898177981376648, | |
| "epoch": 0.40540540540540543, | |
| "grad_norm": 0.01631537266075611, | |
| "learning_rate": 0.00012447552447552448, | |
| "loss": 0.8949, | |
| "mean_token_accuracy": 0.7172833681106567, | |
| "num_tokens": 31239946.0, | |
| "step": 60 | |
| }, | |
| { | |
| "entropy": 0.8847284317016602, | |
| "epoch": 0.41216216216216217, | |
| "grad_norm": 0.01665564626455307, | |
| "learning_rate": 0.0001230769230769231, | |
| "loss": 0.88, | |
| "mean_token_accuracy": 0.7212061285972595, | |
| "num_tokens": 31761778.0, | |
| "step": 61 | |
| }, | |
| { | |
| "entropy": 0.8945165872573853, | |
| "epoch": 0.4189189189189189, | |
| "grad_norm": 0.01744748093187809, | |
| "learning_rate": 0.0001216783216783217, | |
| "loss": 0.893, | |
| "mean_token_accuracy": 0.7190291285514832, | |
| "num_tokens": 32284035.0, | |
| "step": 62 | |
| }, | |
| { | |
| "entropy": 0.9090993404388428, | |
| "epoch": 0.42567567567567566, | |
| "grad_norm": 0.017030267044901848, | |
| "learning_rate": 0.00012027972027972027, | |
| "loss": 0.9082, | |
| "mean_token_accuracy": 0.7141250967979431, | |
| "num_tokens": 32804596.0, | |
| "step": 63 | |
| }, | |
| { | |
| "entropy": 0.9153857231140137, | |
| "epoch": 0.43243243243243246, | |
| "grad_norm": 0.016640154644846916, | |
| "learning_rate": 0.00011888111888111889, | |
| "loss": 0.9149, | |
| "mean_token_accuracy": 0.7134872078895569, | |
| "num_tokens": 33325570.0, | |
| "step": 64 | |
| }, | |
| { | |
| "entropy": 0.8797224164009094, | |
| "epoch": 0.4391891891891892, | |
| "grad_norm": 0.01646554283797741, | |
| "learning_rate": 0.00011748251748251749, | |
| "loss": 0.8781, | |
| "mean_token_accuracy": 0.7216721177101135, | |
| "num_tokens": 33846540.0, | |
| "step": 65 | |
| }, | |
| { | |
| "entropy": 0.8971645832061768, | |
| "epoch": 0.44594594594594594, | |
| "grad_norm": 0.016337089240550995, | |
| "learning_rate": 0.00011608391608391609, | |
| "loss": 0.8935, | |
| "mean_token_accuracy": 0.7179019451141357, | |
| "num_tokens": 34366891.0, | |
| "step": 66 | |
| }, | |
| { | |
| "entropy": 0.904381513595581, | |
| "epoch": 0.4527027027027027, | |
| "grad_norm": 0.017709996551275253, | |
| "learning_rate": 0.00011468531468531469, | |
| "loss": 0.8984, | |
| "mean_token_accuracy": 0.7170865535736084, | |
| "num_tokens": 34889245.0, | |
| "step": 67 | |
| }, | |
| { | |
| "entropy": 0.9063211679458618, | |
| "epoch": 0.4594594594594595, | |
| "grad_norm": 0.017201313748955727, | |
| "learning_rate": 0.0001132867132867133, | |
| "loss": 0.9015, | |
| "mean_token_accuracy": 0.7147793173789978, | |
| "num_tokens": 35404556.0, | |
| "step": 68 | |
| }, | |
| { | |
| "entropy": 0.8922293186187744, | |
| "epoch": 0.46621621621621623, | |
| "grad_norm": 0.016904350370168686, | |
| "learning_rate": 0.0001118881118881119, | |
| "loss": 0.888, | |
| "mean_token_accuracy": 0.719697892665863, | |
| "num_tokens": 35925065.0, | |
| "step": 69 | |
| }, | |
| { | |
| "entropy": 0.9034209251403809, | |
| "epoch": 0.47297297297297297, | |
| "grad_norm": 0.017079392448067665, | |
| "learning_rate": 0.00011048951048951048, | |
| "loss": 0.896, | |
| "mean_token_accuracy": 0.7172802686691284, | |
| "num_tokens": 36446132.0, | |
| "step": 70 | |
| }, | |
| { | |
| "entropy": 0.887146532535553, | |
| "epoch": 0.4797297297297297, | |
| "grad_norm": 0.01738973893225193, | |
| "learning_rate": 0.00010909090909090909, | |
| "loss": 0.8828, | |
| "mean_token_accuracy": 0.7213963270187378, | |
| "num_tokens": 36967214.0, | |
| "step": 71 | |
| }, | |
| { | |
| "entropy": 0.8868647813796997, | |
| "epoch": 0.4864864864864865, | |
| "grad_norm": 0.017861152067780495, | |
| "learning_rate": 0.0001076923076923077, | |
| "loss": 0.8773, | |
| "mean_token_accuracy": 0.7218159437179565, | |
| "num_tokens": 37488377.0, | |
| "step": 72 | |
| }, | |
| { | |
| "entropy": 0.9134626388549805, | |
| "epoch": 0.49324324324324326, | |
| "grad_norm": 0.017511827871203423, | |
| "learning_rate": 0.0001062937062937063, | |
| "loss": 0.9122, | |
| "mean_token_accuracy": 0.7130904793739319, | |
| "num_tokens": 38008473.0, | |
| "step": 73 | |
| }, | |
| { | |
| "entropy": 0.8864673376083374, | |
| "epoch": 0.5, | |
| "grad_norm": 0.017983395606279373, | |
| "learning_rate": 0.0001048951048951049, | |
| "loss": 0.8881, | |
| "mean_token_accuracy": 0.7188193798065186, | |
| "num_tokens": 38529415.0, | |
| "step": 74 | |
| }, | |
| { | |
| "entropy": 0.9193586111068726, | |
| "epoch": 0.5067567567567568, | |
| "grad_norm": 0.018130991607904434, | |
| "learning_rate": 0.00010349650349650351, | |
| "loss": 0.9259, | |
| "mean_token_accuracy": 0.7099359631538391, | |
| "num_tokens": 39049712.0, | |
| "step": 75 | |
| }, | |
| { | |
| "entropy": 0.8895922303199768, | |
| "epoch": 0.5135135135135135, | |
| "grad_norm": 0.017333725467324257, | |
| "learning_rate": 0.00010209790209790211, | |
| "loss": 0.8891, | |
| "mean_token_accuracy": 0.7198699116706848, | |
| "num_tokens": 39566565.0, | |
| "step": 76 | |
| }, | |
| { | |
| "entropy": 0.8800663948059082, | |
| "epoch": 0.5202702702702703, | |
| "grad_norm": 0.018249373883008957, | |
| "learning_rate": 0.00010069930069930071, | |
| "loss": 0.8692, | |
| "mean_token_accuracy": 0.724394679069519, | |
| "num_tokens": 40086868.0, | |
| "step": 77 | |
| }, | |
| { | |
| "entropy": 0.8940162658691406, | |
| "epoch": 0.527027027027027, | |
| "grad_norm": 0.01744958944618702, | |
| "learning_rate": 9.930069930069931e-05, | |
| "loss": 0.8898, | |
| "mean_token_accuracy": 0.7197460532188416, | |
| "num_tokens": 40607334.0, | |
| "step": 78 | |
| }, | |
| { | |
| "entropy": 0.8987851142883301, | |
| "epoch": 0.5337837837837838, | |
| "grad_norm": 0.018518365919589996, | |
| "learning_rate": 9.790209790209791e-05, | |
| "loss": 0.8904, | |
| "mean_token_accuracy": 0.7207316160202026, | |
| "num_tokens": 41126753.0, | |
| "step": 79 | |
| }, | |
| { | |
| "entropy": 0.8804575204849243, | |
| "epoch": 0.5405405405405406, | |
| "grad_norm": 0.018223201856017113, | |
| "learning_rate": 9.65034965034965e-05, | |
| "loss": 0.8763, | |
| "mean_token_accuracy": 0.7228670120239258, | |
| "num_tokens": 41647670.0, | |
| "step": 80 | |
| }, | |
| { | |
| "entropy": 0.8889448046684265, | |
| "epoch": 0.5472972972972973, | |
| "grad_norm": 0.018730709329247475, | |
| "learning_rate": 9.510489510489511e-05, | |
| "loss": 0.883, | |
| "mean_token_accuracy": 0.7205345630645752, | |
| "num_tokens": 42166183.0, | |
| "step": 81 | |
| }, | |
| { | |
| "entropy": 0.8915292024612427, | |
| "epoch": 0.5540540540540541, | |
| "grad_norm": 0.018218854442238808, | |
| "learning_rate": 9.370629370629372e-05, | |
| "loss": 0.8835, | |
| "mean_token_accuracy": 0.7211962342262268, | |
| "num_tokens": 42685978.0, | |
| "step": 82 | |
| }, | |
| { | |
| "entropy": 0.871468186378479, | |
| "epoch": 0.5608108108108109, | |
| "grad_norm": 0.0187361016869545, | |
| "learning_rate": 9.230769230769232e-05, | |
| "loss": 0.8697, | |
| "mean_token_accuracy": 0.7244738340377808, | |
| "num_tokens": 43203743.0, | |
| "step": 83 | |
| }, | |
| { | |
| "entropy": 0.8702860474586487, | |
| "epoch": 0.5675675675675675, | |
| "grad_norm": 0.018368471413850784, | |
| "learning_rate": 9.090909090909092e-05, | |
| "loss": 0.8698, | |
| "mean_token_accuracy": 0.7259374260902405, | |
| "num_tokens": 43725363.0, | |
| "step": 84 | |
| }, | |
| { | |
| "entropy": 0.8703951239585876, | |
| "epoch": 0.5743243243243243, | |
| "grad_norm": 0.01838189922273159, | |
| "learning_rate": 8.951048951048952e-05, | |
| "loss": 0.8743, | |
| "mean_token_accuracy": 0.7234249711036682, | |
| "num_tokens": 44246559.0, | |
| "step": 85 | |
| }, | |
| { | |
| "entropy": 0.8820457458496094, | |
| "epoch": 0.581081081081081, | |
| "grad_norm": 0.019160225987434387, | |
| "learning_rate": 8.811188811188812e-05, | |
| "loss": 0.8849, | |
| "mean_token_accuracy": 0.7206857800483704, | |
| "num_tokens": 44769062.0, | |
| "step": 86 | |
| }, | |
| { | |
| "entropy": 0.9152972102165222, | |
| "epoch": 0.5878378378378378, | |
| "grad_norm": 0.019004985690116882, | |
| "learning_rate": 8.67132867132867e-05, | |
| "loss": 0.9153, | |
| "mean_token_accuracy": 0.7120697498321533, | |
| "num_tokens": 45286787.0, | |
| "step": 87 | |
| }, | |
| { | |
| "entropy": 0.904834508895874, | |
| "epoch": 0.5945945945945946, | |
| "grad_norm": 0.018431641161441803, | |
| "learning_rate": 8.531468531468532e-05, | |
| "loss": 0.9011, | |
| "mean_token_accuracy": 0.7159322500228882, | |
| "num_tokens": 45807531.0, | |
| "step": 88 | |
| }, | |
| { | |
| "entropy": 0.9021150469779968, | |
| "epoch": 0.6013513513513513, | |
| "grad_norm": 0.01898609660565853, | |
| "learning_rate": 8.391608391608392e-05, | |
| "loss": 0.8956, | |
| "mean_token_accuracy": 0.7188680768013, | |
| "num_tokens": 46321183.0, | |
| "step": 89 | |
| }, | |
| { | |
| "entropy": 0.9032172560691833, | |
| "epoch": 0.6081081081081081, | |
| "grad_norm": 0.02004328928887844, | |
| "learning_rate": 8.251748251748252e-05, | |
| "loss": 0.8928, | |
| "mean_token_accuracy": 0.7190265655517578, | |
| "num_tokens": 46841033.0, | |
| "step": 90 | |
| }, | |
| { | |
| "entropy": 0.9000004529953003, | |
| "epoch": 0.6148648648648649, | |
| "grad_norm": 0.019782939925789833, | |
| "learning_rate": 8.111888111888112e-05, | |
| "loss": 0.8849, | |
| "mean_token_accuracy": 0.7195360064506531, | |
| "num_tokens": 47363499.0, | |
| "step": 91 | |
| }, | |
| { | |
| "entropy": 0.8834792375564575, | |
| "epoch": 0.6216216216216216, | |
| "grad_norm": 0.0185946486890316, | |
| "learning_rate": 7.972027972027972e-05, | |
| "loss": 0.8762, | |
| "mean_token_accuracy": 0.723339319229126, | |
| "num_tokens": 47884207.0, | |
| "step": 92 | |
| }, | |
| { | |
| "entropy": 0.9149696826934814, | |
| "epoch": 0.6283783783783784, | |
| "grad_norm": 0.018683424219489098, | |
| "learning_rate": 7.832167832167832e-05, | |
| "loss": 0.9166, | |
| "mean_token_accuracy": 0.7118301391601562, | |
| "num_tokens": 48405854.0, | |
| "step": 93 | |
| }, | |
| { | |
| "entropy": 0.8827645182609558, | |
| "epoch": 0.6351351351351351, | |
| "grad_norm": 0.02002580091357231, | |
| "learning_rate": 7.692307692307693e-05, | |
| "loss": 0.8823, | |
| "mean_token_accuracy": 0.7205896377563477, | |
| "num_tokens": 48923152.0, | |
| "step": 94 | |
| }, | |
| { | |
| "entropy": 0.8913484811782837, | |
| "epoch": 0.6418918918918919, | |
| "grad_norm": 0.01915843039751053, | |
| "learning_rate": 7.552447552447553e-05, | |
| "loss": 0.8938, | |
| "mean_token_accuracy": 0.7178173065185547, | |
| "num_tokens": 49445126.0, | |
| "step": 95 | |
| }, | |
| { | |
| "entropy": 0.8866020441055298, | |
| "epoch": 0.6486486486486487, | |
| "grad_norm": 0.020832480862736702, | |
| "learning_rate": 7.412587412587413e-05, | |
| "loss": 0.8917, | |
| "mean_token_accuracy": 0.7187622785568237, | |
| "num_tokens": 49967007.0, | |
| "step": 96 | |
| }, | |
| { | |
| "entropy": 0.8766802549362183, | |
| "epoch": 0.6554054054054054, | |
| "grad_norm": 0.019703548401594162, | |
| "learning_rate": 7.272727272727273e-05, | |
| "loss": 0.8714, | |
| "mean_token_accuracy": 0.7235036492347717, | |
| "num_tokens": 50488513.0, | |
| "step": 97 | |
| }, | |
| { | |
| "entropy": 0.9040693044662476, | |
| "epoch": 0.6621621621621622, | |
| "grad_norm": 0.0192877184599638, | |
| "learning_rate": 7.132867132867134e-05, | |
| "loss": 0.9035, | |
| "mean_token_accuracy": 0.7158621549606323, | |
| "num_tokens": 51008094.0, | |
| "step": 98 | |
| }, | |
| { | |
| "entropy": 0.8829696178436279, | |
| "epoch": 0.668918918918919, | |
| "grad_norm": 0.01927708089351654, | |
| "learning_rate": 6.993006993006993e-05, | |
| "loss": 0.8797, | |
| "mean_token_accuracy": 0.7219703793525696, | |
| "num_tokens": 51529469.0, | |
| "step": 99 | |
| }, | |
| { | |
| "entropy": 0.894615650177002, | |
| "epoch": 0.6756756756756757, | |
| "grad_norm": 0.01965499296784401, | |
| "learning_rate": 6.853146853146853e-05, | |
| "loss": 0.8882, | |
| "mean_token_accuracy": 0.7190660238265991, | |
| "num_tokens": 52050576.0, | |
| "step": 100 | |
| }, | |
| { | |
| "entropy": 0.8754645586013794, | |
| "epoch": 0.6824324324324325, | |
| "grad_norm": 0.019854635000228882, | |
| "learning_rate": 6.713286713286715e-05, | |
| "loss": 0.869, | |
| "mean_token_accuracy": 0.72502201795578, | |
| "num_tokens": 52571347.0, | |
| "step": 101 | |
| }, | |
| { | |
| "entropy": 0.8882539868354797, | |
| "epoch": 0.6891891891891891, | |
| "grad_norm": 0.020126935094594955, | |
| "learning_rate": 6.573426573426573e-05, | |
| "loss": 0.8766, | |
| "mean_token_accuracy": 0.7229670882225037, | |
| "num_tokens": 53092933.0, | |
| "step": 102 | |
| }, | |
| { | |
| "entropy": 0.9050229787826538, | |
| "epoch": 0.6959459459459459, | |
| "grad_norm": 0.019794149324297905, | |
| "learning_rate": 6.433566433566433e-05, | |
| "loss": 0.8965, | |
| "mean_token_accuracy": 0.7169359922409058, | |
| "num_tokens": 53614658.0, | |
| "step": 103 | |
| }, | |
| { | |
| "entropy": 0.900147557258606, | |
| "epoch": 0.7027027027027027, | |
| "grad_norm": 0.01930818147957325, | |
| "learning_rate": 6.293706293706293e-05, | |
| "loss": 0.8975, | |
| "mean_token_accuracy": 0.716631293296814, | |
| "num_tokens": 54134578.0, | |
| "step": 104 | |
| }, | |
| { | |
| "entropy": 0.8568655252456665, | |
| "epoch": 0.7094594594594594, | |
| "grad_norm": 0.019813908264040947, | |
| "learning_rate": 6.153846153846155e-05, | |
| "loss": 0.8549, | |
| "mean_token_accuracy": 0.7281835079193115, | |
| "num_tokens": 54656493.0, | |
| "step": 105 | |
| }, | |
| { | |
| "entropy": 0.8811562061309814, | |
| "epoch": 0.7162162162162162, | |
| "grad_norm": 0.02051232010126114, | |
| "learning_rate": 6.0139860139860136e-05, | |
| "loss": 0.8798, | |
| "mean_token_accuracy": 0.7215043902397156, | |
| "num_tokens": 55177423.0, | |
| "step": 106 | |
| }, | |
| { | |
| "entropy": 0.8680734634399414, | |
| "epoch": 0.722972972972973, | |
| "grad_norm": 0.02060469426214695, | |
| "learning_rate": 5.8741258741258744e-05, | |
| "loss": 0.8739, | |
| "mean_token_accuracy": 0.7240718603134155, | |
| "num_tokens": 55698753.0, | |
| "step": 107 | |
| }, | |
| { | |
| "entropy": 0.8927019238471985, | |
| "epoch": 0.7297297297297297, | |
| "grad_norm": 0.020770812407135963, | |
| "learning_rate": 5.7342657342657345e-05, | |
| "loss": 0.8931, | |
| "mean_token_accuracy": 0.7185073494911194, | |
| "num_tokens": 56208445.0, | |
| "step": 108 | |
| }, | |
| { | |
| "entropy": 0.8723157644271851, | |
| "epoch": 0.7364864864864865, | |
| "grad_norm": 0.020027851685881615, | |
| "learning_rate": 5.594405594405595e-05, | |
| "loss": 0.8677, | |
| "mean_token_accuracy": 0.724934995174408, | |
| "num_tokens": 56727266.0, | |
| "step": 109 | |
| }, | |
| { | |
| "entropy": 0.885744571685791, | |
| "epoch": 0.7432432432432432, | |
| "grad_norm": 0.019678086042404175, | |
| "learning_rate": 5.4545454545454546e-05, | |
| "loss": 0.8811, | |
| "mean_token_accuracy": 0.7209365367889404, | |
| "num_tokens": 57248166.0, | |
| "step": 110 | |
| }, | |
| { | |
| "entropy": 0.8876606225967407, | |
| "epoch": 0.75, | |
| "grad_norm": 0.020123396068811417, | |
| "learning_rate": 5.314685314685315e-05, | |
| "loss": 0.8834, | |
| "mean_token_accuracy": 0.7213138937950134, | |
| "num_tokens": 57769996.0, | |
| "step": 111 | |
| }, | |
| { | |
| "entropy": 0.9018759727478027, | |
| "epoch": 0.7567567567567568, | |
| "grad_norm": 0.02047978714108467, | |
| "learning_rate": 5.1748251748251755e-05, | |
| "loss": 0.8945, | |
| "mean_token_accuracy": 0.7177107930183411, | |
| "num_tokens": 58291779.0, | |
| "step": 112 | |
| }, | |
| { | |
| "entropy": 0.9049036502838135, | |
| "epoch": 0.7635135135135135, | |
| "grad_norm": 0.020464390516281128, | |
| "learning_rate": 5.0349650349650356e-05, | |
| "loss": 0.8984, | |
| "mean_token_accuracy": 0.7173320651054382, | |
| "num_tokens": 58805386.0, | |
| "step": 113 | |
| }, | |
| { | |
| "entropy": 0.8888832330703735, | |
| "epoch": 0.7702702702702703, | |
| "grad_norm": 0.01985686831176281, | |
| "learning_rate": 4.8951048951048956e-05, | |
| "loss": 0.882, | |
| "mean_token_accuracy": 0.7208877205848694, | |
| "num_tokens": 59324642.0, | |
| "step": 114 | |
| }, | |
| { | |
| "entropy": 0.9020118117332458, | |
| "epoch": 0.777027027027027, | |
| "grad_norm": 0.020026598125696182, | |
| "learning_rate": 4.755244755244756e-05, | |
| "loss": 0.8973, | |
| "mean_token_accuracy": 0.7182012796401978, | |
| "num_tokens": 59847011.0, | |
| "step": 115 | |
| }, | |
| { | |
| "entropy": 0.8986602425575256, | |
| "epoch": 0.7837837837837838, | |
| "grad_norm": 0.01975986920297146, | |
| "learning_rate": 4.615384615384616e-05, | |
| "loss": 0.8968, | |
| "mean_token_accuracy": 0.7171120047569275, | |
| "num_tokens": 60369152.0, | |
| "step": 116 | |
| }, | |
| { | |
| "entropy": 0.8851807117462158, | |
| "epoch": 0.7905405405405406, | |
| "grad_norm": 0.01993614062666893, | |
| "learning_rate": 4.475524475524476e-05, | |
| "loss": 0.8804, | |
| "mean_token_accuracy": 0.7222026586532593, | |
| "num_tokens": 60889245.0, | |
| "step": 117 | |
| }, | |
| { | |
| "entropy": 0.882935643196106, | |
| "epoch": 0.7972972972972973, | |
| "grad_norm": 0.01959838718175888, | |
| "learning_rate": 4.335664335664335e-05, | |
| "loss": 0.8797, | |
| "mean_token_accuracy": 0.7208013534545898, | |
| "num_tokens": 61407350.0, | |
| "step": 118 | |
| }, | |
| { | |
| "entropy": 0.8777122497558594, | |
| "epoch": 0.8040540540540541, | |
| "grad_norm": 0.0199885256588459, | |
| "learning_rate": 4.195804195804196e-05, | |
| "loss": 0.8715, | |
| "mean_token_accuracy": 0.725073516368866, | |
| "num_tokens": 61928742.0, | |
| "step": 119 | |
| }, | |
| { | |
| "entropy": 0.850617527961731, | |
| "epoch": 0.8108108108108109, | |
| "grad_norm": 0.020385252311825752, | |
| "learning_rate": 4.055944055944056e-05, | |
| "loss": 0.8499, | |
| "mean_token_accuracy": 0.730745792388916, | |
| "num_tokens": 62446355.0, | |
| "step": 120 | |
| }, | |
| { | |
| "entropy": 0.8720276355743408, | |
| "epoch": 0.8175675675675675, | |
| "grad_norm": 0.02067047357559204, | |
| "learning_rate": 3.916083916083916e-05, | |
| "loss": 0.8746, | |
| "mean_token_accuracy": 0.7231326699256897, | |
| "num_tokens": 62967935.0, | |
| "step": 121 | |
| }, | |
| { | |
| "entropy": 0.9207990169525146, | |
| "epoch": 0.8243243243243243, | |
| "grad_norm": 0.02032148465514183, | |
| "learning_rate": 3.776223776223776e-05, | |
| "loss": 0.922, | |
| "mean_token_accuracy": 0.710451066493988, | |
| "num_tokens": 63489615.0, | |
| "step": 122 | |
| }, | |
| { | |
| "entropy": 0.9047123193740845, | |
| "epoch": 0.831081081081081, | |
| "grad_norm": 0.020533205941319466, | |
| "learning_rate": 3.6363636363636364e-05, | |
| "loss": 0.9015, | |
| "mean_token_accuracy": 0.7166460752487183, | |
| "num_tokens": 64010947.0, | |
| "step": 123 | |
| }, | |
| { | |
| "entropy": 0.8477683067321777, | |
| "epoch": 0.8378378378378378, | |
| "grad_norm": 0.019841615110635757, | |
| "learning_rate": 3.4965034965034965e-05, | |
| "loss": 0.8415, | |
| "mean_token_accuracy": 0.7314550876617432, | |
| "num_tokens": 64531314.0, | |
| "step": 124 | |
| }, | |
| { | |
| "entropy": 0.8877344131469727, | |
| "epoch": 0.8445945945945946, | |
| "grad_norm": 0.01969732716679573, | |
| "learning_rate": 3.356643356643357e-05, | |
| "loss": 0.8858, | |
| "mean_token_accuracy": 0.7207072973251343, | |
| "num_tokens": 65053203.0, | |
| "step": 125 | |
| }, | |
| { | |
| "entropy": 0.8977670669555664, | |
| "epoch": 0.8513513513513513, | |
| "grad_norm": 0.01998847909271717, | |
| "learning_rate": 3.216783216783217e-05, | |
| "loss": 0.8914, | |
| "mean_token_accuracy": 0.7174409627914429, | |
| "num_tokens": 65574215.0, | |
| "step": 126 | |
| }, | |
| { | |
| "entropy": 0.8922737836837769, | |
| "epoch": 0.8581081081081081, | |
| "grad_norm": 0.02041775733232498, | |
| "learning_rate": 3.0769230769230774e-05, | |
| "loss": 0.8918, | |
| "mean_token_accuracy": 0.7177229523658752, | |
| "num_tokens": 66095688.0, | |
| "step": 127 | |
| }, | |
| { | |
| "entropy": 0.8866901397705078, | |
| "epoch": 0.8648648648648649, | |
| "grad_norm": 0.02134627476334572, | |
| "learning_rate": 2.9370629370629372e-05, | |
| "loss": 0.877, | |
| "mean_token_accuracy": 0.7219728827476501, | |
| "num_tokens": 66608762.0, | |
| "step": 128 | |
| }, | |
| { | |
| "entropy": 0.8762195110321045, | |
| "epoch": 0.8716216216216216, | |
| "grad_norm": 0.020530981943011284, | |
| "learning_rate": 2.7972027972027976e-05, | |
| "loss": 0.8705, | |
| "mean_token_accuracy": 0.7246843576431274, | |
| "num_tokens": 67128373.0, | |
| "step": 129 | |
| }, | |
| { | |
| "entropy": 0.8929407596588135, | |
| "epoch": 0.8783783783783784, | |
| "grad_norm": 0.021080242469906807, | |
| "learning_rate": 2.6573426573426574e-05, | |
| "loss": 0.883, | |
| "mean_token_accuracy": 0.7201827168464661, | |
| "num_tokens": 67645584.0, | |
| "step": 130 | |
| }, | |
| { | |
| "entropy": 0.8947334289550781, | |
| "epoch": 0.8851351351351351, | |
| "grad_norm": 0.021501585841178894, | |
| "learning_rate": 2.5174825174825178e-05, | |
| "loss": 0.8867, | |
| "mean_token_accuracy": 0.7202873826026917, | |
| "num_tokens": 68163799.0, | |
| "step": 131 | |
| }, | |
| { | |
| "entropy": 0.8815573453903198, | |
| "epoch": 0.8918918918918919, | |
| "grad_norm": 0.02011815272271633, | |
| "learning_rate": 2.377622377622378e-05, | |
| "loss": 0.8755, | |
| "mean_token_accuracy": 0.722142219543457, | |
| "num_tokens": 68684635.0, | |
| "step": 132 | |
| }, | |
| { | |
| "entropy": 0.8770313262939453, | |
| "epoch": 0.8986486486486487, | |
| "grad_norm": 0.021030854433774948, | |
| "learning_rate": 2.237762237762238e-05, | |
| "loss": 0.8726, | |
| "mean_token_accuracy": 0.7231403589248657, | |
| "num_tokens": 69205861.0, | |
| "step": 133 | |
| }, | |
| { | |
| "entropy": 0.8650751709938049, | |
| "epoch": 0.9054054054054054, | |
| "grad_norm": 0.020364264026284218, | |
| "learning_rate": 2.097902097902098e-05, | |
| "loss": 0.8633, | |
| "mean_token_accuracy": 0.7260516881942749, | |
| "num_tokens": 69728254.0, | |
| "step": 134 | |
| }, | |
| { | |
| "entropy": 0.9033790230751038, | |
| "epoch": 0.9121621621621622, | |
| "grad_norm": 0.02128477208316326, | |
| "learning_rate": 1.958041958041958e-05, | |
| "loss": 0.9083, | |
| "mean_token_accuracy": 0.7141423225402832, | |
| "num_tokens": 70248446.0, | |
| "step": 135 | |
| }, | |
| { | |
| "entropy": 0.8698260188102722, | |
| "epoch": 0.918918918918919, | |
| "grad_norm": 0.020461006090044975, | |
| "learning_rate": 1.8181818181818182e-05, | |
| "loss": 0.8678, | |
| "mean_token_accuracy": 0.7250317335128784, | |
| "num_tokens": 70768977.0, | |
| "step": 136 | |
| }, | |
| { | |
| "entropy": 0.9101998805999756, | |
| "epoch": 0.9256756756756757, | |
| "grad_norm": 0.021351408213377, | |
| "learning_rate": 1.6783216783216786e-05, | |
| "loss": 0.9156, | |
| "mean_token_accuracy": 0.7119852304458618, | |
| "num_tokens": 71285104.0, | |
| "step": 137 | |
| }, | |
| { | |
| "entropy": 0.8741437196731567, | |
| "epoch": 0.9324324324324325, | |
| "grad_norm": 0.02133285254240036, | |
| "learning_rate": 1.5384615384615387e-05, | |
| "loss": 0.8756, | |
| "mean_token_accuracy": 0.7225217819213867, | |
| "num_tokens": 71806248.0, | |
| "step": 138 | |
| }, | |
| { | |
| "entropy": 0.8736119270324707, | |
| "epoch": 0.9391891891891891, | |
| "grad_norm": 0.020086556673049927, | |
| "learning_rate": 1.3986013986013988e-05, | |
| "loss": 0.8683, | |
| "mean_token_accuracy": 0.7247164249420166, | |
| "num_tokens": 72328759.0, | |
| "step": 139 | |
| }, | |
| { | |
| "entropy": 0.8891340494155884, | |
| "epoch": 0.9459459459459459, | |
| "grad_norm": 0.02030119113624096, | |
| "learning_rate": 1.2587412587412589e-05, | |
| "loss": 0.886, | |
| "mean_token_accuracy": 0.720429539680481, | |
| "num_tokens": 72848818.0, | |
| "step": 140 | |
| }, | |
| { | |
| "entropy": 0.9049081802368164, | |
| "epoch": 0.9527027027027027, | |
| "grad_norm": 0.020596666261553764, | |
| "learning_rate": 1.118881118881119e-05, | |
| "loss": 0.9042, | |
| "mean_token_accuracy": 0.7161701321601868, | |
| "num_tokens": 73370480.0, | |
| "step": 141 | |
| }, | |
| { | |
| "entropy": 0.8795987367630005, | |
| "epoch": 0.9594594594594594, | |
| "grad_norm": 0.020133303478360176, | |
| "learning_rate": 9.79020979020979e-06, | |
| "loss": 0.8769, | |
| "mean_token_accuracy": 0.722213089466095, | |
| "num_tokens": 73892465.0, | |
| "step": 142 | |
| }, | |
| { | |
| "entropy": 0.9042908549308777, | |
| "epoch": 0.9662162162162162, | |
| "grad_norm": 0.020722530782222748, | |
| "learning_rate": 8.391608391608393e-06, | |
| "loss": 0.9, | |
| "mean_token_accuracy": 0.7148555517196655, | |
| "num_tokens": 74407048.0, | |
| "step": 143 | |
| }, | |
| { | |
| "entropy": 0.8909604549407959, | |
| "epoch": 0.972972972972973, | |
| "grad_norm": 0.020139718428254128, | |
| "learning_rate": 6.993006993006994e-06, | |
| "loss": 0.8875, | |
| "mean_token_accuracy": 0.7199447154998779, | |
| "num_tokens": 74927929.0, | |
| "step": 144 | |
| }, | |
| { | |
| "entropy": 0.8854581117630005, | |
| "epoch": 0.9797297297297297, | |
| "grad_norm": 0.020443160086870193, | |
| "learning_rate": 5.594405594405595e-06, | |
| "loss": 0.8828, | |
| "mean_token_accuracy": 0.7213336825370789, | |
| "num_tokens": 75448350.0, | |
| "step": 145 | |
| }, | |
| { | |
| "entropy": 0.9017068147659302, | |
| "epoch": 0.9864864864864865, | |
| "grad_norm": 0.02036883309483528, | |
| "learning_rate": 4.195804195804197e-06, | |
| "loss": 0.8969, | |
| "mean_token_accuracy": 0.7174678444862366, | |
| "num_tokens": 75968294.0, | |
| "step": 146 | |
| }, | |
| { | |
| "entropy": 0.8653884530067444, | |
| "epoch": 0.9932432432432432, | |
| "grad_norm": 0.020496118813753128, | |
| "learning_rate": 2.7972027972027974e-06, | |
| "loss": 0.862, | |
| "mean_token_accuracy": 0.725719153881073, | |
| "num_tokens": 76484227.0, | |
| "step": 147 | |
| }, | |
| { | |
| "entropy": 0.8869370818138123, | |
| "epoch": 1.0, | |
| "grad_norm": 0.020335717126727104, | |
| "learning_rate": 1.3986013986013987e-06, | |
| "loss": 0.8832, | |
| "mean_token_accuracy": 0.72120600938797, | |
| "num_tokens": 77005516.0, | |
| "step": 148 | |
| }, | |
| { | |
| "epoch": 1.0, | |
| "step": 148, | |
| "total_flos": 3.219089170299355e+18, | |
| "train_loss": 0.901587930080053, | |
| "train_runtime": 1651.2377, | |
| "train_samples_per_second": 5.736, | |
| "train_steps_per_second": 0.09 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 148, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 1, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": false, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 3.219089170299355e+18, | |
| "train_batch_size": 8, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |