{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 148, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "entropy": 0.8085902333259583, "epoch": 0.006756756756756757, "grad_norm": 0.14047187566757202, "learning_rate": 0.0, "loss": 1.0639, "mean_token_accuracy": 0.698029100894928, "num_tokens": 520290.0, "step": 1 }, { "entropy": 0.839566707611084, "epoch": 0.013513513513513514, "grad_norm": 0.1440640240907669, "learning_rate": 4e-05, "loss": 1.0952, "mean_token_accuracy": 0.687626302242279, "num_tokens": 1042076.0, "step": 2 }, { "entropy": 0.8076200485229492, "epoch": 0.02027027027027027, "grad_norm": 0.13277006149291992, "learning_rate": 8e-05, "loss": 1.0518, "mean_token_accuracy": 0.703219473361969, "num_tokens": 1561230.0, "step": 3 }, { "entropy": 0.8777214884757996, "epoch": 0.02702702702702703, "grad_norm": 0.11633922904729843, "learning_rate": 0.00012, "loss": 1.0688, "mean_token_accuracy": 0.6899521946907043, "num_tokens": 2082559.0, "step": 4 }, { "entropy": 0.9525601863861084, "epoch": 0.033783783783783786, "grad_norm": 0.11181541532278061, "learning_rate": 0.00016, "loss": 1.0118, "mean_token_accuracy": 0.6984938383102417, "num_tokens": 2604188.0, "step": 5 }, { "entropy": 1.030306100845337, "epoch": 0.04054054054054054, "grad_norm": 0.1037818044424057, "learning_rate": 0.0002, "loss": 0.9858, "mean_token_accuracy": 0.7052151560783386, "num_tokens": 3125438.0, "step": 6 }, { "entropy": 1.058103084564209, "epoch": 0.0472972972972973, "grad_norm": 0.08032543957233429, "learning_rate": 0.00019860139860139862, "loss": 0.9888, "mean_token_accuracy": 0.700404167175293, "num_tokens": 3647168.0, "step": 7 }, { "entropy": 1.0215051174163818, "epoch": 0.05405405405405406, "grad_norm": 0.11070238053798676, "learning_rate": 0.0001972027972027972, "loss": 0.9369, "mean_token_accuracy": 0.7151404619216919, "num_tokens": 4167149.0, "step": 8 }, { "entropy": 1.0240428447723389, "epoch": 0.060810810810810814, "grad_norm": 0.08369743078947067, "learning_rate": 0.00019580419580419583, "loss": 0.9682, "mean_token_accuracy": 0.7065526843070984, "num_tokens": 4685442.0, "step": 9 }, { "entropy": 0.9852886199951172, "epoch": 0.06756756756756757, "grad_norm": 0.045641954988241196, "learning_rate": 0.0001944055944055944, "loss": 0.9657, "mean_token_accuracy": 0.7040433287620544, "num_tokens": 5207318.0, "step": 10 }, { "entropy": 0.9090337157249451, "epoch": 0.07432432432432433, "grad_norm": 0.036005664616823196, "learning_rate": 0.000193006993006993, "loss": 0.9261, "mean_token_accuracy": 0.713391900062561, "num_tokens": 5728363.0, "step": 11 }, { "entropy": 0.8860379457473755, "epoch": 0.08108108108108109, "grad_norm": 0.0332878939807415, "learning_rate": 0.00019160839160839161, "loss": 0.9255, "mean_token_accuracy": 0.7144545912742615, "num_tokens": 6249685.0, "step": 12 }, { "entropy": 0.8721570372581482, "epoch": 0.08783783783783784, "grad_norm": 0.034468844532966614, "learning_rate": 0.00019020979020979023, "loss": 0.9262, "mean_token_accuracy": 0.7132186889648438, "num_tokens": 6771073.0, "step": 13 }, { "entropy": 0.8796703815460205, "epoch": 0.0945945945945946, "grad_norm": 0.03204338252544403, "learning_rate": 0.00018881118881118882, "loss": 0.9312, "mean_token_accuracy": 0.7109803557395935, "num_tokens": 7292911.0, "step": 14 }, { "entropy": 0.898871660232544, "epoch": 0.10135135135135136, "grad_norm": 0.02886817790567875, "learning_rate": 0.00018741258741258743, "loss": 0.9331, "mean_token_accuracy": 0.7116541266441345, "num_tokens": 7808250.0, "step": 15 }, { "entropy": 0.8994000554084778, "epoch": 0.10810810810810811, "grad_norm": 0.021798841655254364, "learning_rate": 0.00018601398601398602, "loss": 0.9064, "mean_token_accuracy": 0.717405378818512, "num_tokens": 8327930.0, "step": 16 }, { "entropy": 0.9358035922050476, "epoch": 0.11486486486486487, "grad_norm": 0.020242631435394287, "learning_rate": 0.00018461538461538463, "loss": 0.9271, "mean_token_accuracy": 0.7115734815597534, "num_tokens": 8843022.0, "step": 17 }, { "entropy": 0.9195488095283508, "epoch": 0.12162162162162163, "grad_norm": 0.02282623015344143, "learning_rate": 0.00018321678321678322, "loss": 0.8845, "mean_token_accuracy": 0.7234280109405518, "num_tokens": 9362231.0, "step": 18 }, { "entropy": 0.9643428325653076, "epoch": 0.12837837837837837, "grad_norm": 0.02410944364964962, "learning_rate": 0.00018181818181818183, "loss": 0.924, "mean_token_accuracy": 0.7115726470947266, "num_tokens": 9883300.0, "step": 19 }, { "entropy": 0.9452087879180908, "epoch": 0.13513513513513514, "grad_norm": 0.024604445323348045, "learning_rate": 0.00018041958041958042, "loss": 0.9038, "mean_token_accuracy": 0.7173081636428833, "num_tokens": 10403398.0, "step": 20 }, { "entropy": 0.9439239501953125, "epoch": 0.14189189189189189, "grad_norm": 0.022483721375465393, "learning_rate": 0.00017902097902097904, "loss": 0.9088, "mean_token_accuracy": 0.7151318192481995, "num_tokens": 10924591.0, "step": 21 }, { "entropy": 0.9263131022453308, "epoch": 0.14864864864864866, "grad_norm": 0.020426636561751366, "learning_rate": 0.00017762237762237762, "loss": 0.9073, "mean_token_accuracy": 0.7165982127189636, "num_tokens": 11445479.0, "step": 22 }, { "entropy": 0.8799407482147217, "epoch": 0.1554054054054054, "grad_norm": 0.015485940501093864, "learning_rate": 0.00017622377622377624, "loss": 0.8733, "mean_token_accuracy": 0.7245533466339111, "num_tokens": 11967017.0, "step": 23 }, { "entropy": 0.8788759708404541, "epoch": 0.16216216216216217, "grad_norm": 0.017046676948666573, "learning_rate": 0.00017482517482517485, "loss": 0.8949, "mean_token_accuracy": 0.7203764319419861, "num_tokens": 12488697.0, "step": 24 }, { "entropy": 0.8779318332672119, "epoch": 0.16891891891891891, "grad_norm": 0.018766306340694427, "learning_rate": 0.0001734265734265734, "loss": 0.8983, "mean_token_accuracy": 0.7172704935073853, "num_tokens": 13009235.0, "step": 25 }, { "entropy": 0.8834930658340454, "epoch": 0.17567567567567569, "grad_norm": 0.020971953868865967, "learning_rate": 0.00017202797202797203, "loss": 0.9148, "mean_token_accuracy": 0.714013397693634, "num_tokens": 13530899.0, "step": 26 }, { "entropy": 0.8656618595123291, "epoch": 0.18243243243243243, "grad_norm": 0.018672997131943703, "learning_rate": 0.00017062937062937064, "loss": 0.8888, "mean_token_accuracy": 0.7208808064460754, "num_tokens": 14052671.0, "step": 27 }, { "entropy": 0.9078769087791443, "epoch": 0.1891891891891892, "grad_norm": 0.017195429652929306, "learning_rate": 0.00016923076923076923, "loss": 0.9242, "mean_token_accuracy": 0.7125155329704285, "num_tokens": 14574400.0, "step": 28 }, { "entropy": 0.9030580520629883, "epoch": 0.19594594594594594, "grad_norm": 0.01640770211815834, "learning_rate": 0.00016783216783216784, "loss": 0.9033, "mean_token_accuracy": 0.7164167761802673, "num_tokens": 15094634.0, "step": 29 }, { "entropy": 0.9503644108772278, "epoch": 0.20270270270270271, "grad_norm": 0.015921002253890038, "learning_rate": 0.00016643356643356646, "loss": 0.9454, "mean_token_accuracy": 0.7049751877784729, "num_tokens": 15616494.0, "step": 30 }, { "entropy": 0.9235352873802185, "epoch": 0.20945945945945946, "grad_norm": 0.0211192574352026, "learning_rate": 0.00016503496503496504, "loss": 0.9083, "mean_token_accuracy": 0.7148462533950806, "num_tokens": 16137586.0, "step": 31 }, { "entropy": 0.9228708744049072, "epoch": 0.21621621621621623, "grad_norm": 0.016181398183107376, "learning_rate": 0.00016363636363636366, "loss": 0.9076, "mean_token_accuracy": 0.715546190738678, "num_tokens": 16658862.0, "step": 32 }, { "entropy": 0.9239456653594971, "epoch": 0.22297297297297297, "grad_norm": 0.018280163407325745, "learning_rate": 0.00016223776223776225, "loss": 0.8973, "mean_token_accuracy": 0.7179359197616577, "num_tokens": 17181392.0, "step": 33 }, { "entropy": 0.9081429243087769, "epoch": 0.22972972972972974, "grad_norm": 0.01677747257053852, "learning_rate": 0.00016083916083916083, "loss": 0.89, "mean_token_accuracy": 0.7211325764656067, "num_tokens": 17701533.0, "step": 34 }, { "entropy": 0.8966501951217651, "epoch": 0.23648648648648649, "grad_norm": 0.015331600792706013, "learning_rate": 0.00015944055944055945, "loss": 0.8834, "mean_token_accuracy": 0.7213951349258423, "num_tokens": 18222829.0, "step": 35 }, { "entropy": 0.8946911096572876, "epoch": 0.24324324324324326, "grad_norm": 0.015129225328564644, "learning_rate": 0.00015804195804195806, "loss": 0.8895, "mean_token_accuracy": 0.7205337882041931, "num_tokens": 18743612.0, "step": 36 }, { "entropy": 0.924943208694458, "epoch": 0.25, "grad_norm": 0.016918186098337173, "learning_rate": 0.00015664335664335665, "loss": 0.934, "mean_token_accuracy": 0.7084062099456787, "num_tokens": 19265066.0, "step": 37 }, { "entropy": 0.8968441486358643, "epoch": 0.25675675675675674, "grad_norm": 0.01708938553929329, "learning_rate": 0.00015524475524475526, "loss": 0.9031, "mean_token_accuracy": 0.7160695791244507, "num_tokens": 19785775.0, "step": 38 }, { "entropy": 0.9021536111831665, "epoch": 0.2635135135135135, "grad_norm": 0.01632198505103588, "learning_rate": 0.00015384615384615385, "loss": 0.9106, "mean_token_accuracy": 0.7141885161399841, "num_tokens": 20306201.0, "step": 39 }, { "entropy": 0.9144896864891052, "epoch": 0.2702702702702703, "grad_norm": 0.015601382590830326, "learning_rate": 0.00015244755244755244, "loss": 0.9177, "mean_token_accuracy": 0.7121185660362244, "num_tokens": 20827655.0, "step": 40 }, { "entropy": 0.905925989151001, "epoch": 0.27702702702702703, "grad_norm": 0.015409526415169239, "learning_rate": 0.00015104895104895105, "loss": 0.9078, "mean_token_accuracy": 0.7149533033370972, "num_tokens": 21347179.0, "step": 41 }, { "entropy": 0.9454245567321777, "epoch": 0.28378378378378377, "grad_norm": 0.01604871265590191, "learning_rate": 0.00014965034965034964, "loss": 0.9382, "mean_token_accuracy": 0.7070503234863281, "num_tokens": 21867430.0, "step": 42 }, { "entropy": 0.8873435258865356, "epoch": 0.2905405405405405, "grad_norm": 0.01542913168668747, "learning_rate": 0.00014825174825174825, "loss": 0.8767, "mean_token_accuracy": 0.722999632358551, "num_tokens": 22386589.0, "step": 43 }, { "entropy": 0.8967331647872925, "epoch": 0.2972972972972973, "grad_norm": 0.016037292778491974, "learning_rate": 0.00014685314685314687, "loss": 0.8846, "mean_token_accuracy": 0.7215548157691956, "num_tokens": 22901934.0, "step": 44 }, { "entropy": 0.8978803753852844, "epoch": 0.30405405405405406, "grad_norm": 0.015742763876914978, "learning_rate": 0.00014545454545454546, "loss": 0.8913, "mean_token_accuracy": 0.7180163264274597, "num_tokens": 23422539.0, "step": 45 }, { "entropy": 0.8771539926528931, "epoch": 0.3108108108108108, "grad_norm": 0.015965279191732407, "learning_rate": 0.00014405594405594407, "loss": 0.8759, "mean_token_accuracy": 0.7238014936447144, "num_tokens": 23943911.0, "step": 46 }, { "entropy": 0.9169310331344604, "epoch": 0.31756756756756754, "grad_norm": 0.01562552899122238, "learning_rate": 0.00014265734265734269, "loss": 0.9091, "mean_token_accuracy": 0.7138416171073914, "num_tokens": 24465171.0, "step": 47 }, { "entropy": 0.9132385849952698, "epoch": 0.32432432432432434, "grad_norm": 0.015293029136955738, "learning_rate": 0.00014125874125874125, "loss": 0.9102, "mean_token_accuracy": 0.7141794562339783, "num_tokens": 24986963.0, "step": 48 }, { "entropy": 0.9064264893531799, "epoch": 0.3310810810810811, "grad_norm": 0.01573154330253601, "learning_rate": 0.00013986013986013986, "loss": 0.9104, "mean_token_accuracy": 0.7143970727920532, "num_tokens": 25507940.0, "step": 49 }, { "entropy": 0.9247837662696838, "epoch": 0.33783783783783783, "grad_norm": 0.01566915400326252, "learning_rate": 0.00013846153846153847, "loss": 0.9244, "mean_token_accuracy": 0.7100151181221008, "num_tokens": 26029633.0, "step": 50 }, { "entropy": 0.8918017148971558, "epoch": 0.34459459459459457, "grad_norm": 0.016166144981980324, "learning_rate": 0.00013706293706293706, "loss": 0.8896, "mean_token_accuracy": 0.7206509709358215, "num_tokens": 26551330.0, "step": 51 }, { "entropy": 0.8915703296661377, "epoch": 0.35135135135135137, "grad_norm": 0.016258137300610542, "learning_rate": 0.00013566433566433568, "loss": 0.8838, "mean_token_accuracy": 0.7210925817489624, "num_tokens": 27071628.0, "step": 52 }, { "entropy": 0.910973310470581, "epoch": 0.3581081081081081, "grad_norm": 0.015521145425736904, "learning_rate": 0.0001342657342657343, "loss": 0.9066, "mean_token_accuracy": 0.7141618132591248, "num_tokens": 27592998.0, "step": 53 }, { "entropy": 0.9083860516548157, "epoch": 0.36486486486486486, "grad_norm": 0.01575257070362568, "learning_rate": 0.00013286713286713288, "loss": 0.9014, "mean_token_accuracy": 0.7151919603347778, "num_tokens": 28115257.0, "step": 54 }, { "entropy": 0.9026190638542175, "epoch": 0.3716216216216216, "grad_norm": 0.01594236120581627, "learning_rate": 0.00013146853146853147, "loss": 0.8943, "mean_token_accuracy": 0.7175394296646118, "num_tokens": 28636147.0, "step": 55 }, { "entropy": 0.8969719409942627, "epoch": 0.3783783783783784, "grad_norm": 0.016330119222402573, "learning_rate": 0.00013006993006993008, "loss": 0.8973, "mean_token_accuracy": 0.7174832820892334, "num_tokens": 29158715.0, "step": 56 }, { "entropy": 0.9341723322868347, "epoch": 0.38513513513513514, "grad_norm": 0.01603098399937153, "learning_rate": 0.00012867132867132867, "loss": 0.9331, "mean_token_accuracy": 0.7069593667984009, "num_tokens": 29681317.0, "step": 57 }, { "entropy": 0.8998703360557556, "epoch": 0.3918918918918919, "grad_norm": 0.016095977276563644, "learning_rate": 0.00012727272727272728, "loss": 0.8929, "mean_token_accuracy": 0.718587338924408, "num_tokens": 30202981.0, "step": 58 }, { "entropy": 0.9057657718658447, "epoch": 0.39864864864864863, "grad_norm": 0.016768187284469604, "learning_rate": 0.00012587412587412587, "loss": 0.9004, "mean_token_accuracy": 0.7168737649917603, "num_tokens": 30717612.0, "step": 59 }, { "entropy": 0.898177981376648, "epoch": 0.40540540540540543, "grad_norm": 0.01631537266075611, "learning_rate": 0.00012447552447552448, "loss": 0.8949, "mean_token_accuracy": 0.7172833681106567, "num_tokens": 31239946.0, "step": 60 }, { "entropy": 0.8847284317016602, "epoch": 0.41216216216216217, "grad_norm": 0.01665564626455307, "learning_rate": 0.0001230769230769231, "loss": 0.88, "mean_token_accuracy": 0.7212061285972595, "num_tokens": 31761778.0, "step": 61 }, { "entropy": 0.8945165872573853, "epoch": 0.4189189189189189, "grad_norm": 0.01744748093187809, "learning_rate": 0.0001216783216783217, "loss": 0.893, "mean_token_accuracy": 0.7190291285514832, "num_tokens": 32284035.0, "step": 62 }, { "entropy": 0.9090993404388428, "epoch": 0.42567567567567566, "grad_norm": 0.017030267044901848, "learning_rate": 0.00012027972027972027, "loss": 0.9082, "mean_token_accuracy": 0.7141250967979431, "num_tokens": 32804596.0, "step": 63 }, { "entropy": 0.9153857231140137, "epoch": 0.43243243243243246, "grad_norm": 0.016640154644846916, "learning_rate": 0.00011888111888111889, "loss": 0.9149, "mean_token_accuracy": 0.7134872078895569, "num_tokens": 33325570.0, "step": 64 }, { "entropy": 0.8797224164009094, "epoch": 0.4391891891891892, "grad_norm": 0.01646554283797741, "learning_rate": 0.00011748251748251749, "loss": 0.8781, "mean_token_accuracy": 0.7216721177101135, "num_tokens": 33846540.0, "step": 65 }, { "entropy": 0.8971645832061768, "epoch": 0.44594594594594594, "grad_norm": 0.016337089240550995, "learning_rate": 0.00011608391608391609, "loss": 0.8935, "mean_token_accuracy": 0.7179019451141357, "num_tokens": 34366891.0, "step": 66 }, { "entropy": 0.904381513595581, "epoch": 0.4527027027027027, "grad_norm": 0.017709996551275253, "learning_rate": 0.00011468531468531469, "loss": 0.8984, "mean_token_accuracy": 0.7170865535736084, "num_tokens": 34889245.0, "step": 67 }, { "entropy": 0.9063211679458618, "epoch": 0.4594594594594595, "grad_norm": 0.017201313748955727, "learning_rate": 0.0001132867132867133, "loss": 0.9015, "mean_token_accuracy": 0.7147793173789978, "num_tokens": 35404556.0, "step": 68 }, { "entropy": 0.8922293186187744, "epoch": 0.46621621621621623, "grad_norm": 0.016904350370168686, "learning_rate": 0.0001118881118881119, "loss": 0.888, "mean_token_accuracy": 0.719697892665863, "num_tokens": 35925065.0, "step": 69 }, { "entropy": 0.9034209251403809, "epoch": 0.47297297297297297, "grad_norm": 0.017079392448067665, "learning_rate": 0.00011048951048951048, "loss": 0.896, "mean_token_accuracy": 0.7172802686691284, "num_tokens": 36446132.0, "step": 70 }, { "entropy": 0.887146532535553, "epoch": 0.4797297297297297, "grad_norm": 0.01738973893225193, "learning_rate": 0.00010909090909090909, "loss": 0.8828, "mean_token_accuracy": 0.7213963270187378, "num_tokens": 36967214.0, "step": 71 }, { "entropy": 0.8868647813796997, "epoch": 0.4864864864864865, "grad_norm": 0.017861152067780495, "learning_rate": 0.0001076923076923077, "loss": 0.8773, "mean_token_accuracy": 0.7218159437179565, "num_tokens": 37488377.0, "step": 72 }, { "entropy": 0.9134626388549805, "epoch": 0.49324324324324326, "grad_norm": 0.017511827871203423, "learning_rate": 0.0001062937062937063, "loss": 0.9122, "mean_token_accuracy": 0.7130904793739319, "num_tokens": 38008473.0, "step": 73 }, { "entropy": 0.8864673376083374, "epoch": 0.5, "grad_norm": 0.017983395606279373, "learning_rate": 0.0001048951048951049, "loss": 0.8881, "mean_token_accuracy": 0.7188193798065186, "num_tokens": 38529415.0, "step": 74 }, { "entropy": 0.9193586111068726, "epoch": 0.5067567567567568, "grad_norm": 0.018130991607904434, "learning_rate": 0.00010349650349650351, "loss": 0.9259, "mean_token_accuracy": 0.7099359631538391, "num_tokens": 39049712.0, "step": 75 }, { "entropy": 0.8895922303199768, "epoch": 0.5135135135135135, "grad_norm": 0.017333725467324257, "learning_rate": 0.00010209790209790211, "loss": 0.8891, "mean_token_accuracy": 0.7198699116706848, "num_tokens": 39566565.0, "step": 76 }, { "entropy": 0.8800663948059082, "epoch": 0.5202702702702703, "grad_norm": 0.018249373883008957, "learning_rate": 0.00010069930069930071, "loss": 0.8692, "mean_token_accuracy": 0.724394679069519, "num_tokens": 40086868.0, "step": 77 }, { "entropy": 0.8940162658691406, "epoch": 0.527027027027027, "grad_norm": 0.01744958944618702, "learning_rate": 9.930069930069931e-05, "loss": 0.8898, "mean_token_accuracy": 0.7197460532188416, "num_tokens": 40607334.0, "step": 78 }, { "entropy": 0.8987851142883301, "epoch": 0.5337837837837838, "grad_norm": 0.018518365919589996, "learning_rate": 9.790209790209791e-05, "loss": 0.8904, "mean_token_accuracy": 0.7207316160202026, "num_tokens": 41126753.0, "step": 79 }, { "entropy": 0.8804575204849243, "epoch": 0.5405405405405406, "grad_norm": 0.018223201856017113, "learning_rate": 9.65034965034965e-05, "loss": 0.8763, "mean_token_accuracy": 0.7228670120239258, "num_tokens": 41647670.0, "step": 80 }, { "entropy": 0.8889448046684265, "epoch": 0.5472972972972973, "grad_norm": 0.018730709329247475, "learning_rate": 9.510489510489511e-05, "loss": 0.883, "mean_token_accuracy": 0.7205345630645752, "num_tokens": 42166183.0, "step": 81 }, { "entropy": 0.8915292024612427, "epoch": 0.5540540540540541, "grad_norm": 0.018218854442238808, "learning_rate": 9.370629370629372e-05, "loss": 0.8835, "mean_token_accuracy": 0.7211962342262268, "num_tokens": 42685978.0, "step": 82 }, { "entropy": 0.871468186378479, "epoch": 0.5608108108108109, "grad_norm": 0.0187361016869545, "learning_rate": 9.230769230769232e-05, "loss": 0.8697, "mean_token_accuracy": 0.7244738340377808, "num_tokens": 43203743.0, "step": 83 }, { "entropy": 0.8702860474586487, "epoch": 0.5675675675675675, "grad_norm": 0.018368471413850784, "learning_rate": 9.090909090909092e-05, "loss": 0.8698, "mean_token_accuracy": 0.7259374260902405, "num_tokens": 43725363.0, "step": 84 }, { "entropy": 0.8703951239585876, "epoch": 0.5743243243243243, "grad_norm": 0.01838189922273159, "learning_rate": 8.951048951048952e-05, "loss": 0.8743, "mean_token_accuracy": 0.7234249711036682, "num_tokens": 44246559.0, "step": 85 }, { "entropy": 0.8820457458496094, "epoch": 0.581081081081081, "grad_norm": 0.019160225987434387, "learning_rate": 8.811188811188812e-05, "loss": 0.8849, "mean_token_accuracy": 0.7206857800483704, "num_tokens": 44769062.0, "step": 86 }, { "entropy": 0.9152972102165222, "epoch": 0.5878378378378378, "grad_norm": 0.019004985690116882, "learning_rate": 8.67132867132867e-05, "loss": 0.9153, "mean_token_accuracy": 0.7120697498321533, "num_tokens": 45286787.0, "step": 87 }, { "entropy": 0.904834508895874, "epoch": 0.5945945945945946, "grad_norm": 0.018431641161441803, "learning_rate": 8.531468531468532e-05, "loss": 0.9011, "mean_token_accuracy": 0.7159322500228882, "num_tokens": 45807531.0, "step": 88 }, { "entropy": 0.9021150469779968, "epoch": 0.6013513513513513, "grad_norm": 0.01898609660565853, "learning_rate": 8.391608391608392e-05, "loss": 0.8956, "mean_token_accuracy": 0.7188680768013, "num_tokens": 46321183.0, "step": 89 }, { "entropy": 0.9032172560691833, "epoch": 0.6081081081081081, "grad_norm": 0.02004328928887844, "learning_rate": 8.251748251748252e-05, "loss": 0.8928, "mean_token_accuracy": 0.7190265655517578, "num_tokens": 46841033.0, "step": 90 }, { "entropy": 0.9000004529953003, "epoch": 0.6148648648648649, "grad_norm": 0.019782939925789833, "learning_rate": 8.111888111888112e-05, "loss": 0.8849, "mean_token_accuracy": 0.7195360064506531, "num_tokens": 47363499.0, "step": 91 }, { "entropy": 0.8834792375564575, "epoch": 0.6216216216216216, "grad_norm": 0.0185946486890316, "learning_rate": 7.972027972027972e-05, "loss": 0.8762, "mean_token_accuracy": 0.723339319229126, "num_tokens": 47884207.0, "step": 92 }, { "entropy": 0.9149696826934814, "epoch": 0.6283783783783784, "grad_norm": 0.018683424219489098, "learning_rate": 7.832167832167832e-05, "loss": 0.9166, "mean_token_accuracy": 0.7118301391601562, "num_tokens": 48405854.0, "step": 93 }, { "entropy": 0.8827645182609558, "epoch": 0.6351351351351351, "grad_norm": 0.02002580091357231, "learning_rate": 7.692307692307693e-05, "loss": 0.8823, "mean_token_accuracy": 0.7205896377563477, "num_tokens": 48923152.0, "step": 94 }, { "entropy": 0.8913484811782837, "epoch": 0.6418918918918919, "grad_norm": 0.01915843039751053, "learning_rate": 7.552447552447553e-05, "loss": 0.8938, "mean_token_accuracy": 0.7178173065185547, "num_tokens": 49445126.0, "step": 95 }, { "entropy": 0.8866020441055298, "epoch": 0.6486486486486487, "grad_norm": 0.020832480862736702, "learning_rate": 7.412587412587413e-05, "loss": 0.8917, "mean_token_accuracy": 0.7187622785568237, "num_tokens": 49967007.0, "step": 96 }, { "entropy": 0.8766802549362183, "epoch": 0.6554054054054054, "grad_norm": 0.019703548401594162, "learning_rate": 7.272727272727273e-05, "loss": 0.8714, "mean_token_accuracy": 0.7235036492347717, "num_tokens": 50488513.0, "step": 97 }, { "entropy": 0.9040693044662476, "epoch": 0.6621621621621622, "grad_norm": 0.0192877184599638, "learning_rate": 7.132867132867134e-05, "loss": 0.9035, "mean_token_accuracy": 0.7158621549606323, "num_tokens": 51008094.0, "step": 98 }, { "entropy": 0.8829696178436279, "epoch": 0.668918918918919, "grad_norm": 0.01927708089351654, "learning_rate": 6.993006993006993e-05, "loss": 0.8797, "mean_token_accuracy": 0.7219703793525696, "num_tokens": 51529469.0, "step": 99 }, { "entropy": 0.894615650177002, "epoch": 0.6756756756756757, "grad_norm": 0.01965499296784401, "learning_rate": 6.853146853146853e-05, "loss": 0.8882, "mean_token_accuracy": 0.7190660238265991, "num_tokens": 52050576.0, "step": 100 }, { "entropy": 0.8754645586013794, "epoch": 0.6824324324324325, "grad_norm": 0.019854635000228882, "learning_rate": 6.713286713286715e-05, "loss": 0.869, "mean_token_accuracy": 0.72502201795578, "num_tokens": 52571347.0, "step": 101 }, { "entropy": 0.8882539868354797, "epoch": 0.6891891891891891, "grad_norm": 0.020126935094594955, "learning_rate": 6.573426573426573e-05, "loss": 0.8766, "mean_token_accuracy": 0.7229670882225037, "num_tokens": 53092933.0, "step": 102 }, { "entropy": 0.9050229787826538, "epoch": 0.6959459459459459, "grad_norm": 0.019794149324297905, "learning_rate": 6.433566433566433e-05, "loss": 0.8965, "mean_token_accuracy": 0.7169359922409058, "num_tokens": 53614658.0, "step": 103 }, { "entropy": 0.900147557258606, "epoch": 0.7027027027027027, "grad_norm": 0.01930818147957325, "learning_rate": 6.293706293706293e-05, "loss": 0.8975, "mean_token_accuracy": 0.716631293296814, "num_tokens": 54134578.0, "step": 104 }, { "entropy": 0.8568655252456665, "epoch": 0.7094594594594594, "grad_norm": 0.019813908264040947, "learning_rate": 6.153846153846155e-05, "loss": 0.8549, "mean_token_accuracy": 0.7281835079193115, "num_tokens": 54656493.0, "step": 105 }, { "entropy": 0.8811562061309814, "epoch": 0.7162162162162162, "grad_norm": 0.02051232010126114, "learning_rate": 6.0139860139860136e-05, "loss": 0.8798, "mean_token_accuracy": 0.7215043902397156, "num_tokens": 55177423.0, "step": 106 }, { "entropy": 0.8680734634399414, "epoch": 0.722972972972973, "grad_norm": 0.02060469426214695, "learning_rate": 5.8741258741258744e-05, "loss": 0.8739, "mean_token_accuracy": 0.7240718603134155, "num_tokens": 55698753.0, "step": 107 }, { "entropy": 0.8927019238471985, "epoch": 0.7297297297297297, "grad_norm": 0.020770812407135963, "learning_rate": 5.7342657342657345e-05, "loss": 0.8931, "mean_token_accuracy": 0.7185073494911194, "num_tokens": 56208445.0, "step": 108 }, { "entropy": 0.8723157644271851, "epoch": 0.7364864864864865, "grad_norm": 0.020027851685881615, "learning_rate": 5.594405594405595e-05, "loss": 0.8677, "mean_token_accuracy": 0.724934995174408, "num_tokens": 56727266.0, "step": 109 }, { "entropy": 0.885744571685791, "epoch": 0.7432432432432432, "grad_norm": 0.019678086042404175, "learning_rate": 5.4545454545454546e-05, "loss": 0.8811, "mean_token_accuracy": 0.7209365367889404, "num_tokens": 57248166.0, "step": 110 }, { "entropy": 0.8876606225967407, "epoch": 0.75, "grad_norm": 0.020123396068811417, "learning_rate": 5.314685314685315e-05, "loss": 0.8834, "mean_token_accuracy": 0.7213138937950134, "num_tokens": 57769996.0, "step": 111 }, { "entropy": 0.9018759727478027, "epoch": 0.7567567567567568, "grad_norm": 0.02047978714108467, "learning_rate": 5.1748251748251755e-05, "loss": 0.8945, "mean_token_accuracy": 0.7177107930183411, "num_tokens": 58291779.0, "step": 112 }, { "entropy": 0.9049036502838135, "epoch": 0.7635135135135135, "grad_norm": 0.020464390516281128, "learning_rate": 5.0349650349650356e-05, "loss": 0.8984, "mean_token_accuracy": 0.7173320651054382, "num_tokens": 58805386.0, "step": 113 }, { "entropy": 0.8888832330703735, "epoch": 0.7702702702702703, "grad_norm": 0.01985686831176281, "learning_rate": 4.8951048951048956e-05, "loss": 0.882, "mean_token_accuracy": 0.7208877205848694, "num_tokens": 59324642.0, "step": 114 }, { "entropy": 0.9020118117332458, "epoch": 0.777027027027027, "grad_norm": 0.020026598125696182, "learning_rate": 4.755244755244756e-05, "loss": 0.8973, "mean_token_accuracy": 0.7182012796401978, "num_tokens": 59847011.0, "step": 115 }, { "entropy": 0.8986602425575256, "epoch": 0.7837837837837838, "grad_norm": 0.01975986920297146, "learning_rate": 4.615384615384616e-05, "loss": 0.8968, "mean_token_accuracy": 0.7171120047569275, "num_tokens": 60369152.0, "step": 116 }, { "entropy": 0.8851807117462158, "epoch": 0.7905405405405406, "grad_norm": 0.01993614062666893, "learning_rate": 4.475524475524476e-05, "loss": 0.8804, "mean_token_accuracy": 0.7222026586532593, "num_tokens": 60889245.0, "step": 117 }, { "entropy": 0.882935643196106, "epoch": 0.7972972972972973, "grad_norm": 0.01959838718175888, "learning_rate": 4.335664335664335e-05, "loss": 0.8797, "mean_token_accuracy": 0.7208013534545898, "num_tokens": 61407350.0, "step": 118 }, { "entropy": 0.8777122497558594, "epoch": 0.8040540540540541, "grad_norm": 0.0199885256588459, "learning_rate": 4.195804195804196e-05, "loss": 0.8715, "mean_token_accuracy": 0.725073516368866, "num_tokens": 61928742.0, "step": 119 }, { "entropy": 0.850617527961731, "epoch": 0.8108108108108109, "grad_norm": 0.020385252311825752, "learning_rate": 4.055944055944056e-05, "loss": 0.8499, "mean_token_accuracy": 0.730745792388916, "num_tokens": 62446355.0, "step": 120 }, { "entropy": 0.8720276355743408, "epoch": 0.8175675675675675, "grad_norm": 0.02067047357559204, "learning_rate": 3.916083916083916e-05, "loss": 0.8746, "mean_token_accuracy": 0.7231326699256897, "num_tokens": 62967935.0, "step": 121 }, { "entropy": 0.9207990169525146, "epoch": 0.8243243243243243, "grad_norm": 0.02032148465514183, "learning_rate": 3.776223776223776e-05, "loss": 0.922, "mean_token_accuracy": 0.710451066493988, "num_tokens": 63489615.0, "step": 122 }, { "entropy": 0.9047123193740845, "epoch": 0.831081081081081, "grad_norm": 0.020533205941319466, "learning_rate": 3.6363636363636364e-05, "loss": 0.9015, "mean_token_accuracy": 0.7166460752487183, "num_tokens": 64010947.0, "step": 123 }, { "entropy": 0.8477683067321777, "epoch": 0.8378378378378378, "grad_norm": 0.019841615110635757, "learning_rate": 3.4965034965034965e-05, "loss": 0.8415, "mean_token_accuracy": 0.7314550876617432, "num_tokens": 64531314.0, "step": 124 }, { "entropy": 0.8877344131469727, "epoch": 0.8445945945945946, "grad_norm": 0.01969732716679573, "learning_rate": 3.356643356643357e-05, "loss": 0.8858, "mean_token_accuracy": 0.7207072973251343, "num_tokens": 65053203.0, "step": 125 }, { "entropy": 0.8977670669555664, "epoch": 0.8513513513513513, "grad_norm": 0.01998847909271717, "learning_rate": 3.216783216783217e-05, "loss": 0.8914, "mean_token_accuracy": 0.7174409627914429, "num_tokens": 65574215.0, "step": 126 }, { "entropy": 0.8922737836837769, "epoch": 0.8581081081081081, "grad_norm": 0.02041775733232498, "learning_rate": 3.0769230769230774e-05, "loss": 0.8918, "mean_token_accuracy": 0.7177229523658752, "num_tokens": 66095688.0, "step": 127 }, { "entropy": 0.8866901397705078, "epoch": 0.8648648648648649, "grad_norm": 0.02134627476334572, "learning_rate": 2.9370629370629372e-05, "loss": 0.877, "mean_token_accuracy": 0.7219728827476501, "num_tokens": 66608762.0, "step": 128 }, { "entropy": 0.8762195110321045, "epoch": 0.8716216216216216, "grad_norm": 0.020530981943011284, "learning_rate": 2.7972027972027976e-05, "loss": 0.8705, "mean_token_accuracy": 0.7246843576431274, "num_tokens": 67128373.0, "step": 129 }, { "entropy": 0.8929407596588135, "epoch": 0.8783783783783784, "grad_norm": 0.021080242469906807, "learning_rate": 2.6573426573426574e-05, "loss": 0.883, "mean_token_accuracy": 0.7201827168464661, "num_tokens": 67645584.0, "step": 130 }, { "entropy": 0.8947334289550781, "epoch": 0.8851351351351351, "grad_norm": 0.021501585841178894, "learning_rate": 2.5174825174825178e-05, "loss": 0.8867, "mean_token_accuracy": 0.7202873826026917, "num_tokens": 68163799.0, "step": 131 }, { "entropy": 0.8815573453903198, "epoch": 0.8918918918918919, "grad_norm": 0.02011815272271633, "learning_rate": 2.377622377622378e-05, "loss": 0.8755, "mean_token_accuracy": 0.722142219543457, "num_tokens": 68684635.0, "step": 132 }, { "entropy": 0.8770313262939453, "epoch": 0.8986486486486487, "grad_norm": 0.021030854433774948, "learning_rate": 2.237762237762238e-05, "loss": 0.8726, "mean_token_accuracy": 0.7231403589248657, "num_tokens": 69205861.0, "step": 133 }, { "entropy": 0.8650751709938049, "epoch": 0.9054054054054054, "grad_norm": 0.020364264026284218, "learning_rate": 2.097902097902098e-05, "loss": 0.8633, "mean_token_accuracy": 0.7260516881942749, "num_tokens": 69728254.0, "step": 134 }, { "entropy": 0.9033790230751038, "epoch": 0.9121621621621622, "grad_norm": 0.02128477208316326, "learning_rate": 1.958041958041958e-05, "loss": 0.9083, "mean_token_accuracy": 0.7141423225402832, "num_tokens": 70248446.0, "step": 135 }, { "entropy": 0.8698260188102722, "epoch": 0.918918918918919, "grad_norm": 0.020461006090044975, "learning_rate": 1.8181818181818182e-05, "loss": 0.8678, "mean_token_accuracy": 0.7250317335128784, "num_tokens": 70768977.0, "step": 136 }, { "entropy": 0.9101998805999756, "epoch": 0.9256756756756757, "grad_norm": 0.021351408213377, "learning_rate": 1.6783216783216786e-05, "loss": 0.9156, "mean_token_accuracy": 0.7119852304458618, "num_tokens": 71285104.0, "step": 137 }, { "entropy": 0.8741437196731567, "epoch": 0.9324324324324325, "grad_norm": 0.02133285254240036, "learning_rate": 1.5384615384615387e-05, "loss": 0.8756, "mean_token_accuracy": 0.7225217819213867, "num_tokens": 71806248.0, "step": 138 }, { "entropy": 0.8736119270324707, "epoch": 0.9391891891891891, "grad_norm": 0.020086556673049927, "learning_rate": 1.3986013986013988e-05, "loss": 0.8683, "mean_token_accuracy": 0.7247164249420166, "num_tokens": 72328759.0, "step": 139 }, { "entropy": 0.8891340494155884, "epoch": 0.9459459459459459, "grad_norm": 0.02030119113624096, "learning_rate": 1.2587412587412589e-05, "loss": 0.886, "mean_token_accuracy": 0.720429539680481, "num_tokens": 72848818.0, "step": 140 }, { "entropy": 0.9049081802368164, "epoch": 0.9527027027027027, "grad_norm": 0.020596666261553764, "learning_rate": 1.118881118881119e-05, "loss": 0.9042, "mean_token_accuracy": 0.7161701321601868, "num_tokens": 73370480.0, "step": 141 }, { "entropy": 0.8795987367630005, "epoch": 0.9594594594594594, "grad_norm": 0.020133303478360176, "learning_rate": 9.79020979020979e-06, "loss": 0.8769, "mean_token_accuracy": 0.722213089466095, "num_tokens": 73892465.0, "step": 142 }, { "entropy": 0.9042908549308777, "epoch": 0.9662162162162162, "grad_norm": 0.020722530782222748, "learning_rate": 8.391608391608393e-06, "loss": 0.9, "mean_token_accuracy": 0.7148555517196655, "num_tokens": 74407048.0, "step": 143 }, { "entropy": 0.8909604549407959, "epoch": 0.972972972972973, "grad_norm": 0.020139718428254128, "learning_rate": 6.993006993006994e-06, "loss": 0.8875, "mean_token_accuracy": 0.7199447154998779, "num_tokens": 74927929.0, "step": 144 }, { "entropy": 0.8854581117630005, "epoch": 0.9797297297297297, "grad_norm": 0.020443160086870193, "learning_rate": 5.594405594405595e-06, "loss": 0.8828, "mean_token_accuracy": 0.7213336825370789, "num_tokens": 75448350.0, "step": 145 }, { "entropy": 0.9017068147659302, "epoch": 0.9864864864864865, "grad_norm": 0.02036883309483528, "learning_rate": 4.195804195804197e-06, "loss": 0.8969, "mean_token_accuracy": 0.7174678444862366, "num_tokens": 75968294.0, "step": 146 }, { "entropy": 0.8653884530067444, "epoch": 0.9932432432432432, "grad_norm": 0.020496118813753128, "learning_rate": 2.7972027972027974e-06, "loss": 0.862, "mean_token_accuracy": 0.725719153881073, "num_tokens": 76484227.0, "step": 147 }, { "entropy": 0.8869370818138123, "epoch": 1.0, "grad_norm": 0.020335717126727104, "learning_rate": 1.3986013986013987e-06, "loss": 0.8832, "mean_token_accuracy": 0.72120600938797, "num_tokens": 77005516.0, "step": 148 }, { "epoch": 1.0, "step": 148, "total_flos": 3.219089170299355e+18, "train_loss": 0.901587930080053, "train_runtime": 1651.2377, "train_samples_per_second": 5.736, "train_steps_per_second": 0.09 } ], "logging_steps": 1, "max_steps": 148, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.219089170299355e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }