{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8333333333333334, "eval_steps": 500, "global_step": 80, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio": 0.0, "completion_length": 153.1875, "epoch": 0.010416666666666666, "grad_norm": 2.2964696301344203, "kl": 0.0008754730224609375, "learning_rate": 1e-06, "loss": -0.0104, "num_tokens": 44706.0, "reward": 0.46666670590639114, "reward_std": 0.7099685594439507, "rewards/warm_up_reward/mean": 0.3888888955116272, "rewards/warm_up_reward/std": 0.7168056517839432, "step": 1 }, { "clip_ratio": 0.0, "completion_length": 147.95833587646484, "epoch": 0.020833333333333332, "grad_norm": 2.332332033549255, "kl": 0.0011844635009765625, "learning_rate": 1e-06, "loss": 0.0071, "num_tokens": 88904.0, "reward": 0.5250000506639481, "reward_std": 0.748512014746666, "rewards/warm_up_reward/mean": 0.4375, "rewards/warm_up_reward/std": 0.6997176110744476, "step": 2 }, { "clip_ratio": 0.0, "completion_length": 154.00000381469727, "epoch": 0.03125, "grad_norm": 2.04671487874671, "kl": 0.0013523101806640625, "learning_rate": 1e-06, "loss": 0.0124, "num_tokens": 133634.0, "reward": 0.500000037252903, "reward_std": 0.667382538318634, "rewards/warm_up_reward/mean": 0.416666679084301, "rewards/warm_up_reward/std": 0.7197580486536026, "step": 3 }, { "clip_ratio": 0.0, "completion_length": 151.69792556762695, "epoch": 0.041666666666666664, "grad_norm": 2.2153093232246226, "kl": 0.00243377685546875, "learning_rate": 1e-06, "loss": 0.0232, "num_tokens": 178089.0, "reward": 0.43125002086162567, "reward_std": 0.662388876080513, "rewards/warm_up_reward/mean": 0.359375, "rewards/warm_up_reward/std": 0.6697845309972763, "step": 4 }, { "clip_ratio": 0.0, "completion_length": 148.84375762939453, "epoch": 0.052083333333333336, "grad_norm": 2.4723944423129782, "kl": 0.004669189453125, "learning_rate": 1e-06, "loss": 0.0002, "num_tokens": 222360.0, "reward": 0.6250000596046448, "reward_std": 0.8341160118579865, "rewards/warm_up_reward/mean": 0.5208333358168602, "rewards/warm_up_reward/std": 0.7749323397874832, "step": 5 }, { "clip_ratio": 0.0, "completion_length": 147.4791717529297, "epoch": 0.0625, "grad_norm": 2.452268566541841, "kl": 0.00980377197265625, "learning_rate": 1e-06, "loss": 0.05, "num_tokens": 266434.0, "reward": 0.5625000298023224, "reward_std": 0.8674589395523071, "rewards/warm_up_reward/mean": 0.46875, "rewards/warm_up_reward/std": 0.7468476742506027, "step": 6 }, { "clip_ratio": 0.0, "completion_length": 147.79166793823242, "epoch": 0.07291666666666667, "grad_norm": 55.84129867431237, "kl": 0.194091796875, "learning_rate": 1e-06, "loss": 0.0511, "num_tokens": 310532.0, "reward": 0.9333333820104599, "reward_std": 0.9551109671592712, "rewards/warm_up_reward/mean": 0.7777777910232544, "rewards/warm_up_reward/std": 0.8151216059923172, "step": 7 }, { "clip_ratio": 0.0, "completion_length": 157.65625381469727, "epoch": 0.08333333333333333, "grad_norm": 29.072228981795064, "kl": 0.1204376220703125, "learning_rate": 1e-06, "loss": 0.0653, "num_tokens": 355673.0, "reward": 0.8281250894069672, "reward_std": 0.9019797444343567, "rewards/warm_up_reward/mean": 0.6901041716337204, "rewards/warm_up_reward/std": 0.800490528345108, "step": 8 }, { "clip_ratio": 0.0, "completion_length": 147.62500381469727, "epoch": 0.09375, "grad_norm": 6.698990666563141, "kl": 0.04058837890625, "learning_rate": 1e-06, "loss": 0.0742, "num_tokens": 399929.0, "reward": 0.6916667073965073, "reward_std": 0.7857400476932526, "rewards/warm_up_reward/mean": 0.5763888955116272, "rewards/warm_up_reward/std": 0.7836765646934509, "step": 9 }, { "clip_ratio": 0.0, "completion_length": 150.2291717529297, "epoch": 0.10416666666666667, "grad_norm": 5.758187436239667, "kl": 0.1109619140625, "learning_rate": 1e-06, "loss": 0.0303, "num_tokens": 444279.0, "reward": 0.9000000357627869, "reward_std": 0.9381224364042282, "rewards/warm_up_reward/mean": 0.75, "rewards/warm_up_reward/std": 0.820982426404953, "step": 10 }, { "clip_ratio": 0.0, "completion_length": 150.67708587646484, "epoch": 0.11458333333333333, "grad_norm": 24.363046434997802, "kl": 0.2352294921875, "learning_rate": 1e-06, "loss": 0.0567, "num_tokens": 488708.0, "reward": 0.8645834177732468, "reward_std": 0.9693308770656586, "rewards/warm_up_reward/mean": 0.720486119389534, "rewards/warm_up_reward/std": 0.8197668194770813, "step": 11 }, { "clip_ratio": 0.0, "completion_length": 155.2916717529297, "epoch": 0.125, "grad_norm": 3.1714199544174053, "kl": 0.08984375, "learning_rate": 1e-06, "loss": 0.0481, "num_tokens": 533526.0, "reward": 1.0781250596046448, "reward_std": 0.9790745824575424, "rewards/warm_up_reward/mean": 0.8984375, "rewards/warm_up_reward/std": 0.8148495256900787, "step": 12 }, { "clip_ratio": 0.0, "completion_length": 140.2395896911621, "epoch": 0.13541666666666666, "grad_norm": 5.687529927842251, "kl": 0.12872314453125, "learning_rate": 1e-06, "loss": 0.0633, "num_tokens": 576965.0, "reward": 0.9635417610406876, "reward_std": 0.9496043026447296, "rewards/warm_up_reward/mean": 0.802951380610466, "rewards/warm_up_reward/std": 0.7969174236059189, "step": 13 }, { "clip_ratio": 0.0, "completion_length": 137.87500762939453, "epoch": 0.14583333333333334, "grad_norm": 4.785248899065895, "kl": 0.09088134765625, "learning_rate": 1e-06, "loss": 0.0956, "num_tokens": 620003.0, "reward": 0.8687500655651093, "reward_std": 0.8496406525373459, "rewards/warm_up_reward/mean": 0.7239583283662796, "rewards/warm_up_reward/std": 0.7836050242185593, "step": 14 }, { "clip_ratio": 0.0, "completion_length": 140.87500381469727, "epoch": 0.15625, "grad_norm": 3.043938278706536, "kl": 0.06243896484375, "learning_rate": 1e-06, "loss": 0.048, "num_tokens": 663347.0, "reward": 0.8843750804662704, "reward_std": 0.9135490357875824, "rewards/warm_up_reward/mean": 0.7369791716337204, "rewards/warm_up_reward/std": 0.8226732462644577, "step": 15 }, { "clip_ratio": 0.0, "completion_length": 145.0833396911621, "epoch": 0.16666666666666666, "grad_norm": 4.964409464782977, "kl": 0.0794677734375, "learning_rate": 1e-06, "loss": 0.0574, "num_tokens": 707341.0, "reward": 0.9437500536441803, "reward_std": 0.8649309277534485, "rewards/warm_up_reward/mean": 0.7864583283662796, "rewards/warm_up_reward/std": 0.795333594083786, "step": 16 }, { "clip_ratio": 0.0, "completion_length": 143.76041793823242, "epoch": 0.17708333333333334, "grad_norm": 2.5933725635254965, "kl": 0.0601806640625, "learning_rate": 1e-06, "loss": 0.0293, "num_tokens": 751028.0, "reward": 0.7000000327825546, "reward_std": 0.9639299660921097, "rewards/warm_up_reward/mean": 0.5833333283662796, "rewards/warm_up_reward/std": 0.7868598401546478, "step": 17 }, { "clip_ratio": 0.0, "completion_length": 140.90625381469727, "epoch": 0.1875, "grad_norm": 2.560263758795748, "kl": 0.0513916015625, "learning_rate": 1e-06, "loss": 0.0333, "num_tokens": 794597.0, "reward": 0.947916716337204, "reward_std": 0.9378172904253006, "rewards/warm_up_reward/mean": 0.7899305373430252, "rewards/warm_up_reward/std": 0.8166805952787399, "step": 18 }, { "clip_ratio": 0.0, "completion_length": 134.42708587646484, "epoch": 0.19791666666666666, "grad_norm": 2.827678010439988, "kl": 0.05694580078125, "learning_rate": 1e-06, "loss": 0.0819, "num_tokens": 837364.0, "reward": 0.9250000715255737, "reward_std": 0.9560818523168564, "rewards/warm_up_reward/mean": 0.7708333283662796, "rewards/warm_up_reward/std": 0.8091117739677429, "step": 19 }, { "clip_ratio": 0.0, "completion_length": 137.16666793823242, "epoch": 0.20833333333333334, "grad_norm": 2.92256682920272, "kl": 0.05963134765625, "learning_rate": 1e-06, "loss": 0.0522, "num_tokens": 880442.0, "reward": 0.9791667610406876, "reward_std": 0.9076904356479645, "rewards/warm_up_reward/mean": 0.8159722089767456, "rewards/warm_up_reward/std": 0.8102934062480927, "step": 20 }, { "clip_ratio": 0.0, "completion_length": 137.84375381469727, "epoch": 0.21875, "grad_norm": 2.864931852177023, "kl": 0.04974365234375, "learning_rate": 1e-06, "loss": 0.0248, "num_tokens": 923615.0, "reward": 0.9281250536441803, "reward_std": 0.9918985664844513, "rewards/warm_up_reward/mean": 0.7734375, "rewards/warm_up_reward/std": 0.8185379058122635, "step": 21 }, { "clip_ratio": 0.0, "completion_length": 130.8750057220459, "epoch": 0.22916666666666666, "grad_norm": 2.8348309851740456, "kl": 0.05242919921875, "learning_rate": 1e-06, "loss": 0.0427, "num_tokens": 966107.0, "reward": 1.0531250834465027, "reward_std": 0.9989801347255707, "rewards/warm_up_reward/mean": 0.8776041567325592, "rewards/warm_up_reward/std": 0.8102044314146042, "step": 22 }, { "clip_ratio": 0.0, "completion_length": 144.26041793823242, "epoch": 0.23958333333333334, "grad_norm": 2.6071380741149004, "kl": 0.06549072265625, "learning_rate": 1e-06, "loss": 0.0523, "num_tokens": 1009998.0, "reward": 0.9625000357627869, "reward_std": 0.9378929734230042, "rewards/warm_up_reward/mean": 0.8020833432674408, "rewards/warm_up_reward/std": 0.8023640215396881, "step": 23 }, { "clip_ratio": 0.0, "completion_length": 139.73958587646484, "epoch": 0.25, "grad_norm": 2.697375275936146, "kl": 0.05670166015625, "learning_rate": 1e-06, "loss": 0.0705, "num_tokens": 1053401.0, "reward": 1.031250074505806, "reward_std": 0.9829376488924026, "rewards/warm_up_reward/mean": 0.859375, "rewards/warm_up_reward/std": 0.813440352678299, "step": 24 }, { "clip_ratio": 0.0, "completion_length": 138.8854217529297, "epoch": 0.2604166666666667, "grad_norm": 85.52835351864486, "kl": 0.198486328125, "learning_rate": 1e-06, "loss": 0.0333, "num_tokens": 1096650.0, "reward": 1.089583471417427, "reward_std": 0.9481612741947174, "rewards/warm_up_reward/mean": 0.9079861044883728, "rewards/warm_up_reward/std": 0.7854074388742447, "step": 25 }, { "clip_ratio": 0.0, "completion_length": 130.62500381469727, "epoch": 0.2708333333333333, "grad_norm": 3.4064495832019297, "kl": 0.07513427734375, "learning_rate": 1e-06, "loss": 0.0544, "num_tokens": 1139058.0, "reward": 0.9885417520999908, "reward_std": 0.990489736199379, "rewards/warm_up_reward/mean": 0.8237847238779068, "rewards/warm_up_reward/std": 0.8235566318035126, "step": 26 }, { "clip_ratio": 0.0, "completion_length": 134.50000381469727, "epoch": 0.28125, "grad_norm": 118.32186343464612, "kl": 0.2930908203125, "learning_rate": 1e-06, "loss": 0.011, "num_tokens": 1181802.0, "reward": 1.0552084296941757, "reward_std": 0.868858814239502, "rewards/warm_up_reward/mean": 0.8793402910232544, "rewards/warm_up_reward/std": 0.8186527788639069, "step": 27 }, { "clip_ratio": 0.0, "completion_length": 140.8541717529297, "epoch": 0.2916666666666667, "grad_norm": 6.711195724360143, "kl": 0.1092529296875, "learning_rate": 1e-06, "loss": 0.0187, "num_tokens": 1225408.0, "reward": 0.9666667431592941, "reward_std": 0.9556048065423965, "rewards/warm_up_reward/mean": 0.8055555671453476, "rewards/warm_up_reward/std": 0.8192583322525024, "step": 28 }, { "clip_ratio": 0.0, "completion_length": 137.48958778381348, "epoch": 0.3020833333333333, "grad_norm": 4.712858469896548, "kl": 0.13214111328125, "learning_rate": 1e-06, "loss": 0.0356, "num_tokens": 1268481.0, "reward": 0.9000000655651093, "reward_std": 1.0170713812112808, "rewards/warm_up_reward/mean": 0.7500000149011612, "rewards/warm_up_reward/std": 0.8409183472394943, "step": 29 }, { "clip_ratio": 0.0, "completion_length": 142.09375381469727, "epoch": 0.3125, "grad_norm": 2.604660112727859, "kl": 0.063720703125, "learning_rate": 1e-06, "loss": -0.0023, "num_tokens": 1312008.0, "reward": 0.9000000059604645, "reward_std": 0.9869166016578674, "rewards/warm_up_reward/mean": 0.75, "rewards/warm_up_reward/std": 0.8337783664464951, "step": 30 }, { "clip_ratio": 0.0, "completion_length": 145.59375381469727, "epoch": 0.3229166666666667, "grad_norm": 6.185404235919849, "kl": 0.157470703125, "learning_rate": 1e-06, "loss": 0.0097, "num_tokens": 1356039.0, "reward": 1.0770834237337112, "reward_std": 0.9777155965566635, "rewards/warm_up_reward/mean": 0.8975694477558136, "rewards/warm_up_reward/std": 0.7948237210512161, "step": 31 }, { "clip_ratio": 0.0, "completion_length": 146.37500762939453, "epoch": 0.3333333333333333, "grad_norm": 2.4537091049295956, "kl": 0.06231689453125, "learning_rate": 1e-06, "loss": 0.0046, "num_tokens": 1400007.0, "reward": 0.9812500327825546, "reward_std": 0.911426916718483, "rewards/warm_up_reward/mean": 0.8177083283662796, "rewards/warm_up_reward/std": 0.7836276739835739, "step": 32 }, { "clip_ratio": 0.0, "completion_length": 145.19792556762695, "epoch": 0.34375, "grad_norm": 3.0265092714690702, "kl": 0.061767578125, "learning_rate": 1e-06, "loss": 0.0423, "num_tokens": 1443874.0, "reward": 1.0354167222976685, "reward_std": 0.9364243745803833, "rewards/warm_up_reward/mean": 0.8628472238779068, "rewards/warm_up_reward/std": 0.8118415027856827, "step": 33 }, { "clip_ratio": 0.0, "completion_length": 143.8645896911621, "epoch": 0.3541666666666667, "grad_norm": 2.4770416085087477, "kl": 0.05682373046875, "learning_rate": 1e-06, "loss": 0.0242, "num_tokens": 1487601.0, "reward": 0.9906250834465027, "reward_std": 1.0336193144321442, "rewards/warm_up_reward/mean": 0.8255208432674408, "rewards/warm_up_reward/std": 0.8268236815929413, "step": 34 }, { "clip_ratio": 0.0, "completion_length": 137.34375762939453, "epoch": 0.3645833333333333, "grad_norm": 2.787024849258649, "kl": 0.051513671875, "learning_rate": 1e-06, "loss": 0.0409, "num_tokens": 1530594.0, "reward": 1.0822917073965073, "reward_std": 0.8545732349157333, "rewards/warm_up_reward/mean": 0.9019097238779068, "rewards/warm_up_reward/std": 0.7748938798904419, "step": 35 }, { "clip_ratio": 0.0, "completion_length": 145.71875381469727, "epoch": 0.375, "grad_norm": 2.515335560097886, "kl": 0.0687255859375, "learning_rate": 1e-06, "loss": 0.0284, "num_tokens": 1574409.0, "reward": 0.9875000715255737, "reward_std": 0.9168877303600311, "rewards/warm_up_reward/mean": 0.8229166567325592, "rewards/warm_up_reward/std": 0.8267286717891693, "step": 36 }, { "clip_ratio": 0.0, "completion_length": 136.58333778381348, "epoch": 0.3854166666666667, "grad_norm": 11.790654099301383, "kl": 0.1207275390625, "learning_rate": 1e-06, "loss": 0.0652, "num_tokens": 1617503.0, "reward": 1.031250074505806, "reward_std": 0.9142753481864929, "rewards/warm_up_reward/mean": 0.859375, "rewards/warm_up_reward/std": 0.7878352403640747, "step": 37 }, { "clip_ratio": 0.0, "completion_length": 151.8645896911621, "epoch": 0.3958333333333333, "grad_norm": 3.4202024786499643, "kl": 0.08099365234375, "learning_rate": 1e-06, "loss": 0.0212, "num_tokens": 1661980.0, "reward": 0.6229167133569717, "reward_std": 0.8097958564758301, "rewards/warm_up_reward/mean": 0.5190972313284874, "rewards/warm_up_reward/std": 0.7617596387863159, "step": 38 }, { "clip_ratio": 0.0, "completion_length": 142.8854217529297, "epoch": 0.40625, "grad_norm": 2.4994048524913364, "kl": 0.06414794921875, "learning_rate": 1e-06, "loss": 0.0569, "num_tokens": 1705637.0, "reward": 1.0656251087784767, "reward_std": 0.8034301698207855, "rewards/warm_up_reward/mean": 0.8880208432674408, "rewards/warm_up_reward/std": 0.7390912175178528, "step": 39 }, { "clip_ratio": 0.0, "completion_length": 146.18750762939453, "epoch": 0.4166666666666667, "grad_norm": 2.4999575203414186, "kl": 0.0628662109375, "learning_rate": 1e-06, "loss": 0.0137, "num_tokens": 1749719.0, "reward": 1.0104167312383652, "reward_std": 0.9736887365579605, "rewards/warm_up_reward/mean": 0.8420138955116272, "rewards/warm_up_reward/std": 0.822531133890152, "step": 40 }, { "clip_ratio": 0.0, "completion_length": 143.9479217529297, "epoch": 0.4270833333333333, "grad_norm": 2.2639443093591205, "kl": 0.0675048828125, "learning_rate": 1e-06, "loss": 0.0087, "num_tokens": 1793436.0, "reward": 0.9229167401790619, "reward_std": 0.9153347015380859, "rewards/warm_up_reward/mean": 0.7690972089767456, "rewards/warm_up_reward/std": 0.8183709383010864, "step": 41 }, { "clip_ratio": 0.0, "completion_length": 147.58333587646484, "epoch": 0.4375, "grad_norm": 2.5193978009218965, "kl": 0.07177734375, "learning_rate": 1e-06, "loss": 0.0392, "num_tokens": 1837562.0, "reward": 1.006250038743019, "reward_std": 0.8919505327939987, "rewards/warm_up_reward/mean": 0.8385416716337204, "rewards/warm_up_reward/std": 0.8046689182519913, "step": 42 }, { "clip_ratio": 0.0, "completion_length": 141.37500381469727, "epoch": 0.4479166666666667, "grad_norm": 2.5496207567669034, "kl": 0.06622314453125, "learning_rate": 1e-06, "loss": 0.0452, "num_tokens": 1880972.0, "reward": 1.068750038743019, "reward_std": 0.9111972749233246, "rewards/warm_up_reward/mean": 0.890625, "rewards/warm_up_reward/std": 0.7989336252212524, "step": 43 }, { "clip_ratio": 0.0, "completion_length": 130.83333587646484, "epoch": 0.4583333333333333, "grad_norm": 5.245713134644536, "kl": 0.0999755859375, "learning_rate": 1e-06, "loss": -0.0051, "num_tokens": 1923460.0, "reward": 0.929166704416275, "reward_std": 0.8137651234865189, "rewards/warm_up_reward/mean": 0.7743055373430252, "rewards/warm_up_reward/std": 0.8136637955904007, "step": 44 }, { "clip_ratio": 0.0, "completion_length": 144.2604217529297, "epoch": 0.46875, "grad_norm": 6.317144379422631, "kl": 0.10107421875, "learning_rate": 1e-06, "loss": -0.0175, "num_tokens": 1967315.0, "reward": 1.1012500673532486, "reward_std": 0.7948171943426132, "rewards/warm_up_reward/mean": 0.9177083224058151, "rewards/warm_up_reward/std": 0.7791551500558853, "step": 45 }, { "clip_ratio": 0.0, "completion_length": 141.5104217529297, "epoch": 0.4791666666666667, "grad_norm": 2.8932825846308172, "kl": 0.0859375, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 2010882.0, "reward": 1.0656251162290573, "reward_std": 0.9644656330347061, "rewards/warm_up_reward/mean": 0.8880208432674408, "rewards/warm_up_reward/std": 0.8194199502468109, "step": 46 }, { "clip_ratio": 0.0, "completion_length": 140.21875762939453, "epoch": 0.4895833333333333, "grad_norm": 2.9093719889125174, "kl": 0.1007080078125, "learning_rate": 1e-06, "loss": -0.0275, "num_tokens": 2054307.0, "reward": 0.991666704416275, "reward_std": 0.9324973523616791, "rewards/warm_up_reward/mean": 0.8263888955116272, "rewards/warm_up_reward/std": 0.8247981667518616, "step": 47 }, { "clip_ratio": 0.0, "completion_length": 136.5729217529297, "epoch": 0.5, "grad_norm": 2.3209209113606146, "kl": 0.08441162109375, "learning_rate": 1e-06, "loss": 0.001, "num_tokens": 2097424.0, "reward": 1.1010417491197586, "reward_std": 0.8305719494819641, "rewards/warm_up_reward/mean": 0.9175347238779068, "rewards/warm_up_reward/std": 0.7910451591014862, "step": 48 }, { "clip_ratio": 0.0, "completion_length": 142.50000762939453, "epoch": 0.5104166666666666, "grad_norm": 2.470827147583925, "kl": 0.08355712890625, "learning_rate": 1e-06, "loss": 0.0433, "num_tokens": 2141092.0, "reward": 1.0208334028720856, "reward_std": 0.9052031934261322, "rewards/warm_up_reward/mean": 0.8506944477558136, "rewards/warm_up_reward/std": 0.8133653849363327, "step": 49 }, { "clip_ratio": 0.0, "completion_length": 138.58333587646484, "epoch": 0.5208333333333334, "grad_norm": 2.314915736053297, "kl": 0.0765380859375, "learning_rate": 1e-06, "loss": 0.0188, "num_tokens": 2184306.0, "reward": 1.050000086426735, "reward_std": 0.9242848604917526, "rewards/warm_up_reward/mean": 0.8749999850988388, "rewards/warm_up_reward/std": 0.7892495840787888, "step": 50 }, { "clip_ratio": 0.0, "completion_length": 138.92708778381348, "epoch": 0.53125, "grad_norm": 16.28250872141934, "kl": 0.26171875, "learning_rate": 1e-06, "loss": 0.0328, "num_tokens": 2227673.0, "reward": 1.1312500685453415, "reward_std": 1.0009342432022095, "rewards/warm_up_reward/mean": 0.9427083432674408, "rewards/warm_up_reward/std": 0.8114291131496429, "step": 51 }, { "clip_ratio": 0.0, "completion_length": 140.92708587646484, "epoch": 0.5416666666666666, "grad_norm": 2.4052567403889364, "kl": 0.0528564453125, "learning_rate": 1e-06, "loss": 0.0201, "num_tokens": 2271064.0, "reward": 1.2375000417232513, "reward_std": 0.8487301468849182, "rewards/warm_up_reward/mean": 1.03125, "rewards/warm_up_reward/std": 0.7739475220441818, "step": 52 }, { "clip_ratio": 0.0, "completion_length": 146.1458396911621, "epoch": 0.5520833333333334, "grad_norm": 2.4582483635661467, "kl": 0.05413818359375, "learning_rate": 1e-06, "loss": 0.0429, "num_tokens": 2315052.0, "reward": 1.1104167699813843, "reward_std": 0.9700468927621841, "rewards/warm_up_reward/mean": 0.9253472238779068, "rewards/warm_up_reward/std": 0.7955830246210098, "step": 53 }, { "clip_ratio": 0.0, "completion_length": 145.2083396911621, "epoch": 0.5625, "grad_norm": 2.4577305981193165, "kl": 0.0728759765625, "learning_rate": 1e-06, "loss": -0.0131, "num_tokens": 2358962.0, "reward": 1.1750001013278961, "reward_std": 0.9326367676258087, "rewards/warm_up_reward/mean": 0.9791666716337204, "rewards/warm_up_reward/std": 0.7897387892007828, "step": 54 }, { "clip_ratio": 0.0, "completion_length": 145.7916717529297, "epoch": 0.5729166666666666, "grad_norm": 2.302326467124225, "kl": 0.0810546875, "learning_rate": 1e-06, "loss": 0.0291, "num_tokens": 2402922.0, "reward": 0.9875000566244125, "reward_std": 1.0013651847839355, "rewards/warm_up_reward/mean": 0.8229166716337204, "rewards/warm_up_reward/std": 0.8247637003660202, "step": 55 }, { "clip_ratio": 0.0, "completion_length": 147.9479217529297, "epoch": 0.5833333333333334, "grad_norm": 2.580316913788417, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0509, "num_tokens": 2447065.0, "reward": 1.183750033378601, "reward_std": 0.9039967954158783, "rewards/warm_up_reward/mean": 0.9864583313465118, "rewards/warm_up_reward/std": 0.787983849644661, "step": 56 }, { "clip_ratio": 0.0, "completion_length": 142.05208587646484, "epoch": 0.59375, "grad_norm": 2.748934001160214, "kl": 0.06414794921875, "learning_rate": 1e-06, "loss": -0.0078, "num_tokens": 2490552.0, "reward": 1.07750004529953, "reward_std": 0.9763932228088379, "rewards/warm_up_reward/mean": 0.8979166746139526, "rewards/warm_up_reward/std": 0.8105349242687225, "step": 57 }, { "clip_ratio": 0.0, "completion_length": 146.0729217529297, "epoch": 0.6041666666666666, "grad_norm": 2.279728805159882, "kl": 0.05682373046875, "learning_rate": 1e-06, "loss": 0.0054, "num_tokens": 2534503.0, "reward": 1.2562500685453415, "reward_std": 0.8728772848844528, "rewards/warm_up_reward/mean": 1.046875, "rewards/warm_up_reward/std": 0.715716764330864, "step": 58 }, { "clip_ratio": 0.0, "completion_length": 136.77083587646484, "epoch": 0.6145833333333334, "grad_norm": 2.301940905359112, "kl": 0.0634765625, "learning_rate": 1e-06, "loss": 0.0198, "num_tokens": 2577603.0, "reward": 1.2200001031160355, "reward_std": 0.9238942861557007, "rewards/warm_up_reward/mean": 1.0166666805744171, "rewards/warm_up_reward/std": 0.7837828695774078, "step": 59 }, { "clip_ratio": 0.0, "completion_length": 149.3645896911621, "epoch": 0.625, "grad_norm": 2.3088797777576535, "kl": 0.0701904296875, "learning_rate": 1e-06, "loss": 0.0668, "num_tokens": 2621852.0, "reward": 1.0885417461395264, "reward_std": 0.9921838045120239, "rewards/warm_up_reward/mean": 0.9071180820465088, "rewards/warm_up_reward/std": 0.8187949508428574, "step": 60 }, { "clip_ratio": 0.0, "completion_length": 144.9791717529297, "epoch": 0.6354166666666666, "grad_norm": 2.3531788106957063, "kl": 0.0830078125, "learning_rate": 1e-06, "loss": 0.0715, "num_tokens": 2665584.0, "reward": 1.103541761636734, "reward_std": 0.8904776722192764, "rewards/warm_up_reward/mean": 0.9196180552244186, "rewards/warm_up_reward/std": 0.7996000051498413, "step": 61 }, { "clip_ratio": 0.0, "completion_length": 140.5416717529297, "epoch": 0.6458333333333334, "grad_norm": 2.3780245058749894, "kl": 0.0545654296875, "learning_rate": 1e-06, "loss": 0.0144, "num_tokens": 2709058.0, "reward": 1.2650001347064972, "reward_std": 0.8583473563194275, "rewards/warm_up_reward/mean": 1.0541666597127914, "rewards/warm_up_reward/std": 0.753357321023941, "step": 62 }, { "clip_ratio": 0.0, "completion_length": 150.5208396911621, "epoch": 0.65625, "grad_norm": 2.291724896507479, "kl": 0.05914306640625, "learning_rate": 1e-06, "loss": 0.0102, "num_tokens": 2753538.0, "reward": 1.2431251406669617, "reward_std": 0.8321643471717834, "rewards/warm_up_reward/mean": 1.0359375476837158, "rewards/warm_up_reward/std": 0.74222831428051, "step": 63 }, { "clip_ratio": 0.0, "completion_length": 143.2395896911621, "epoch": 0.6666666666666666, "grad_norm": 3.018275289311917, "kl": 0.085693359375, "learning_rate": 1e-06, "loss": 0.0078, "num_tokens": 2797253.0, "reward": 1.1802085041999817, "reward_std": 0.948539987206459, "rewards/warm_up_reward/mean": 0.9835069626569748, "rewards/warm_up_reward/std": 0.7748740911483765, "step": 64 }, { "clip_ratio": 0.0, "completion_length": 138.9791717529297, "epoch": 0.6770833333333334, "grad_norm": 2.219343433713507, "kl": 0.06298828125, "learning_rate": 1e-06, "loss": 0.0203, "num_tokens": 2840571.0, "reward": 1.2229167520999908, "reward_std": 0.783911868929863, "rewards/warm_up_reward/mean": 1.0190972089767456, "rewards/warm_up_reward/std": 0.7600821256637573, "step": 65 }, { "clip_ratio": 0.0, "completion_length": 152.8958396911621, "epoch": 0.6875, "grad_norm": 2.2126312475293877, "kl": 0.0704345703125, "learning_rate": 1e-06, "loss": 0.0425, "num_tokens": 2885201.0, "reward": 1.1691668182611465, "reward_std": 0.8853475451469421, "rewards/warm_up_reward/mean": 0.9743055552244186, "rewards/warm_up_reward/std": 0.7726792246103287, "step": 66 }, { "clip_ratio": 0.0, "completion_length": 147.61459350585938, "epoch": 0.6979166666666666, "grad_norm": 2.3729988144079748, "kl": 0.05316162109375, "learning_rate": 1e-06, "loss": 0.0235, "num_tokens": 2929396.0, "reward": 1.3312501311302185, "reward_std": 0.7987091541290283, "rewards/warm_up_reward/mean": 1.109375, "rewards/warm_up_reward/std": 0.7549550831317902, "step": 67 }, { "clip_ratio": 0.0, "completion_length": 144.1041717529297, "epoch": 0.7083333333333334, "grad_norm": 2.268946790543885, "kl": 0.04766845703125, "learning_rate": 1e-06, "loss": -0.0027, "num_tokens": 2973200.0, "reward": 1.325416773557663, "reward_std": 0.8379913568496704, "rewards/warm_up_reward/mean": 1.1045138835906982, "rewards/warm_up_reward/std": 0.7519785463809967, "step": 68 }, { "clip_ratio": 0.0, "completion_length": 140.4791717529297, "epoch": 0.71875, "grad_norm": 2.40410834715932, "kl": 0.0625, "learning_rate": 1e-06, "loss": 0.0508, "num_tokens": 3016728.0, "reward": 1.143750011920929, "reward_std": 0.8826231509447098, "rewards/warm_up_reward/mean": 0.953125, "rewards/warm_up_reward/std": 0.7572390139102936, "step": 69 }, { "clip_ratio": 0.0, "completion_length": 145.9791717529297, "epoch": 0.7291666666666666, "grad_norm": 2.377991017189631, "kl": 0.06512451171875, "learning_rate": 1e-06, "loss": 0.0505, "num_tokens": 3060790.0, "reward": 1.2854167819023132, "reward_std": 0.9141092300415039, "rewards/warm_up_reward/mean": 1.0711805671453476, "rewards/warm_up_reward/std": 0.7661919444799423, "step": 70 }, { "clip_ratio": 0.0, "completion_length": 144.7916717529297, "epoch": 0.7395833333333334, "grad_norm": 5.490360584080418, "kl": 0.068115234375, "learning_rate": 1e-06, "loss": 0.0061, "num_tokens": 3104672.0, "reward": 1.2937501072883606, "reward_std": 0.9385685622692108, "rewards/warm_up_reward/mean": 1.078125, "rewards/warm_up_reward/std": 0.7564428001642227, "step": 71 }, { "clip_ratio": 0.0, "completion_length": 132.94792366027832, "epoch": 0.75, "grad_norm": 2.705450393270237, "kl": 0.0748291015625, "learning_rate": 1e-06, "loss": 0.0032, "num_tokens": 3147441.0, "reward": 1.2687500715255737, "reward_std": 0.9319685697555542, "rewards/warm_up_reward/mean": 1.0572916567325592, "rewards/warm_up_reward/std": 0.7771977633237839, "step": 72 }, { "clip_ratio": 0.0, "completion_length": 151.91667556762695, "epoch": 0.7604166666666666, "grad_norm": 2.254735299608175, "kl": 0.07171630859375, "learning_rate": 1e-06, "loss": 0.0105, "num_tokens": 3192121.0, "reward": 1.0045834183692932, "reward_std": 0.8630332052707672, "rewards/warm_up_reward/mean": 0.8371527940034866, "rewards/warm_up_reward/std": 0.7969614416360855, "step": 73 }, { "clip_ratio": 0.0, "completion_length": 140.31250190734863, "epoch": 0.7708333333333334, "grad_norm": 2.4523220833578345, "kl": 0.05816650390625, "learning_rate": 1e-06, "loss": 0.0158, "num_tokens": 3235453.0, "reward": 1.1837501227855682, "reward_std": 0.9126418828964233, "rewards/warm_up_reward/mean": 0.9864583015441895, "rewards/warm_up_reward/std": 0.7863775044679642, "step": 74 }, { "clip_ratio": 0.0, "completion_length": 141.7395896911621, "epoch": 0.78125, "grad_norm": 2.2478331336635513, "kl": 0.0609130859375, "learning_rate": 1e-06, "loss": 0.0284, "num_tokens": 3278964.0, "reward": 1.2054167687892914, "reward_std": 0.9118891954421997, "rewards/warm_up_reward/mean": 1.0045138746500015, "rewards/warm_up_reward/std": 0.7829048186540604, "step": 75 }, { "clip_ratio": 0.0, "completion_length": 139.71875381469727, "epoch": 0.7916666666666666, "grad_norm": 2.6147718994418203, "kl": 0.0904541015625, "learning_rate": 1e-06, "loss": 0.0143, "num_tokens": 3322407.0, "reward": 1.2660417556762695, "reward_std": 0.9197860509157181, "rewards/warm_up_reward/mean": 1.0550346970558167, "rewards/warm_up_reward/std": 0.7622723281383514, "step": 76 }, { "clip_ratio": 0.0, "completion_length": 141.53125762939453, "epoch": 0.8020833333333334, "grad_norm": 2.530432648354832, "kl": 0.093505859375, "learning_rate": 1e-06, "loss": -0.0104, "num_tokens": 3366036.0, "reward": 1.1020834147930145, "reward_std": 0.902558371424675, "rewards/warm_up_reward/mean": 0.9184028059244156, "rewards/warm_up_reward/std": 0.7975014746189117, "step": 77 }, { "clip_ratio": 0.0, "completion_length": 146.0416717529297, "epoch": 0.8125, "grad_norm": 2.2386503327740015, "kl": 0.0811767578125, "learning_rate": 1e-06, "loss": 0.0184, "num_tokens": 3410026.0, "reward": 1.3406251072883606, "reward_std": 0.8862900286912918, "rewards/warm_up_reward/mean": 1.1171874850988388, "rewards/warm_up_reward/std": 0.7199237793684006, "step": 78 }, { "clip_ratio": 0.0, "completion_length": 133.6041717529297, "epoch": 0.8229166666666666, "grad_norm": 2.9736551256376744, "kl": 0.0992431640625, "learning_rate": 1e-06, "loss": 0.0029, "num_tokens": 3452750.0, "reward": 1.3556251227855682, "reward_std": 0.8310635536909103, "rewards/warm_up_reward/mean": 1.129687488079071, "rewards/warm_up_reward/std": 0.7288718819618225, "step": 79 }, { "clip_ratio": 0.0, "completion_length": 139.45834350585938, "epoch": 0.8333333333333334, "grad_norm": 4.081234898460602, "kl": 0.15234375, "learning_rate": 1e-06, "loss": -0.0154, "num_tokens": 3496018.0, "reward": 1.0906250923871994, "reward_std": 0.8768025040626526, "rewards/warm_up_reward/mean": 0.9088541716337204, "rewards/warm_up_reward/std": 0.8029628545045853, "step": 80 } ], "logging_steps": 1.0, "max_steps": 96, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 16, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 6, "trial_name": null, "trial_params": null }