| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.8333333333333334, |
| "eval_steps": 500, |
| "global_step": 80, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 153.1875, |
| "epoch": 0.010416666666666666, |
| "grad_norm": 2.2964696301344203, |
| "kl": 0.0008754730224609375, |
| "learning_rate": 1e-06, |
| "loss": -0.0104, |
| "num_tokens": 44706.0, |
| "reward": 0.46666670590639114, |
| "reward_std": 0.7099685594439507, |
| "rewards/warm_up_reward/mean": 0.3888888955116272, |
| "rewards/warm_up_reward/std": 0.7168056517839432, |
| "step": 1 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 147.95833587646484, |
| "epoch": 0.020833333333333332, |
| "grad_norm": 2.332332033549255, |
| "kl": 0.0011844635009765625, |
| "learning_rate": 1e-06, |
| "loss": 0.0071, |
| "num_tokens": 88904.0, |
| "reward": 0.5250000506639481, |
| "reward_std": 0.748512014746666, |
| "rewards/warm_up_reward/mean": 0.4375, |
| "rewards/warm_up_reward/std": 0.6997176110744476, |
| "step": 2 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 154.00000381469727, |
| "epoch": 0.03125, |
| "grad_norm": 2.04671487874671, |
| "kl": 0.0013523101806640625, |
| "learning_rate": 1e-06, |
| "loss": 0.0124, |
| "num_tokens": 133634.0, |
| "reward": 0.500000037252903, |
| "reward_std": 0.667382538318634, |
| "rewards/warm_up_reward/mean": 0.416666679084301, |
| "rewards/warm_up_reward/std": 0.7197580486536026, |
| "step": 3 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 151.69792556762695, |
| "epoch": 0.041666666666666664, |
| "grad_norm": 2.2153093232246226, |
| "kl": 0.00243377685546875, |
| "learning_rate": 1e-06, |
| "loss": 0.0232, |
| "num_tokens": 178089.0, |
| "reward": 0.43125002086162567, |
| "reward_std": 0.662388876080513, |
| "rewards/warm_up_reward/mean": 0.359375, |
| "rewards/warm_up_reward/std": 0.6697845309972763, |
| "step": 4 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 148.84375762939453, |
| "epoch": 0.052083333333333336, |
| "grad_norm": 2.4723944423129782, |
| "kl": 0.004669189453125, |
| "learning_rate": 1e-06, |
| "loss": 0.0002, |
| "num_tokens": 222360.0, |
| "reward": 0.6250000596046448, |
| "reward_std": 0.8341160118579865, |
| "rewards/warm_up_reward/mean": 0.5208333358168602, |
| "rewards/warm_up_reward/std": 0.7749323397874832, |
| "step": 5 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 147.4791717529297, |
| "epoch": 0.0625, |
| "grad_norm": 2.452268566541841, |
| "kl": 0.00980377197265625, |
| "learning_rate": 1e-06, |
| "loss": 0.05, |
| "num_tokens": 266434.0, |
| "reward": 0.5625000298023224, |
| "reward_std": 0.8674589395523071, |
| "rewards/warm_up_reward/mean": 0.46875, |
| "rewards/warm_up_reward/std": 0.7468476742506027, |
| "step": 6 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 147.79166793823242, |
| "epoch": 0.07291666666666667, |
| "grad_norm": 55.84129867431237, |
| "kl": 0.194091796875, |
| "learning_rate": 1e-06, |
| "loss": 0.0511, |
| "num_tokens": 310532.0, |
| "reward": 0.9333333820104599, |
| "reward_std": 0.9551109671592712, |
| "rewards/warm_up_reward/mean": 0.7777777910232544, |
| "rewards/warm_up_reward/std": 0.8151216059923172, |
| "step": 7 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 157.65625381469727, |
| "epoch": 0.08333333333333333, |
| "grad_norm": 29.072228981795064, |
| "kl": 0.1204376220703125, |
| "learning_rate": 1e-06, |
| "loss": 0.0653, |
| "num_tokens": 355673.0, |
| "reward": 0.8281250894069672, |
| "reward_std": 0.9019797444343567, |
| "rewards/warm_up_reward/mean": 0.6901041716337204, |
| "rewards/warm_up_reward/std": 0.800490528345108, |
| "step": 8 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 147.62500381469727, |
| "epoch": 0.09375, |
| "grad_norm": 6.698990666563141, |
| "kl": 0.04058837890625, |
| "learning_rate": 1e-06, |
| "loss": 0.0742, |
| "num_tokens": 399929.0, |
| "reward": 0.6916667073965073, |
| "reward_std": 0.7857400476932526, |
| "rewards/warm_up_reward/mean": 0.5763888955116272, |
| "rewards/warm_up_reward/std": 0.7836765646934509, |
| "step": 9 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 150.2291717529297, |
| "epoch": 0.10416666666666667, |
| "grad_norm": 5.758187436239667, |
| "kl": 0.1109619140625, |
| "learning_rate": 1e-06, |
| "loss": 0.0303, |
| "num_tokens": 444279.0, |
| "reward": 0.9000000357627869, |
| "reward_std": 0.9381224364042282, |
| "rewards/warm_up_reward/mean": 0.75, |
| "rewards/warm_up_reward/std": 0.820982426404953, |
| "step": 10 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 150.67708587646484, |
| "epoch": 0.11458333333333333, |
| "grad_norm": 24.363046434997802, |
| "kl": 0.2352294921875, |
| "learning_rate": 1e-06, |
| "loss": 0.0567, |
| "num_tokens": 488708.0, |
| "reward": 0.8645834177732468, |
| "reward_std": 0.9693308770656586, |
| "rewards/warm_up_reward/mean": 0.720486119389534, |
| "rewards/warm_up_reward/std": 0.8197668194770813, |
| "step": 11 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 155.2916717529297, |
| "epoch": 0.125, |
| "grad_norm": 3.1714199544174053, |
| "kl": 0.08984375, |
| "learning_rate": 1e-06, |
| "loss": 0.0481, |
| "num_tokens": 533526.0, |
| "reward": 1.0781250596046448, |
| "reward_std": 0.9790745824575424, |
| "rewards/warm_up_reward/mean": 0.8984375, |
| "rewards/warm_up_reward/std": 0.8148495256900787, |
| "step": 12 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 140.2395896911621, |
| "epoch": 0.13541666666666666, |
| "grad_norm": 5.687529927842251, |
| "kl": 0.12872314453125, |
| "learning_rate": 1e-06, |
| "loss": 0.0633, |
| "num_tokens": 576965.0, |
| "reward": 0.9635417610406876, |
| "reward_std": 0.9496043026447296, |
| "rewards/warm_up_reward/mean": 0.802951380610466, |
| "rewards/warm_up_reward/std": 0.7969174236059189, |
| "step": 13 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 137.87500762939453, |
| "epoch": 0.14583333333333334, |
| "grad_norm": 4.785248899065895, |
| "kl": 0.09088134765625, |
| "learning_rate": 1e-06, |
| "loss": 0.0956, |
| "num_tokens": 620003.0, |
| "reward": 0.8687500655651093, |
| "reward_std": 0.8496406525373459, |
| "rewards/warm_up_reward/mean": 0.7239583283662796, |
| "rewards/warm_up_reward/std": 0.7836050242185593, |
| "step": 14 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 140.87500381469727, |
| "epoch": 0.15625, |
| "grad_norm": 3.043938278706536, |
| "kl": 0.06243896484375, |
| "learning_rate": 1e-06, |
| "loss": 0.048, |
| "num_tokens": 663347.0, |
| "reward": 0.8843750804662704, |
| "reward_std": 0.9135490357875824, |
| "rewards/warm_up_reward/mean": 0.7369791716337204, |
| "rewards/warm_up_reward/std": 0.8226732462644577, |
| "step": 15 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 145.0833396911621, |
| "epoch": 0.16666666666666666, |
| "grad_norm": 4.964409464782977, |
| "kl": 0.0794677734375, |
| "learning_rate": 1e-06, |
| "loss": 0.0574, |
| "num_tokens": 707341.0, |
| "reward": 0.9437500536441803, |
| "reward_std": 0.8649309277534485, |
| "rewards/warm_up_reward/mean": 0.7864583283662796, |
| "rewards/warm_up_reward/std": 0.795333594083786, |
| "step": 16 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 143.76041793823242, |
| "epoch": 0.17708333333333334, |
| "grad_norm": 2.5933725635254965, |
| "kl": 0.0601806640625, |
| "learning_rate": 1e-06, |
| "loss": 0.0293, |
| "num_tokens": 751028.0, |
| "reward": 0.7000000327825546, |
| "reward_std": 0.9639299660921097, |
| "rewards/warm_up_reward/mean": 0.5833333283662796, |
| "rewards/warm_up_reward/std": 0.7868598401546478, |
| "step": 17 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 140.90625381469727, |
| "epoch": 0.1875, |
| "grad_norm": 2.560263758795748, |
| "kl": 0.0513916015625, |
| "learning_rate": 1e-06, |
| "loss": 0.0333, |
| "num_tokens": 794597.0, |
| "reward": 0.947916716337204, |
| "reward_std": 0.9378172904253006, |
| "rewards/warm_up_reward/mean": 0.7899305373430252, |
| "rewards/warm_up_reward/std": 0.8166805952787399, |
| "step": 18 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 134.42708587646484, |
| "epoch": 0.19791666666666666, |
| "grad_norm": 2.827678010439988, |
| "kl": 0.05694580078125, |
| "learning_rate": 1e-06, |
| "loss": 0.0819, |
| "num_tokens": 837364.0, |
| "reward": 0.9250000715255737, |
| "reward_std": 0.9560818523168564, |
| "rewards/warm_up_reward/mean": 0.7708333283662796, |
| "rewards/warm_up_reward/std": 0.8091117739677429, |
| "step": 19 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 137.16666793823242, |
| "epoch": 0.20833333333333334, |
| "grad_norm": 2.92256682920272, |
| "kl": 0.05963134765625, |
| "learning_rate": 1e-06, |
| "loss": 0.0522, |
| "num_tokens": 880442.0, |
| "reward": 0.9791667610406876, |
| "reward_std": 0.9076904356479645, |
| "rewards/warm_up_reward/mean": 0.8159722089767456, |
| "rewards/warm_up_reward/std": 0.8102934062480927, |
| "step": 20 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 137.84375381469727, |
| "epoch": 0.21875, |
| "grad_norm": 2.864931852177023, |
| "kl": 0.04974365234375, |
| "learning_rate": 1e-06, |
| "loss": 0.0248, |
| "num_tokens": 923615.0, |
| "reward": 0.9281250536441803, |
| "reward_std": 0.9918985664844513, |
| "rewards/warm_up_reward/mean": 0.7734375, |
| "rewards/warm_up_reward/std": 0.8185379058122635, |
| "step": 21 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 130.8750057220459, |
| "epoch": 0.22916666666666666, |
| "grad_norm": 2.8348309851740456, |
| "kl": 0.05242919921875, |
| "learning_rate": 1e-06, |
| "loss": 0.0427, |
| "num_tokens": 966107.0, |
| "reward": 1.0531250834465027, |
| "reward_std": 0.9989801347255707, |
| "rewards/warm_up_reward/mean": 0.8776041567325592, |
| "rewards/warm_up_reward/std": 0.8102044314146042, |
| "step": 22 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 144.26041793823242, |
| "epoch": 0.23958333333333334, |
| "grad_norm": 2.6071380741149004, |
| "kl": 0.06549072265625, |
| "learning_rate": 1e-06, |
| "loss": 0.0523, |
| "num_tokens": 1009998.0, |
| "reward": 0.9625000357627869, |
| "reward_std": 0.9378929734230042, |
| "rewards/warm_up_reward/mean": 0.8020833432674408, |
| "rewards/warm_up_reward/std": 0.8023640215396881, |
| "step": 23 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 139.73958587646484, |
| "epoch": 0.25, |
| "grad_norm": 2.697375275936146, |
| "kl": 0.05670166015625, |
| "learning_rate": 1e-06, |
| "loss": 0.0705, |
| "num_tokens": 1053401.0, |
| "reward": 1.031250074505806, |
| "reward_std": 0.9829376488924026, |
| "rewards/warm_up_reward/mean": 0.859375, |
| "rewards/warm_up_reward/std": 0.813440352678299, |
| "step": 24 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 138.8854217529297, |
| "epoch": 0.2604166666666667, |
| "grad_norm": 85.52835351864486, |
| "kl": 0.198486328125, |
| "learning_rate": 1e-06, |
| "loss": 0.0333, |
| "num_tokens": 1096650.0, |
| "reward": 1.089583471417427, |
| "reward_std": 0.9481612741947174, |
| "rewards/warm_up_reward/mean": 0.9079861044883728, |
| "rewards/warm_up_reward/std": 0.7854074388742447, |
| "step": 25 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 130.62500381469727, |
| "epoch": 0.2708333333333333, |
| "grad_norm": 3.4064495832019297, |
| "kl": 0.07513427734375, |
| "learning_rate": 1e-06, |
| "loss": 0.0544, |
| "num_tokens": 1139058.0, |
| "reward": 0.9885417520999908, |
| "reward_std": 0.990489736199379, |
| "rewards/warm_up_reward/mean": 0.8237847238779068, |
| "rewards/warm_up_reward/std": 0.8235566318035126, |
| "step": 26 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 134.50000381469727, |
| "epoch": 0.28125, |
| "grad_norm": 118.32186343464612, |
| "kl": 0.2930908203125, |
| "learning_rate": 1e-06, |
| "loss": 0.011, |
| "num_tokens": 1181802.0, |
| "reward": 1.0552084296941757, |
| "reward_std": 0.868858814239502, |
| "rewards/warm_up_reward/mean": 0.8793402910232544, |
| "rewards/warm_up_reward/std": 0.8186527788639069, |
| "step": 27 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 140.8541717529297, |
| "epoch": 0.2916666666666667, |
| "grad_norm": 6.711195724360143, |
| "kl": 0.1092529296875, |
| "learning_rate": 1e-06, |
| "loss": 0.0187, |
| "num_tokens": 1225408.0, |
| "reward": 0.9666667431592941, |
| "reward_std": 0.9556048065423965, |
| "rewards/warm_up_reward/mean": 0.8055555671453476, |
| "rewards/warm_up_reward/std": 0.8192583322525024, |
| "step": 28 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 137.48958778381348, |
| "epoch": 0.3020833333333333, |
| "grad_norm": 4.712858469896548, |
| "kl": 0.13214111328125, |
| "learning_rate": 1e-06, |
| "loss": 0.0356, |
| "num_tokens": 1268481.0, |
| "reward": 0.9000000655651093, |
| "reward_std": 1.0170713812112808, |
| "rewards/warm_up_reward/mean": 0.7500000149011612, |
| "rewards/warm_up_reward/std": 0.8409183472394943, |
| "step": 29 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 142.09375381469727, |
| "epoch": 0.3125, |
| "grad_norm": 2.604660112727859, |
| "kl": 0.063720703125, |
| "learning_rate": 1e-06, |
| "loss": -0.0023, |
| "num_tokens": 1312008.0, |
| "reward": 0.9000000059604645, |
| "reward_std": 0.9869166016578674, |
| "rewards/warm_up_reward/mean": 0.75, |
| "rewards/warm_up_reward/std": 0.8337783664464951, |
| "step": 30 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 145.59375381469727, |
| "epoch": 0.3229166666666667, |
| "grad_norm": 6.185404235919849, |
| "kl": 0.157470703125, |
| "learning_rate": 1e-06, |
| "loss": 0.0097, |
| "num_tokens": 1356039.0, |
| "reward": 1.0770834237337112, |
| "reward_std": 0.9777155965566635, |
| "rewards/warm_up_reward/mean": 0.8975694477558136, |
| "rewards/warm_up_reward/std": 0.7948237210512161, |
| "step": 31 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 146.37500762939453, |
| "epoch": 0.3333333333333333, |
| "grad_norm": 2.4537091049295956, |
| "kl": 0.06231689453125, |
| "learning_rate": 1e-06, |
| "loss": 0.0046, |
| "num_tokens": 1400007.0, |
| "reward": 0.9812500327825546, |
| "reward_std": 0.911426916718483, |
| "rewards/warm_up_reward/mean": 0.8177083283662796, |
| "rewards/warm_up_reward/std": 0.7836276739835739, |
| "step": 32 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 145.19792556762695, |
| "epoch": 0.34375, |
| "grad_norm": 3.0265092714690702, |
| "kl": 0.061767578125, |
| "learning_rate": 1e-06, |
| "loss": 0.0423, |
| "num_tokens": 1443874.0, |
| "reward": 1.0354167222976685, |
| "reward_std": 0.9364243745803833, |
| "rewards/warm_up_reward/mean": 0.8628472238779068, |
| "rewards/warm_up_reward/std": 0.8118415027856827, |
| "step": 33 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 143.8645896911621, |
| "epoch": 0.3541666666666667, |
| "grad_norm": 2.4770416085087477, |
| "kl": 0.05682373046875, |
| "learning_rate": 1e-06, |
| "loss": 0.0242, |
| "num_tokens": 1487601.0, |
| "reward": 0.9906250834465027, |
| "reward_std": 1.0336193144321442, |
| "rewards/warm_up_reward/mean": 0.8255208432674408, |
| "rewards/warm_up_reward/std": 0.8268236815929413, |
| "step": 34 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 137.34375762939453, |
| "epoch": 0.3645833333333333, |
| "grad_norm": 2.787024849258649, |
| "kl": 0.051513671875, |
| "learning_rate": 1e-06, |
| "loss": 0.0409, |
| "num_tokens": 1530594.0, |
| "reward": 1.0822917073965073, |
| "reward_std": 0.8545732349157333, |
| "rewards/warm_up_reward/mean": 0.9019097238779068, |
| "rewards/warm_up_reward/std": 0.7748938798904419, |
| "step": 35 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 145.71875381469727, |
| "epoch": 0.375, |
| "grad_norm": 2.515335560097886, |
| "kl": 0.0687255859375, |
| "learning_rate": 1e-06, |
| "loss": 0.0284, |
| "num_tokens": 1574409.0, |
| "reward": 0.9875000715255737, |
| "reward_std": 0.9168877303600311, |
| "rewards/warm_up_reward/mean": 0.8229166567325592, |
| "rewards/warm_up_reward/std": 0.8267286717891693, |
| "step": 36 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 136.58333778381348, |
| "epoch": 0.3854166666666667, |
| "grad_norm": 11.790654099301383, |
| "kl": 0.1207275390625, |
| "learning_rate": 1e-06, |
| "loss": 0.0652, |
| "num_tokens": 1617503.0, |
| "reward": 1.031250074505806, |
| "reward_std": 0.9142753481864929, |
| "rewards/warm_up_reward/mean": 0.859375, |
| "rewards/warm_up_reward/std": 0.7878352403640747, |
| "step": 37 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 151.8645896911621, |
| "epoch": 0.3958333333333333, |
| "grad_norm": 3.4202024786499643, |
| "kl": 0.08099365234375, |
| "learning_rate": 1e-06, |
| "loss": 0.0212, |
| "num_tokens": 1661980.0, |
| "reward": 0.6229167133569717, |
| "reward_std": 0.8097958564758301, |
| "rewards/warm_up_reward/mean": 0.5190972313284874, |
| "rewards/warm_up_reward/std": 0.7617596387863159, |
| "step": 38 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 142.8854217529297, |
| "epoch": 0.40625, |
| "grad_norm": 2.4994048524913364, |
| "kl": 0.06414794921875, |
| "learning_rate": 1e-06, |
| "loss": 0.0569, |
| "num_tokens": 1705637.0, |
| "reward": 1.0656251087784767, |
| "reward_std": 0.8034301698207855, |
| "rewards/warm_up_reward/mean": 0.8880208432674408, |
| "rewards/warm_up_reward/std": 0.7390912175178528, |
| "step": 39 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 146.18750762939453, |
| "epoch": 0.4166666666666667, |
| "grad_norm": 2.4999575203414186, |
| "kl": 0.0628662109375, |
| "learning_rate": 1e-06, |
| "loss": 0.0137, |
| "num_tokens": 1749719.0, |
| "reward": 1.0104167312383652, |
| "reward_std": 0.9736887365579605, |
| "rewards/warm_up_reward/mean": 0.8420138955116272, |
| "rewards/warm_up_reward/std": 0.822531133890152, |
| "step": 40 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 143.9479217529297, |
| "epoch": 0.4270833333333333, |
| "grad_norm": 2.2639443093591205, |
| "kl": 0.0675048828125, |
| "learning_rate": 1e-06, |
| "loss": 0.0087, |
| "num_tokens": 1793436.0, |
| "reward": 0.9229167401790619, |
| "reward_std": 0.9153347015380859, |
| "rewards/warm_up_reward/mean": 0.7690972089767456, |
| "rewards/warm_up_reward/std": 0.8183709383010864, |
| "step": 41 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 147.58333587646484, |
| "epoch": 0.4375, |
| "grad_norm": 2.5193978009218965, |
| "kl": 0.07177734375, |
| "learning_rate": 1e-06, |
| "loss": 0.0392, |
| "num_tokens": 1837562.0, |
| "reward": 1.006250038743019, |
| "reward_std": 0.8919505327939987, |
| "rewards/warm_up_reward/mean": 0.8385416716337204, |
| "rewards/warm_up_reward/std": 0.8046689182519913, |
| "step": 42 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 141.37500381469727, |
| "epoch": 0.4479166666666667, |
| "grad_norm": 2.5496207567669034, |
| "kl": 0.06622314453125, |
| "learning_rate": 1e-06, |
| "loss": 0.0452, |
| "num_tokens": 1880972.0, |
| "reward": 1.068750038743019, |
| "reward_std": 0.9111972749233246, |
| "rewards/warm_up_reward/mean": 0.890625, |
| "rewards/warm_up_reward/std": 0.7989336252212524, |
| "step": 43 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 130.83333587646484, |
| "epoch": 0.4583333333333333, |
| "grad_norm": 5.245713134644536, |
| "kl": 0.0999755859375, |
| "learning_rate": 1e-06, |
| "loss": -0.0051, |
| "num_tokens": 1923460.0, |
| "reward": 0.929166704416275, |
| "reward_std": 0.8137651234865189, |
| "rewards/warm_up_reward/mean": 0.7743055373430252, |
| "rewards/warm_up_reward/std": 0.8136637955904007, |
| "step": 44 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 144.2604217529297, |
| "epoch": 0.46875, |
| "grad_norm": 6.317144379422631, |
| "kl": 0.10107421875, |
| "learning_rate": 1e-06, |
| "loss": -0.0175, |
| "num_tokens": 1967315.0, |
| "reward": 1.1012500673532486, |
| "reward_std": 0.7948171943426132, |
| "rewards/warm_up_reward/mean": 0.9177083224058151, |
| "rewards/warm_up_reward/std": 0.7791551500558853, |
| "step": 45 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 141.5104217529297, |
| "epoch": 0.4791666666666667, |
| "grad_norm": 2.8932825846308172, |
| "kl": 0.0859375, |
| "learning_rate": 1e-06, |
| "loss": 0.0029, |
| "num_tokens": 2010882.0, |
| "reward": 1.0656251162290573, |
| "reward_std": 0.9644656330347061, |
| "rewards/warm_up_reward/mean": 0.8880208432674408, |
| "rewards/warm_up_reward/std": 0.8194199502468109, |
| "step": 46 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 140.21875762939453, |
| "epoch": 0.4895833333333333, |
| "grad_norm": 2.9093719889125174, |
| "kl": 0.1007080078125, |
| "learning_rate": 1e-06, |
| "loss": -0.0275, |
| "num_tokens": 2054307.0, |
| "reward": 0.991666704416275, |
| "reward_std": 0.9324973523616791, |
| "rewards/warm_up_reward/mean": 0.8263888955116272, |
| "rewards/warm_up_reward/std": 0.8247981667518616, |
| "step": 47 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 136.5729217529297, |
| "epoch": 0.5, |
| "grad_norm": 2.3209209113606146, |
| "kl": 0.08441162109375, |
| "learning_rate": 1e-06, |
| "loss": 0.001, |
| "num_tokens": 2097424.0, |
| "reward": 1.1010417491197586, |
| "reward_std": 0.8305719494819641, |
| "rewards/warm_up_reward/mean": 0.9175347238779068, |
| "rewards/warm_up_reward/std": 0.7910451591014862, |
| "step": 48 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 142.50000762939453, |
| "epoch": 0.5104166666666666, |
| "grad_norm": 2.470827147583925, |
| "kl": 0.08355712890625, |
| "learning_rate": 1e-06, |
| "loss": 0.0433, |
| "num_tokens": 2141092.0, |
| "reward": 1.0208334028720856, |
| "reward_std": 0.9052031934261322, |
| "rewards/warm_up_reward/mean": 0.8506944477558136, |
| "rewards/warm_up_reward/std": 0.8133653849363327, |
| "step": 49 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 138.58333587646484, |
| "epoch": 0.5208333333333334, |
| "grad_norm": 2.314915736053297, |
| "kl": 0.0765380859375, |
| "learning_rate": 1e-06, |
| "loss": 0.0188, |
| "num_tokens": 2184306.0, |
| "reward": 1.050000086426735, |
| "reward_std": 0.9242848604917526, |
| "rewards/warm_up_reward/mean": 0.8749999850988388, |
| "rewards/warm_up_reward/std": 0.7892495840787888, |
| "step": 50 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 138.92708778381348, |
| "epoch": 0.53125, |
| "grad_norm": 16.28250872141934, |
| "kl": 0.26171875, |
| "learning_rate": 1e-06, |
| "loss": 0.0328, |
| "num_tokens": 2227673.0, |
| "reward": 1.1312500685453415, |
| "reward_std": 1.0009342432022095, |
| "rewards/warm_up_reward/mean": 0.9427083432674408, |
| "rewards/warm_up_reward/std": 0.8114291131496429, |
| "step": 51 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 140.92708587646484, |
| "epoch": 0.5416666666666666, |
| "grad_norm": 2.4052567403889364, |
| "kl": 0.0528564453125, |
| "learning_rate": 1e-06, |
| "loss": 0.0201, |
| "num_tokens": 2271064.0, |
| "reward": 1.2375000417232513, |
| "reward_std": 0.8487301468849182, |
| "rewards/warm_up_reward/mean": 1.03125, |
| "rewards/warm_up_reward/std": 0.7739475220441818, |
| "step": 52 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 146.1458396911621, |
| "epoch": 0.5520833333333334, |
| "grad_norm": 2.4582483635661467, |
| "kl": 0.05413818359375, |
| "learning_rate": 1e-06, |
| "loss": 0.0429, |
| "num_tokens": 2315052.0, |
| "reward": 1.1104167699813843, |
| "reward_std": 0.9700468927621841, |
| "rewards/warm_up_reward/mean": 0.9253472238779068, |
| "rewards/warm_up_reward/std": 0.7955830246210098, |
| "step": 53 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 145.2083396911621, |
| "epoch": 0.5625, |
| "grad_norm": 2.4577305981193165, |
| "kl": 0.0728759765625, |
| "learning_rate": 1e-06, |
| "loss": -0.0131, |
| "num_tokens": 2358962.0, |
| "reward": 1.1750001013278961, |
| "reward_std": 0.9326367676258087, |
| "rewards/warm_up_reward/mean": 0.9791666716337204, |
| "rewards/warm_up_reward/std": 0.7897387892007828, |
| "step": 54 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 145.7916717529297, |
| "epoch": 0.5729166666666666, |
| "grad_norm": 2.302326467124225, |
| "kl": 0.0810546875, |
| "learning_rate": 1e-06, |
| "loss": 0.0291, |
| "num_tokens": 2402922.0, |
| "reward": 0.9875000566244125, |
| "reward_std": 1.0013651847839355, |
| "rewards/warm_up_reward/mean": 0.8229166716337204, |
| "rewards/warm_up_reward/std": 0.8247637003660202, |
| "step": 55 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 147.9479217529297, |
| "epoch": 0.5833333333333334, |
| "grad_norm": 2.580316913788417, |
| "kl": 0.068115234375, |
| "learning_rate": 1e-06, |
| "loss": 0.0509, |
| "num_tokens": 2447065.0, |
| "reward": 1.183750033378601, |
| "reward_std": 0.9039967954158783, |
| "rewards/warm_up_reward/mean": 0.9864583313465118, |
| "rewards/warm_up_reward/std": 0.787983849644661, |
| "step": 56 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 142.05208587646484, |
| "epoch": 0.59375, |
| "grad_norm": 2.748934001160214, |
| "kl": 0.06414794921875, |
| "learning_rate": 1e-06, |
| "loss": -0.0078, |
| "num_tokens": 2490552.0, |
| "reward": 1.07750004529953, |
| "reward_std": 0.9763932228088379, |
| "rewards/warm_up_reward/mean": 0.8979166746139526, |
| "rewards/warm_up_reward/std": 0.8105349242687225, |
| "step": 57 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 146.0729217529297, |
| "epoch": 0.6041666666666666, |
| "grad_norm": 2.279728805159882, |
| "kl": 0.05682373046875, |
| "learning_rate": 1e-06, |
| "loss": 0.0054, |
| "num_tokens": 2534503.0, |
| "reward": 1.2562500685453415, |
| "reward_std": 0.8728772848844528, |
| "rewards/warm_up_reward/mean": 1.046875, |
| "rewards/warm_up_reward/std": 0.715716764330864, |
| "step": 58 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 136.77083587646484, |
| "epoch": 0.6145833333333334, |
| "grad_norm": 2.301940905359112, |
| "kl": 0.0634765625, |
| "learning_rate": 1e-06, |
| "loss": 0.0198, |
| "num_tokens": 2577603.0, |
| "reward": 1.2200001031160355, |
| "reward_std": 0.9238942861557007, |
| "rewards/warm_up_reward/mean": 1.0166666805744171, |
| "rewards/warm_up_reward/std": 0.7837828695774078, |
| "step": 59 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 149.3645896911621, |
| "epoch": 0.625, |
| "grad_norm": 2.3088797777576535, |
| "kl": 0.0701904296875, |
| "learning_rate": 1e-06, |
| "loss": 0.0668, |
| "num_tokens": 2621852.0, |
| "reward": 1.0885417461395264, |
| "reward_std": 0.9921838045120239, |
| "rewards/warm_up_reward/mean": 0.9071180820465088, |
| "rewards/warm_up_reward/std": 0.8187949508428574, |
| "step": 60 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 144.9791717529297, |
| "epoch": 0.6354166666666666, |
| "grad_norm": 2.3531788106957063, |
| "kl": 0.0830078125, |
| "learning_rate": 1e-06, |
| "loss": 0.0715, |
| "num_tokens": 2665584.0, |
| "reward": 1.103541761636734, |
| "reward_std": 0.8904776722192764, |
| "rewards/warm_up_reward/mean": 0.9196180552244186, |
| "rewards/warm_up_reward/std": 0.7996000051498413, |
| "step": 61 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 140.5416717529297, |
| "epoch": 0.6458333333333334, |
| "grad_norm": 2.3780245058749894, |
| "kl": 0.0545654296875, |
| "learning_rate": 1e-06, |
| "loss": 0.0144, |
| "num_tokens": 2709058.0, |
| "reward": 1.2650001347064972, |
| "reward_std": 0.8583473563194275, |
| "rewards/warm_up_reward/mean": 1.0541666597127914, |
| "rewards/warm_up_reward/std": 0.753357321023941, |
| "step": 62 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 150.5208396911621, |
| "epoch": 0.65625, |
| "grad_norm": 2.291724896507479, |
| "kl": 0.05914306640625, |
| "learning_rate": 1e-06, |
| "loss": 0.0102, |
| "num_tokens": 2753538.0, |
| "reward": 1.2431251406669617, |
| "reward_std": 0.8321643471717834, |
| "rewards/warm_up_reward/mean": 1.0359375476837158, |
| "rewards/warm_up_reward/std": 0.74222831428051, |
| "step": 63 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 143.2395896911621, |
| "epoch": 0.6666666666666666, |
| "grad_norm": 3.018275289311917, |
| "kl": 0.085693359375, |
| "learning_rate": 1e-06, |
| "loss": 0.0078, |
| "num_tokens": 2797253.0, |
| "reward": 1.1802085041999817, |
| "reward_std": 0.948539987206459, |
| "rewards/warm_up_reward/mean": 0.9835069626569748, |
| "rewards/warm_up_reward/std": 0.7748740911483765, |
| "step": 64 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 138.9791717529297, |
| "epoch": 0.6770833333333334, |
| "grad_norm": 2.219343433713507, |
| "kl": 0.06298828125, |
| "learning_rate": 1e-06, |
| "loss": 0.0203, |
| "num_tokens": 2840571.0, |
| "reward": 1.2229167520999908, |
| "reward_std": 0.783911868929863, |
| "rewards/warm_up_reward/mean": 1.0190972089767456, |
| "rewards/warm_up_reward/std": 0.7600821256637573, |
| "step": 65 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 152.8958396911621, |
| "epoch": 0.6875, |
| "grad_norm": 2.2126312475293877, |
| "kl": 0.0704345703125, |
| "learning_rate": 1e-06, |
| "loss": 0.0425, |
| "num_tokens": 2885201.0, |
| "reward": 1.1691668182611465, |
| "reward_std": 0.8853475451469421, |
| "rewards/warm_up_reward/mean": 0.9743055552244186, |
| "rewards/warm_up_reward/std": 0.7726792246103287, |
| "step": 66 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 147.61459350585938, |
| "epoch": 0.6979166666666666, |
| "grad_norm": 2.3729988144079748, |
| "kl": 0.05316162109375, |
| "learning_rate": 1e-06, |
| "loss": 0.0235, |
| "num_tokens": 2929396.0, |
| "reward": 1.3312501311302185, |
| "reward_std": 0.7987091541290283, |
| "rewards/warm_up_reward/mean": 1.109375, |
| "rewards/warm_up_reward/std": 0.7549550831317902, |
| "step": 67 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 144.1041717529297, |
| "epoch": 0.7083333333333334, |
| "grad_norm": 2.268946790543885, |
| "kl": 0.04766845703125, |
| "learning_rate": 1e-06, |
| "loss": -0.0027, |
| "num_tokens": 2973200.0, |
| "reward": 1.325416773557663, |
| "reward_std": 0.8379913568496704, |
| "rewards/warm_up_reward/mean": 1.1045138835906982, |
| "rewards/warm_up_reward/std": 0.7519785463809967, |
| "step": 68 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 140.4791717529297, |
| "epoch": 0.71875, |
| "grad_norm": 2.40410834715932, |
| "kl": 0.0625, |
| "learning_rate": 1e-06, |
| "loss": 0.0508, |
| "num_tokens": 3016728.0, |
| "reward": 1.143750011920929, |
| "reward_std": 0.8826231509447098, |
| "rewards/warm_up_reward/mean": 0.953125, |
| "rewards/warm_up_reward/std": 0.7572390139102936, |
| "step": 69 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 145.9791717529297, |
| "epoch": 0.7291666666666666, |
| "grad_norm": 2.377991017189631, |
| "kl": 0.06512451171875, |
| "learning_rate": 1e-06, |
| "loss": 0.0505, |
| "num_tokens": 3060790.0, |
| "reward": 1.2854167819023132, |
| "reward_std": 0.9141092300415039, |
| "rewards/warm_up_reward/mean": 1.0711805671453476, |
| "rewards/warm_up_reward/std": 0.7661919444799423, |
| "step": 70 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 144.7916717529297, |
| "epoch": 0.7395833333333334, |
| "grad_norm": 5.490360584080418, |
| "kl": 0.068115234375, |
| "learning_rate": 1e-06, |
| "loss": 0.0061, |
| "num_tokens": 3104672.0, |
| "reward": 1.2937501072883606, |
| "reward_std": 0.9385685622692108, |
| "rewards/warm_up_reward/mean": 1.078125, |
| "rewards/warm_up_reward/std": 0.7564428001642227, |
| "step": 71 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 132.94792366027832, |
| "epoch": 0.75, |
| "grad_norm": 2.705450393270237, |
| "kl": 0.0748291015625, |
| "learning_rate": 1e-06, |
| "loss": 0.0032, |
| "num_tokens": 3147441.0, |
| "reward": 1.2687500715255737, |
| "reward_std": 0.9319685697555542, |
| "rewards/warm_up_reward/mean": 1.0572916567325592, |
| "rewards/warm_up_reward/std": 0.7771977633237839, |
| "step": 72 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 151.91667556762695, |
| "epoch": 0.7604166666666666, |
| "grad_norm": 2.254735299608175, |
| "kl": 0.07171630859375, |
| "learning_rate": 1e-06, |
| "loss": 0.0105, |
| "num_tokens": 3192121.0, |
| "reward": 1.0045834183692932, |
| "reward_std": 0.8630332052707672, |
| "rewards/warm_up_reward/mean": 0.8371527940034866, |
| "rewards/warm_up_reward/std": 0.7969614416360855, |
| "step": 73 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 140.31250190734863, |
| "epoch": 0.7708333333333334, |
| "grad_norm": 2.4523220833578345, |
| "kl": 0.05816650390625, |
| "learning_rate": 1e-06, |
| "loss": 0.0158, |
| "num_tokens": 3235453.0, |
| "reward": 1.1837501227855682, |
| "reward_std": 0.9126418828964233, |
| "rewards/warm_up_reward/mean": 0.9864583015441895, |
| "rewards/warm_up_reward/std": 0.7863775044679642, |
| "step": 74 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 141.7395896911621, |
| "epoch": 0.78125, |
| "grad_norm": 2.2478331336635513, |
| "kl": 0.0609130859375, |
| "learning_rate": 1e-06, |
| "loss": 0.0284, |
| "num_tokens": 3278964.0, |
| "reward": 1.2054167687892914, |
| "reward_std": 0.9118891954421997, |
| "rewards/warm_up_reward/mean": 1.0045138746500015, |
| "rewards/warm_up_reward/std": 0.7829048186540604, |
| "step": 75 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 139.71875381469727, |
| "epoch": 0.7916666666666666, |
| "grad_norm": 2.6147718994418203, |
| "kl": 0.0904541015625, |
| "learning_rate": 1e-06, |
| "loss": 0.0143, |
| "num_tokens": 3322407.0, |
| "reward": 1.2660417556762695, |
| "reward_std": 0.9197860509157181, |
| "rewards/warm_up_reward/mean": 1.0550346970558167, |
| "rewards/warm_up_reward/std": 0.7622723281383514, |
| "step": 76 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 141.53125762939453, |
| "epoch": 0.8020833333333334, |
| "grad_norm": 2.530432648354832, |
| "kl": 0.093505859375, |
| "learning_rate": 1e-06, |
| "loss": -0.0104, |
| "num_tokens": 3366036.0, |
| "reward": 1.1020834147930145, |
| "reward_std": 0.902558371424675, |
| "rewards/warm_up_reward/mean": 0.9184028059244156, |
| "rewards/warm_up_reward/std": 0.7975014746189117, |
| "step": 77 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 146.0416717529297, |
| "epoch": 0.8125, |
| "grad_norm": 2.2386503327740015, |
| "kl": 0.0811767578125, |
| "learning_rate": 1e-06, |
| "loss": 0.0184, |
| "num_tokens": 3410026.0, |
| "reward": 1.3406251072883606, |
| "reward_std": 0.8862900286912918, |
| "rewards/warm_up_reward/mean": 1.1171874850988388, |
| "rewards/warm_up_reward/std": 0.7199237793684006, |
| "step": 78 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 133.6041717529297, |
| "epoch": 0.8229166666666666, |
| "grad_norm": 2.9736551256376744, |
| "kl": 0.0992431640625, |
| "learning_rate": 1e-06, |
| "loss": 0.0029, |
| "num_tokens": 3452750.0, |
| "reward": 1.3556251227855682, |
| "reward_std": 0.8310635536909103, |
| "rewards/warm_up_reward/mean": 1.129687488079071, |
| "rewards/warm_up_reward/std": 0.7288718819618225, |
| "step": 79 |
| }, |
| { |
| "clip_ratio": 0.0, |
| "completion_length": 139.45834350585938, |
| "epoch": 0.8333333333333334, |
| "grad_norm": 4.081234898460602, |
| "kl": 0.15234375, |
| "learning_rate": 1e-06, |
| "loss": -0.0154, |
| "num_tokens": 3496018.0, |
| "reward": 1.0906250923871994, |
| "reward_std": 0.8768025040626526, |
| "rewards/warm_up_reward/mean": 0.9088541716337204, |
| "rewards/warm_up_reward/std": 0.8029628545045853, |
| "step": 80 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 96, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 1, |
| "save_steps": 16, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 6, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|