{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.4380585246188891, "eval_steps": 500, "global_step": 2500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "completion_length": 5.8125, "epoch": 0.00017522340984755565, "grad_norm": 21.7247019198033, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.0, "reward": 1.4406715631484985, "reward_std": 0.22897478938102722, "rewards/accuracy_reward_stage2": 0.4406715929508209, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1 }, { "completion_length": 7.0, "epoch": 0.0003504468196951113, "grad_norm": 21.731991696214283, "kl": 0.000118255615234375, "learning_rate": 9.998247765901524e-07, "loss": 0.0, "reward": 1.4579381942749023, "reward_std": 0.21444088220596313, "rewards/accuracy_reward_stage2": 0.4579381048679352, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2 }, { "completion_length": 10.65625, "epoch": 0.0005256702295426669, "grad_norm": 22.526133155999663, "kl": 0.00025177001953125, "learning_rate": 9.99649553180305e-07, "loss": 0.0001, "reward": 1.4064089059829712, "reward_std": 0.30950504541397095, "rewards/accuracy_reward_stage2": 0.4064089059829712, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 3 }, { "completion_length": 12.078125, "epoch": 0.0007008936393902226, "grad_norm": 21.484916636473127, "kl": 0.00188446044921875, "learning_rate": 9.994743297704572e-07, "loss": 0.0008, "reward": 1.4104167222976685, "reward_std": 0.17264413833618164, "rewards/accuracy_reward_stage2": 0.5354166626930237, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 4 }, { "completion_length": 15.34375, "epoch": 0.0008761170492377782, "grad_norm": 27.629987527288048, "kl": -2.944469451904297e-05, "learning_rate": 9.992991063606097e-07, "loss": -0.0881, "reward": 1.2981054782867432, "reward_std": 0.22537116706371307, "rewards/accuracy_reward_stage2": 0.32935553789138794, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 5 }, { "completion_length": 10.59375, "epoch": 0.0010513404590853338, "grad_norm": 25.50093135228471, "kl": 0.00013637542724609375, "learning_rate": 9.991238829507622e-07, "loss": 0.0001, "reward": 1.583531379699707, "reward_std": 0.3322303295135498, "rewards/accuracy_reward_stage2": 0.5835314393043518, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 6 }, { "completion_length": 9.5, "epoch": 0.0012265638689328894, "grad_norm": 23.99101673923454, "kl": 0.0001049041748046875, "learning_rate": 9.989486595409147e-07, "loss": 0.0, "reward": 1.3512048721313477, "reward_std": 0.24498425424098969, "rewards/accuracy_reward_stage2": 0.3512047529220581, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 7 }, { "completion_length": 8.28125, "epoch": 0.0014017872787804452, "grad_norm": 50.2987810695245, "kl": 0.140625, "learning_rate": 9.98773436131067e-07, "loss": 0.0412, "reward": 1.3523304462432861, "reward_std": 0.32968342304229736, "rewards/accuracy_reward_stage2": 0.47733038663864136, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 8 }, { "completion_length": 9.15625, "epoch": 0.0015770106886280008, "grad_norm": 41.286331589381646, "kl": 0.07080078125, "learning_rate": 9.985982127212195e-07, "loss": 0.0218, "reward": 1.327319622039795, "reward_std": 0.24577535688877106, "rewards/accuracy_reward_stage2": 0.4523196518421173, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 9 }, { "completion_length": 13.734375, "epoch": 0.0017522340984755564, "grad_norm": 49.5957116925187, "kl": 0.0341796875, "learning_rate": 9.98422989311372e-07, "loss": 0.0137, "reward": 1.326066493988037, "reward_std": 0.2599124312400818, "rewards/accuracy_reward_stage2": 0.45106637477874756, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 10 }, { "completion_length": 11.53125, "epoch": 0.001927457508323112, "grad_norm": 21.31206755321316, "kl": 0.001068115234375, "learning_rate": 9.982477659015245e-07, "loss": 0.0004, "reward": 1.48616623878479, "reward_std": 0.2009294331073761, "rewards/accuracy_reward_stage2": 0.4861662685871124, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 11 }, { "completion_length": 11.609375, "epoch": 0.0021026809181706675, "grad_norm": 24.101789435137526, "kl": 0.000728607177734375, "learning_rate": 9.980725424916767e-07, "loss": 0.0003, "reward": 1.528315544128418, "reward_std": 0.16574808955192566, "rewards/accuracy_reward_stage2": 0.5283154845237732, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 12 }, { "completion_length": 10.203125, "epoch": 0.002277904328018223, "grad_norm": 15598.029114802546, "kl": 11.0, "learning_rate": 9.978973190818292e-07, "loss": 4.4264, "reward": 1.3424155712127686, "reward_std": 0.3205416202545166, "rewards/accuracy_reward_stage2": 0.46741557121276855, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 13 }, { "completion_length": 14.640625, "epoch": 0.0024531277378657787, "grad_norm": 22.566706425929812, "kl": 0.002044677734375, "learning_rate": 9.977220956719817e-07, "loss": 0.0008, "reward": 1.2618801593780518, "reward_std": 0.19473139941692352, "rewards/accuracy_reward_stage2": 0.2618802785873413, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 14 }, { "completion_length": 16.75, "epoch": 0.0026283511477133343, "grad_norm": 19.548309921488077, "kl": 0.0021514892578125, "learning_rate": 9.975468722621342e-07, "loss": 0.0009, "reward": 1.6370456218719482, "reward_std": 0.16209974884986877, "rewards/accuracy_reward_stage2": 0.6370455622673035, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 15 }, { "completion_length": 10.78125, "epoch": 0.0028035745575608903, "grad_norm": 25.48092585549323, "kl": 0.0034027099609375, "learning_rate": 9.973716488522867e-07, "loss": 0.0014, "reward": 1.4687082767486572, "reward_std": 0.3171447813510895, "rewards/accuracy_reward_stage2": 0.4687082767486572, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 16 }, { "completion_length": 7.609375, "epoch": 0.002978797967408446, "grad_norm": 19.336096324064826, "kl": 0.00045013427734375, "learning_rate": 9.97196425442439e-07, "loss": 0.0002, "reward": 1.5834205150604248, "reward_std": 0.09155820310115814, "rewards/accuracy_reward_stage2": 0.5834205150604248, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 17 }, { "completion_length": 7.328125, "epoch": 0.0031540213772560015, "grad_norm": 23.585621826315027, "kl": 0.00238037109375, "learning_rate": 9.970212020325915e-07, "loss": 0.001, "reward": 1.4562045335769653, "reward_std": 0.27990102767944336, "rewards/accuracy_reward_stage2": 0.45620453357696533, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 18 }, { "completion_length": 14.421875, "epoch": 0.003329244787103557, "grad_norm": 20.168254519701865, "kl": 0.004241943359375, "learning_rate": 9.96845978622744e-07, "loss": 0.0017, "reward": 1.3004417419433594, "reward_std": 0.14681744575500488, "rewards/accuracy_reward_stage2": 0.4254416525363922, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 19 }, { "completion_length": 10.015625, "epoch": 0.0035044681969511127, "grad_norm": 22.68796589254135, "kl": 0.0027618408203125, "learning_rate": 9.966707552128965e-07, "loss": 0.0011, "reward": 1.558898687362671, "reward_std": 0.1517297327518463, "rewards/accuracy_reward_stage2": 0.5588988065719604, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 20 }, { "completion_length": 8.859375, "epoch": 0.0036796916067986683, "grad_norm": 14.419698875883565, "kl": 0.00433349609375, "learning_rate": 9.964955318030487e-07, "loss": 0.0017, "reward": 1.6647343635559082, "reward_std": 0.10583889484405518, "rewards/accuracy_reward_stage2": 0.664734423160553, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 21 }, { "completion_length": 7.765625, "epoch": 0.003854915016646224, "grad_norm": 34.43768329839898, "kl": 0.10400390625, "learning_rate": 9.963203083932012e-07, "loss": 0.0415, "reward": 1.1504526138305664, "reward_std": 0.24974983930587769, "rewards/accuracy_reward_stage2": 0.2754526138305664, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 22 }, { "completion_length": 9.546875, "epoch": 0.0040301384264937795, "grad_norm": 29.758862471062894, "kl": 0.0086669921875, "learning_rate": 9.961450849833537e-07, "loss": 0.0035, "reward": 1.3589386940002441, "reward_std": 0.3868202865123749, "rewards/accuracy_reward_stage2": 0.3589387536048889, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 23 }, { "completion_length": 10.15625, "epoch": 0.004205361836341335, "grad_norm": 24.934275395499128, "kl": 0.002685546875, "learning_rate": 9.959698615735062e-07, "loss": 0.0011, "reward": 1.517673373222351, "reward_std": 0.23951569199562073, "rewards/accuracy_reward_stage2": 0.5176733732223511, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 24 }, { "completion_length": 7.6875, "epoch": 0.004380585246188891, "grad_norm": 24.13104410678152, "kl": 0.003143310546875, "learning_rate": 9.957946381636585e-07, "loss": 0.0013, "reward": 1.4975221157073975, "reward_std": 0.24336925148963928, "rewards/accuracy_reward_stage2": 0.4975220859050751, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 25 }, { "completion_length": 8.421875, "epoch": 0.004555808656036446, "grad_norm": 25.957497353677354, "kl": 0.00439453125, "learning_rate": 9.95619414753811e-07, "loss": 0.0018, "reward": 1.740341305732727, "reward_std": 0.16394385695457458, "rewards/accuracy_reward_stage2": 0.7403413653373718, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 26 }, { "completion_length": 8.421875, "epoch": 0.004731032065884002, "grad_norm": 27210.57969180641, "kl": 26.75, "learning_rate": 9.954441913439635e-07, "loss": 10.7099, "reward": 1.423106074333191, "reward_std": 0.28896230459213257, "rewards/accuracy_reward_stage2": 0.5481060743331909, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 27 }, { "completion_length": 10.609375, "epoch": 0.0049062554757315574, "grad_norm": 25.254963010996637, "kl": 0.0011138916015625, "learning_rate": 9.95268967934116e-07, "loss": 0.0004, "reward": 1.488661289215088, "reward_std": 0.26590585708618164, "rewards/accuracy_reward_stage2": 0.48866117000579834, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 28 }, { "completion_length": 9.296875, "epoch": 0.005081478885579113, "grad_norm": 15.420666005058992, "kl": 0.002838134765625, "learning_rate": 9.950937445242685e-07, "loss": 0.0011, "reward": 1.4329993724822998, "reward_std": 0.12809374928474426, "rewards/accuracy_reward_stage2": 0.4329993724822998, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 29 }, { "completion_length": 16.546875, "epoch": 0.005256702295426669, "grad_norm": 100.09886669879327, "kl": 0.232421875, "learning_rate": 9.94918521114421e-07, "loss": 0.0926, "reward": 1.0829801559448242, "reward_std": 0.15251481533050537, "rewards/accuracy_reward_stage2": 0.20798009634017944, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 30 }, { "completion_length": 9.140625, "epoch": 0.005431925705274224, "grad_norm": 21.597575604241964, "kl": 0.0042724609375, "learning_rate": 9.947432977045732e-07, "loss": 0.0017, "reward": 1.712631344795227, "reward_std": 0.22612644731998444, "rewards/accuracy_reward_stage2": 0.7126312255859375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 31 }, { "completion_length": 11.84375, "epoch": 0.005607149115121781, "grad_norm": 30.567268636442027, "kl": 0.0128173828125, "learning_rate": 9.945680742947257e-07, "loss": -0.0297, "reward": 1.4765079021453857, "reward_std": 0.23709246516227722, "rewards/accuracy_reward_stage2": 0.49213287234306335, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 32 }, { "completion_length": 10.578125, "epoch": 0.005782372524969336, "grad_norm": 19.899263114078344, "kl": 0.0166015625, "learning_rate": 9.94392850884878e-07, "loss": 0.0066, "reward": 1.3554813861846924, "reward_std": 0.21323856711387634, "rewards/accuracy_reward_stage2": 0.35548141598701477, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 33 }, { "completion_length": 13.8125, "epoch": 0.005957595934816892, "grad_norm": 62.522704996084954, "kl": 0.1806640625, "learning_rate": 9.942176274750305e-07, "loss": 0.0721, "reward": 1.532733678817749, "reward_std": 0.2642131447792053, "rewards/accuracy_reward_stage2": 0.6577336192131042, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 34 }, { "completion_length": 10.203125, "epoch": 0.0061328193446644474, "grad_norm": 28.84899146020609, "kl": 0.0081787109375, "learning_rate": 9.94042404065183e-07, "loss": -0.0315, "reward": 1.599797248840332, "reward_std": 0.2751314043998718, "rewards/accuracy_reward_stage2": 0.6154221296310425, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 35 }, { "completion_length": 15.390625, "epoch": 0.006308042754512003, "grad_norm": 869.9467292590601, "kl": 0.96875, "learning_rate": 9.938671806553355e-07, "loss": 0.387, "reward": 1.1933510303497314, "reward_std": 0.19862015545368195, "rewards/accuracy_reward_stage2": 0.31835103034973145, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 36 }, { "completion_length": 6.15625, "epoch": 0.006483266164359559, "grad_norm": 21.28271109324732, "kl": 0.016845703125, "learning_rate": 9.93691957245488e-07, "loss": -0.0307, "reward": 1.716348648071289, "reward_std": 0.13987597823143005, "rewards/accuracy_reward_stage2": 0.7475985884666443, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 37 }, { "completion_length": 7.859375, "epoch": 0.006658489574207114, "grad_norm": 18.801528535727066, "kl": 0.01611328125, "learning_rate": 9.935167338356405e-07, "loss": 0.0064, "reward": 1.427717924118042, "reward_std": 0.21595898270606995, "rewards/accuracy_reward_stage2": 0.5527178049087524, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 38 }, { "completion_length": 13.234375, "epoch": 0.00683371298405467, "grad_norm": 29.72698332741626, "kl": 0.010498046875, "learning_rate": 9.933415104257928e-07, "loss": 0.0042, "reward": 1.4073545932769775, "reward_std": 0.4115482568740845, "rewards/accuracy_reward_stage2": 0.5323545336723328, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 39 }, { "completion_length": 18.5625, "epoch": 0.007008936393902225, "grad_norm": 190.47366474232058, "kl": 0.3203125, "learning_rate": 9.931662870159453e-07, "loss": 0.1277, "reward": 1.3717286586761475, "reward_std": 0.20918840169906616, "rewards/accuracy_reward_stage2": 0.4967285692691803, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 40 }, { "completion_length": 10.796875, "epoch": 0.007184159803749781, "grad_norm": 30.35224097860874, "kl": 0.0201416015625, "learning_rate": 9.929910636060978e-07, "loss": 0.0081, "reward": 1.571852445602417, "reward_std": 0.2101229727268219, "rewards/accuracy_reward_stage2": 0.5718523859977722, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 41 }, { "completion_length": 12.609375, "epoch": 0.007359383213597337, "grad_norm": 20.959404085955548, "kl": 0.00677490234375, "learning_rate": 9.928158401962502e-07, "loss": 0.0027, "reward": 1.6106300354003906, "reward_std": 0.15248414874076843, "rewards/accuracy_reward_stage2": 0.6106299757957458, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 42 }, { "completion_length": 12.125, "epoch": 0.007534606623444892, "grad_norm": 40.0503689273909, "kl": 0.376953125, "learning_rate": 9.926406167864027e-07, "loss": 0.1509, "reward": 1.3072917461395264, "reward_std": 0.17887625098228455, "rewards/accuracy_reward_stage2": 0.4322916567325592, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 43 }, { "completion_length": 8.09375, "epoch": 0.007709830033292448, "grad_norm": 25.498158819945512, "kl": 0.043701171875, "learning_rate": 9.92465393376555e-07, "loss": -0.0115, "reward": 1.400193452835083, "reward_std": 0.17675068974494934, "rewards/accuracy_reward_stage2": 0.540818452835083, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 44 }, { "completion_length": 6.765625, "epoch": 0.007885053443140003, "grad_norm": 26.292798175814653, "kl": 0.0184326171875, "learning_rate": 9.922901699667075e-07, "loss": 0.0074, "reward": 1.4580981731414795, "reward_std": 0.3192833662033081, "rewards/accuracy_reward_stage2": 0.5830981731414795, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 45 }, { "completion_length": 6.3125, "epoch": 0.008060276852987559, "grad_norm": 29.8457885783044, "kl": 0.01373291015625, "learning_rate": 9.9211494655686e-07, "loss": 0.0055, "reward": 1.3718205690383911, "reward_std": 0.4017482399940491, "rewards/accuracy_reward_stage2": 0.4968205690383911, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 46 }, { "completion_length": 6.96875, "epoch": 0.008235500262835115, "grad_norm": 23.87042828545471, "kl": 0.0194091796875, "learning_rate": 9.919397231470123e-07, "loss": 0.0078, "reward": 1.5246155261993408, "reward_std": 0.2239200323820114, "rewards/accuracy_reward_stage2": 0.5246155858039856, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 47 }, { "completion_length": 8.515625, "epoch": 0.00841072367268267, "grad_norm": 23.48554567269952, "kl": 0.00982666015625, "learning_rate": 9.917644997371648e-07, "loss": 0.0039, "reward": 1.3785715103149414, "reward_std": 0.2797955572605133, "rewards/accuracy_reward_stage2": 0.5035715699195862, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 48 }, { "completion_length": 8.03125, "epoch": 0.008585947082530226, "grad_norm": 17.54951377855767, "kl": 0.01318359375, "learning_rate": 9.915892763273173e-07, "loss": 0.0053, "reward": 1.6302083730697632, "reward_std": 0.16204530000686646, "rewards/accuracy_reward_stage2": 0.6302083730697632, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 49 }, { "completion_length": 6.765625, "epoch": 0.008761170492377781, "grad_norm": 23.217378435807788, "kl": 0.007720947265625, "learning_rate": 9.914140529174698e-07, "loss": 0.0031, "reward": 1.4840940237045288, "reward_std": 0.25985440611839294, "rewards/accuracy_reward_stage2": 0.4840940237045288, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 50 }, { "completion_length": 13.5, "epoch": 0.008936393902225337, "grad_norm": 24.87705970165469, "kl": 0.046875, "learning_rate": 9.912388295076223e-07, "loss": 0.0188, "reward": 1.477467656135559, "reward_std": 0.2629283666610718, "rewards/accuracy_reward_stage2": 0.4774676561355591, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 51 }, { "completion_length": 25.15625, "epoch": 0.009111617312072893, "grad_norm": 23.93595159530324, "kl": 0.51171875, "learning_rate": 9.910636060977745e-07, "loss": 0.2041, "reward": 1.425516963005066, "reward_std": 0.12461623549461365, "rewards/accuracy_reward_stage2": 0.5505169630050659, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 52 }, { "completion_length": 18.171875, "epoch": 0.009286840721920448, "grad_norm": 22.11918874616949, "kl": 0.0225830078125, "learning_rate": 9.90888382687927e-07, "loss": 0.009, "reward": 1.2883203029632568, "reward_std": 0.19033360481262207, "rewards/accuracy_reward_stage2": 0.4133202135562897, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 53 }, { "completion_length": 17.046875, "epoch": 0.009462064131768004, "grad_norm": 22.519091215947434, "kl": 0.041259765625, "learning_rate": 9.907131592780795e-07, "loss": 0.0165, "reward": 1.5168977975845337, "reward_std": 0.20637044310569763, "rewards/accuracy_reward_stage2": 0.5168977975845337, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 54 }, { "completion_length": 11.1875, "epoch": 0.00963728754161556, "grad_norm": 19.29424591739688, "kl": 0.01031494140625, "learning_rate": 9.90537935868232e-07, "loss": 0.0041, "reward": 1.6363108158111572, "reward_std": 0.20468105375766754, "rewards/accuracy_reward_stage2": 0.6363107562065125, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 55 }, { "completion_length": 13.265625, "epoch": 0.009812510951463115, "grad_norm": 23.610180528134936, "kl": 0.0341796875, "learning_rate": 9.903627124583845e-07, "loss": -0.0103, "reward": 1.6153689622879028, "reward_std": 0.20442932844161987, "rewards/accuracy_reward_stage2": 0.6309939622879028, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 56 }, { "completion_length": 11.53125, "epoch": 0.00998773436131067, "grad_norm": 25.012855879784862, "kl": 0.0439453125, "learning_rate": 9.901874890485368e-07, "loss": 0.0176, "reward": 1.570845603942871, "reward_std": 0.19931158423423767, "rewards/accuracy_reward_stage2": 0.5708456039428711, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 57 }, { "completion_length": 12.484375, "epoch": 0.010162957771158226, "grad_norm": 19.3673100075188, "kl": 0.046142578125, "learning_rate": 9.900122656386893e-07, "loss": 0.0185, "reward": 1.5261080265045166, "reward_std": 0.1789964884519577, "rewards/accuracy_reward_stage2": 0.5261080265045166, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 58 }, { "completion_length": 9.53125, "epoch": 0.010338181181005782, "grad_norm": 22.076786740581635, "kl": 0.041015625, "learning_rate": 9.898370422288418e-07, "loss": 0.0164, "reward": 1.528951644897461, "reward_std": 0.17195484042167664, "rewards/accuracy_reward_stage2": 0.5289516448974609, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 59 }, { "completion_length": 9.65625, "epoch": 0.010513404590853337, "grad_norm": 21.532828207340156, "kl": 0.06494140625, "learning_rate": 9.89661818818994e-07, "loss": 0.0259, "reward": 1.2520328760147095, "reward_std": 0.11828687787055969, "rewards/accuracy_reward_stage2": 0.2520328760147095, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 60 }, { "completion_length": 7.109375, "epoch": 0.010688628000700893, "grad_norm": 18.27714256983686, "kl": 0.016357421875, "learning_rate": 9.894865954091465e-07, "loss": 0.0066, "reward": 1.7229888439178467, "reward_std": 0.22104474902153015, "rewards/accuracy_reward_stage2": 0.7229888439178467, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 61 }, { "completion_length": 21.140625, "epoch": 0.010863851410548448, "grad_norm": 26.309629270176462, "kl": 0.0927734375, "learning_rate": 9.89311371999299e-07, "loss": -0.0065, "reward": 1.72660493850708, "reward_std": 0.32230430841445923, "rewards/accuracy_reward_stage2": 0.7422299981117249, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 62 }, { "completion_length": 18.921875, "epoch": 0.011039074820396006, "grad_norm": 23.324464380030182, "kl": 0.042724609375, "learning_rate": 9.891361485894515e-07, "loss": 0.0171, "reward": 1.2932167053222656, "reward_std": 0.20824560523033142, "rewards/accuracy_reward_stage2": 0.2932167053222656, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 63 }, { "completion_length": 22.0625, "epoch": 0.011214298230243561, "grad_norm": 510.2753190723213, "kl": 0.94921875, "learning_rate": 9.88960925179604e-07, "loss": 0.3816, "reward": 1.3846971988677979, "reward_std": 0.15436303615570068, "rewards/accuracy_reward_stage2": 0.5096973180770874, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 64 }, { "completion_length": 9.78125, "epoch": 0.011389521640091117, "grad_norm": 20.624173694033985, "kl": 0.06640625, "learning_rate": 9.887857017697563e-07, "loss": 0.0265, "reward": 1.6437493562698364, "reward_std": 0.13226813077926636, "rewards/accuracy_reward_stage2": 0.6437492966651917, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 65 }, { "completion_length": 23.34375, "epoch": 0.011564745049938673, "grad_norm": 7774.154134478514, "kl": 7.4375, "learning_rate": 9.886104783599088e-07, "loss": 2.9733, "reward": 1.7207577228546143, "reward_std": 0.13807401061058044, "rewards/accuracy_reward_stage2": 0.8457577228546143, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 66 }, { "completion_length": 9.78125, "epoch": 0.011739968459786228, "grad_norm": 17.504390886758873, "kl": 0.07275390625, "learning_rate": 9.884352549500613e-07, "loss": 0.0291, "reward": 1.1457009315490723, "reward_std": 0.1326243132352829, "rewards/accuracy_reward_stage2": 0.14570099115371704, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 67 }, { "completion_length": 15.34375, "epoch": 0.011915191869633784, "grad_norm": 23.114615139412418, "kl": 0.017822265625, "learning_rate": 9.882600315402138e-07, "loss": 0.0071, "reward": 1.6107048988342285, "reward_std": 0.16821706295013428, "rewards/accuracy_reward_stage2": 0.610704779624939, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 68 }, { "completion_length": 11.65625, "epoch": 0.01209041527948134, "grad_norm": 25.7631490100802, "kl": 0.04833984375, "learning_rate": 9.880848081303663e-07, "loss": 0.0193, "reward": 1.669920802116394, "reward_std": 0.2990760803222656, "rewards/accuracy_reward_stage2": 0.6699207425117493, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 69 }, { "completion_length": 7.0625, "epoch": 0.012265638689328895, "grad_norm": 28.192508330216686, "kl": 0.2373046875, "learning_rate": 9.879095847205188e-07, "loss": 0.0948, "reward": 1.5916601419448853, "reward_std": 0.1377098262310028, "rewards/accuracy_reward_stage2": 0.7166601419448853, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 70 }, { "completion_length": 10.515625, "epoch": 0.01244086209917645, "grad_norm": 29.211176981993752, "kl": 0.022705078125, "learning_rate": 9.87734361310671e-07, "loss": 0.0091, "reward": 1.3620529174804688, "reward_std": 0.28657591342926025, "rewards/accuracy_reward_stage2": 0.362052857875824, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 71 }, { "completion_length": 7.03125, "epoch": 0.012616085509024006, "grad_norm": 62.034643138243744, "kl": 0.232421875, "learning_rate": 9.875591379008235e-07, "loss": 0.0926, "reward": 1.5957125425338745, "reward_std": 0.1589427888393402, "rewards/accuracy_reward_stage2": 0.7207125425338745, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 72 }, { "completion_length": 15.046875, "epoch": 0.012791308918871562, "grad_norm": 49.291115562011576, "kl": 0.072265625, "learning_rate": 9.873839144909758e-07, "loss": 0.0289, "reward": 1.7115159034729004, "reward_std": 0.20133166015148163, "rewards/accuracy_reward_stage2": 0.7115159034729004, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 73 }, { "completion_length": 11.203125, "epoch": 0.012966532328719117, "grad_norm": 21.002854737310127, "kl": 0.107421875, "learning_rate": 9.872086910811283e-07, "loss": 0.043, "reward": 1.5917012691497803, "reward_std": 0.16557535529136658, "rewards/accuracy_reward_stage2": 0.5917012691497803, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 74 }, { "completion_length": 9.46875, "epoch": 0.013141755738566673, "grad_norm": 20.822893777959763, "kl": 0.06591796875, "learning_rate": 9.870334676712808e-07, "loss": 0.0264, "reward": 1.4202609062194824, "reward_std": 0.19799599051475525, "rewards/accuracy_reward_stage2": 0.42026087641716003, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 75 }, { "completion_length": 11.796875, "epoch": 0.013316979148414228, "grad_norm": 24.661401462976446, "kl": 0.047119140625, "learning_rate": 9.868582442614333e-07, "loss": 0.0188, "reward": 1.6601495742797852, "reward_std": 0.16851621866226196, "rewards/accuracy_reward_stage2": 0.6601495146751404, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 76 }, { "completion_length": 8.359375, "epoch": 0.013492202558261784, "grad_norm": 17.354177040158966, "kl": 0.0242919921875, "learning_rate": 9.866830208515858e-07, "loss": 0.0097, "reward": 1.1423872709274292, "reward_std": 0.08182623237371445, "rewards/accuracy_reward_stage2": 0.2673872113227844, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 77 }, { "completion_length": 16.0, "epoch": 0.01366742596810934, "grad_norm": 19.041542561969962, "kl": 0.0255126953125, "learning_rate": 9.86507797441738e-07, "loss": 0.0102, "reward": 1.4512853622436523, "reward_std": 0.13536491990089417, "rewards/accuracy_reward_stage2": 0.4512854814529419, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 78 }, { "completion_length": 12.9375, "epoch": 0.013842649377956895, "grad_norm": 16.90288627870279, "kl": 0.0242919921875, "learning_rate": 9.863325740318906e-07, "loss": 0.0097, "reward": 1.3414337635040283, "reward_std": 0.1197996586561203, "rewards/accuracy_reward_stage2": 0.3414338231086731, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 79 }, { "completion_length": 6.78125, "epoch": 0.01401787278780445, "grad_norm": 25.60916886197121, "kl": 0.24609375, "learning_rate": 9.86157350622043e-07, "loss": 0.0984, "reward": 1.5731947422027588, "reward_std": 0.2476077377796173, "rewards/accuracy_reward_stage2": 0.698194682598114, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 80 }, { "completion_length": 9.734375, "epoch": 0.014193096197652006, "grad_norm": 21.284203853284396, "kl": 0.023681640625, "learning_rate": 9.859821272121955e-07, "loss": 0.0094, "reward": 1.851102352142334, "reward_std": 0.10228273272514343, "rewards/accuracy_reward_stage2": 0.8511022329330444, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 81 }, { "completion_length": 17.515625, "epoch": 0.014368319607499562, "grad_norm": 22.317783264001836, "kl": 0.04931640625, "learning_rate": 9.85806903802348e-07, "loss": 0.0197, "reward": 1.4761021137237549, "reward_std": 0.19037950038909912, "rewards/accuracy_reward_stage2": 0.4761021137237549, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 82 }, { "completion_length": 7.703125, "epoch": 0.014543543017347118, "grad_norm": 17.43143667281669, "kl": 0.0133056640625, "learning_rate": 9.856316803925005e-07, "loss": 0.0053, "reward": 1.3431739807128906, "reward_std": 0.19102345407009125, "rewards/accuracy_reward_stage2": 0.3431740403175354, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 83 }, { "completion_length": 15.0, "epoch": 0.014718766427194673, "grad_norm": 29.21779230277411, "kl": 0.373046875, "learning_rate": 9.854564569826528e-07, "loss": 0.1492, "reward": 1.3184058666229248, "reward_std": 0.2299778163433075, "rewards/accuracy_reward_stage2": 0.4434059262275696, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 84 }, { "completion_length": 8.328125, "epoch": 0.014893989837042229, "grad_norm": 21.284746149411724, "kl": 0.046142578125, "learning_rate": 9.852812335728053e-07, "loss": 0.0184, "reward": 1.5364583730697632, "reward_std": 0.18556493520736694, "rewards/accuracy_reward_stage2": 0.5364583730697632, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 85 }, { "completion_length": 18.109375, "epoch": 0.015069213246889784, "grad_norm": 21.554042596269834, "kl": 0.58203125, "learning_rate": 9.851060101629576e-07, "loss": 0.2327, "reward": 1.343637228012085, "reward_std": 0.22345028817653656, "rewards/accuracy_reward_stage2": 0.46863725781440735, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 86 }, { "completion_length": 13.40625, "epoch": 0.01524443665673734, "grad_norm": 22.937433205252535, "kl": 0.044189453125, "learning_rate": 9.8493078675311e-07, "loss": 0.0177, "reward": 1.577603816986084, "reward_std": 0.11516597867012024, "rewards/accuracy_reward_stage2": 0.5776037573814392, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 87 }, { "completion_length": 16.046875, "epoch": 0.015419660066584896, "grad_norm": 20.89853454660946, "kl": 0.04248046875, "learning_rate": 9.847555633432626e-07, "loss": 0.017, "reward": 1.4262290000915527, "reward_std": 0.2058263123035431, "rewards/accuracy_reward_stage2": 0.4262291193008423, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 88 }, { "completion_length": 9.234375, "epoch": 0.015594883476432451, "grad_norm": 13.755470671094079, "kl": 0.01080322265625, "learning_rate": 9.84580339933415e-07, "loss": 0.0043, "reward": 1.6458332538604736, "reward_std": 0.0883883461356163, "rewards/accuracy_reward_stage2": 0.6458333134651184, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 89 }, { "completion_length": 6.765625, "epoch": 0.015770106886280007, "grad_norm": 11.623954354278077, "kl": 0.0299072265625, "learning_rate": 9.844051165235676e-07, "loss": 0.012, "reward": 1.609375, "reward_std": 0.10205793380737305, "rewards/accuracy_reward_stage2": 0.609375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 90 }, { "completion_length": 10.78125, "epoch": 0.015945330296127564, "grad_norm": 18.861997300165676, "kl": 0.0341796875, "learning_rate": 9.8422989311372e-07, "loss": 0.0137, "reward": 1.506962537765503, "reward_std": 0.18392382562160492, "rewards/accuracy_reward_stage2": 0.5069626569747925, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 91 }, { "completion_length": 9.25, "epoch": 0.016120553705975118, "grad_norm": 18104.437031192745, "kl": 31.0, "learning_rate": 9.840546697038723e-07, "loss": 12.3281, "reward": 1.3662315607070923, "reward_std": 0.11293835937976837, "rewards/accuracy_reward_stage2": 0.5068565607070923, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 92 }, { "completion_length": 24.9375, "epoch": 0.016295777115822675, "grad_norm": 4068.679729400268, "kl": 81.5, "learning_rate": 9.838794462940248e-07, "loss": 32.587, "reward": 1.2212448120117188, "reward_std": 0.21262651681900024, "rewards/accuracy_reward_stage2": 0.4868698716163635, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 93 }, { "completion_length": 10.703125, "epoch": 0.01647100052567023, "grad_norm": 21.628345918894205, "kl": 0.072265625, "learning_rate": 9.837042228841773e-07, "loss": 0.0289, "reward": 1.5775998830795288, "reward_std": 0.231684148311615, "rewards/accuracy_reward_stage2": 0.7025998830795288, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 94 }, { "completion_length": 11.5, "epoch": 0.016646223935517786, "grad_norm": 26.44437366688261, "kl": 0.0125732421875, "learning_rate": 9.835289994743298e-07, "loss": 0.005, "reward": 1.5052083730697632, "reward_std": 0.2688094973564148, "rewards/accuracy_reward_stage2": 0.5052083134651184, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 95 }, { "completion_length": 13.40625, "epoch": 0.01682144734536534, "grad_norm": 27.942135859948205, "kl": 0.05615234375, "learning_rate": 9.833537760644823e-07, "loss": -0.0218, "reward": 1.30165433883667, "reward_std": 0.2656284272670746, "rewards/accuracy_reward_stage2": 0.31727930903434753, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 96 }, { "completion_length": 16.5625, "epoch": 0.016996670755212898, "grad_norm": 23.460188028730357, "kl": 0.07177734375, "learning_rate": 9.831785526546346e-07, "loss": 0.0288, "reward": 1.3574440479278564, "reward_std": 0.17292845249176025, "rewards/accuracy_reward_stage2": 0.35744398832321167, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 97 }, { "completion_length": 8.453125, "epoch": 0.01717189416506045, "grad_norm": 18.965752150922896, "kl": 0.053466796875, "learning_rate": 9.83003329244787e-07, "loss": -0.0116, "reward": 1.6651774644851685, "reward_std": 0.19083932042121887, "rewards/accuracy_reward_stage2": 0.8058024644851685, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 98 }, { "completion_length": 13.875, "epoch": 0.01734711757490801, "grad_norm": 18.059570150986527, "kl": 0.0235595703125, "learning_rate": 9.828281058349396e-07, "loss": 0.0094, "reward": 1.640625, "reward_std": 0.2472364604473114, "rewards/accuracy_reward_stage2": 0.640625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 99 }, { "completion_length": 7.65625, "epoch": 0.017522340984755563, "grad_norm": 18.09970933493801, "kl": 0.0213623046875, "learning_rate": 9.826528824250918e-07, "loss": 0.0085, "reward": 1.4744908809661865, "reward_std": 0.2173069417476654, "rewards/accuracy_reward_stage2": 0.5994908809661865, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 100 }, { "completion_length": 7.5625, "epoch": 0.01769756439460312, "grad_norm": 27.227153684766957, "kl": 0.03369140625, "learning_rate": 9.824776590152443e-07, "loss": -0.0158, "reward": 1.4680397510528564, "reward_std": 0.26545101404190063, "rewards/accuracy_reward_stage2": 0.48366478085517883, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 101 }, { "completion_length": 12.3125, "epoch": 0.017872787804450674, "grad_norm": 21.805155630136777, "kl": 0.0098876953125, "learning_rate": 9.823024356053968e-07, "loss": 0.0039, "reward": 1.4559073448181152, "reward_std": 0.13833250105381012, "rewards/accuracy_reward_stage2": 0.45590728521347046, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 102 }, { "completion_length": 9.703125, "epoch": 0.01804801121429823, "grad_norm": 23.892836207416806, "kl": 0.020263671875, "learning_rate": 9.821272121955493e-07, "loss": 0.0081, "reward": 1.4594056606292725, "reward_std": 0.19585129618644714, "rewards/accuracy_reward_stage2": 0.7094056606292725, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 103 }, { "completion_length": 11.40625, "epoch": 0.018223234624145785, "grad_norm": 16.16795985343101, "kl": 0.024169921875, "learning_rate": 9.819519887857018e-07, "loss": 0.0097, "reward": 1.7032923698425293, "reward_std": 0.10313989222049713, "rewards/accuracy_reward_stage2": 0.7032923102378845, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 104 }, { "completion_length": 15.15625, "epoch": 0.018398458033993342, "grad_norm": 58.162353972663645, "kl": 0.486328125, "learning_rate": 9.81776765375854e-07, "loss": 0.1505, "reward": 1.5770833492279053, "reward_std": 0.23222008347511292, "rewards/accuracy_reward_stage2": 0.7177083492279053, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 105 }, { "completion_length": 10.890625, "epoch": 0.018573681443840896, "grad_norm": 20.3955912999849, "kl": 0.041259765625, "learning_rate": 9.816015419660066e-07, "loss": -0.0051, "reward": 1.7585219144821167, "reward_std": 0.18110564351081848, "rewards/accuracy_reward_stage2": 0.7741469144821167, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 106 }, { "completion_length": 7.9375, "epoch": 0.018748904853688454, "grad_norm": 33.10455387599469, "kl": 0.21484375, "learning_rate": 9.81426318556159e-07, "loss": 0.0861, "reward": 1.446201205253601, "reward_std": 0.26556122303009033, "rewards/accuracy_reward_stage2": 0.5712012052536011, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 107 }, { "completion_length": 9.453125, "epoch": 0.018924128263536007, "grad_norm": 28.010482936540864, "kl": 0.051513671875, "learning_rate": 9.812510951463116e-07, "loss": 0.0206, "reward": 1.5572054386138916, "reward_std": 0.18969415128231049, "rewards/accuracy_reward_stage2": 0.5572054386138916, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 108 }, { "completion_length": 13.609375, "epoch": 0.019099351673383565, "grad_norm": 22.80737044173408, "kl": 0.0888671875, "learning_rate": 9.81075871736464e-07, "loss": 0.0356, "reward": 1.2530102729797363, "reward_std": 0.27315306663513184, "rewards/accuracy_reward_stage2": 0.5030102729797363, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 109 }, { "completion_length": 7.265625, "epoch": 0.01927457508323112, "grad_norm": 19.569412790924364, "kl": 0.047119140625, "learning_rate": 9.809006483266164e-07, "loss": 0.0188, "reward": 1.7912015914916992, "reward_std": 0.22704292833805084, "rewards/accuracy_reward_stage2": 0.7912015318870544, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 110 }, { "completion_length": 7.5625, "epoch": 0.019449798493078676, "grad_norm": 25.806046811486276, "kl": 0.033935546875, "learning_rate": 9.807254249167688e-07, "loss": 0.0136, "reward": 1.692245364189148, "reward_std": 0.31582602858543396, "rewards/accuracy_reward_stage2": 0.6922453045845032, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 111 }, { "completion_length": 13.234375, "epoch": 0.01962502190292623, "grad_norm": 15.656314149690356, "kl": 0.05810546875, "learning_rate": 9.805502015069213e-07, "loss": 0.0232, "reward": 1.7992335557937622, "reward_std": 0.11169708520174026, "rewards/accuracy_reward_stage2": 0.7992335557937622, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 112 }, { "completion_length": 7.859375, "epoch": 0.019800245312773787, "grad_norm": 20.00385022632407, "kl": 0.0228271484375, "learning_rate": 9.803749780970736e-07, "loss": 0.0091, "reward": 1.5724248886108398, "reward_std": 0.23601973056793213, "rewards/accuracy_reward_stage2": 0.6974248886108398, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 113 }, { "completion_length": 10.046875, "epoch": 0.01997546872262134, "grad_norm": 19.010960047122616, "kl": 0.034423828125, "learning_rate": 9.801997546872261e-07, "loss": 0.0137, "reward": 1.6812996864318848, "reward_std": 0.17957565188407898, "rewards/accuracy_reward_stage2": 0.6812995672225952, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 114 }, { "completion_length": 14.515625, "epoch": 0.020150692132468898, "grad_norm": 19.989242334690157, "kl": 0.044677734375, "learning_rate": 9.800245312773786e-07, "loss": 0.0179, "reward": 1.6909170150756836, "reward_std": 0.1381261646747589, "rewards/accuracy_reward_stage2": 0.6909170150756836, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 115 }, { "completion_length": 8.96875, "epoch": 0.020325915542316452, "grad_norm": 21.574664105458634, "kl": 0.07275390625, "learning_rate": 9.79849307867531e-07, "loss": 0.0291, "reward": 1.5188727378845215, "reward_std": 0.21951830387115479, "rewards/accuracy_reward_stage2": 0.5188726186752319, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 116 }, { "completion_length": 9.0625, "epoch": 0.02050113895216401, "grad_norm": 21.074363399702218, "kl": 0.0296630859375, "learning_rate": 9.796740844576836e-07, "loss": -0.0757, "reward": 1.3273824453353882, "reward_std": 0.2471882849931717, "rewards/accuracy_reward_stage2": 0.3586324453353882, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 117 }, { "completion_length": 14.859375, "epoch": 0.020676362362011563, "grad_norm": 24.287310771519245, "kl": 0.0303955078125, "learning_rate": 9.794988610478359e-07, "loss": 0.0122, "reward": 1.3982466459274292, "reward_std": 0.24839340150356293, "rewards/accuracy_reward_stage2": 0.3982466161251068, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 118 }, { "completion_length": 12.390625, "epoch": 0.02085158577185912, "grad_norm": 23.822191359887427, "kl": 0.0380859375, "learning_rate": 9.793236376379884e-07, "loss": 0.0152, "reward": 1.2920645475387573, "reward_std": 0.15751829743385315, "rewards/accuracy_reward_stage2": 0.41706451773643494, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 119 }, { "completion_length": 9.8125, "epoch": 0.021026809181706674, "grad_norm": 20.83707735551321, "kl": 0.043212890625, "learning_rate": 9.791484142281409e-07, "loss": -0.0259, "reward": 1.4067494869232178, "reward_std": 0.21363842487335205, "rewards/accuracy_reward_stage2": 0.42237451672554016, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 120 }, { "completion_length": 11.75, "epoch": 0.021202032591554232, "grad_norm": 23.337085102664894, "kl": 0.0201416015625, "learning_rate": 9.789731908182933e-07, "loss": 0.0081, "reward": 1.697341799736023, "reward_std": 0.2553454339504242, "rewards/accuracy_reward_stage2": 0.697341799736023, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 121 }, { "completion_length": 10.0625, "epoch": 0.021377256001401786, "grad_norm": 23.09151113763775, "kl": 0.049072265625, "learning_rate": 9.787979674084458e-07, "loss": 0.0196, "reward": 1.4421296119689941, "reward_std": 0.34498897194862366, "rewards/accuracy_reward_stage2": 0.5671296119689941, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 122 }, { "completion_length": 9.21875, "epoch": 0.021552479411249343, "grad_norm": 18.3096250105465, "kl": 0.038818359375, "learning_rate": 9.786227439985981e-07, "loss": 0.0156, "reward": 1.2355936765670776, "reward_std": 0.211252823472023, "rewards/accuracy_reward_stage2": 0.36059367656707764, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 123 }, { "completion_length": 13.375, "epoch": 0.021727702821096897, "grad_norm": 17.483371856266107, "kl": 0.035888671875, "learning_rate": 9.784475205887506e-07, "loss": 0.0144, "reward": 1.6861112117767334, "reward_std": 0.12975779175758362, "rewards/accuracy_reward_stage2": 0.6861111521720886, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 124 }, { "completion_length": 8.53125, "epoch": 0.021902926230944454, "grad_norm": 14.352172913149598, "kl": 0.11572265625, "learning_rate": 9.78272297178903e-07, "loss": 0.0463, "reward": 1.4479167461395264, "reward_std": 0.06200198456645012, "rewards/accuracy_reward_stage2": 0.5729166865348816, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 125 }, { "completion_length": 11.484375, "epoch": 0.02207814964079201, "grad_norm": 18.73850177803472, "kl": 0.049560546875, "learning_rate": 9.780970737690554e-07, "loss": -0.0115, "reward": 1.3510587215423584, "reward_std": 0.2711937725543976, "rewards/accuracy_reward_stage2": 0.3666836619377136, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 126 }, { "completion_length": 6.65625, "epoch": 0.022253373050639565, "grad_norm": 19.53612239258542, "kl": 0.03173828125, "learning_rate": 9.779218503592079e-07, "loss": 0.0127, "reward": 1.7808170318603516, "reward_std": 0.14916422963142395, "rewards/accuracy_reward_stage2": 0.7808170318603516, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 127 }, { "completion_length": 12.125, "epoch": 0.022428596460487123, "grad_norm": 24.190435873177584, "kl": 0.0128173828125, "learning_rate": 9.777466269493604e-07, "loss": 0.0051, "reward": 1.7245370149612427, "reward_std": 0.22146297991275787, "rewards/accuracy_reward_stage2": 0.7245370149612427, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 128 }, { "completion_length": 12.3125, "epoch": 0.022603819870334677, "grad_norm": 24.571036917758526, "kl": 0.09912109375, "learning_rate": 9.775714035395129e-07, "loss": -0.0045, "reward": 1.5884959697723389, "reward_std": 0.20376023650169373, "rewards/accuracy_reward_stage2": 0.6041209697723389, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 129 }, { "completion_length": 8.75, "epoch": 0.022779043280182234, "grad_norm": 17.644400774669826, "kl": 0.04638671875, "learning_rate": 9.773961801296654e-07, "loss": 0.0186, "reward": 1.5527987480163574, "reward_std": 0.11296023428440094, "rewards/accuracy_reward_stage2": 0.6777988076210022, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 130 }, { "completion_length": 8.546875, "epoch": 0.022954266690029788, "grad_norm": 970.1329773138383, "kl": 1.8203125, "learning_rate": 9.772209567198178e-07, "loss": 0.728, "reward": 1.5097854137420654, "reward_std": 0.06071118637919426, "rewards/accuracy_reward_stage2": 0.6347853541374207, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 131 }, { "completion_length": 8.140625, "epoch": 0.023129490099877345, "grad_norm": 13.697885280020044, "kl": 0.043212890625, "learning_rate": 9.770457333099701e-07, "loss": 0.0172, "reward": 1.610494613647461, "reward_std": 0.12853994965553284, "rewards/accuracy_reward_stage2": 0.6104945540428162, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 132 }, { "completion_length": 9.265625, "epoch": 0.0233047135097249, "grad_norm": 14.959578456824348, "kl": 0.040771484375, "learning_rate": 9.768705099001226e-07, "loss": 0.0163, "reward": 1.4808006286621094, "reward_std": 0.2097875326871872, "rewards/accuracy_reward_stage2": 0.48080065846443176, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 133 }, { "completion_length": 7.921875, "epoch": 0.023479936919572456, "grad_norm": 17.78254745647574, "kl": 0.0240478515625, "learning_rate": 9.766952864902751e-07, "loss": 0.0096, "reward": 1.9173030853271484, "reward_std": 0.1365506649017334, "rewards/accuracy_reward_stage2": 0.9173030853271484, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 134 }, { "completion_length": 11.65625, "epoch": 0.02365516032942001, "grad_norm": 33.701675996088376, "kl": 0.0198974609375, "learning_rate": 9.765200630804274e-07, "loss": 0.008, "reward": 1.2959372997283936, "reward_std": 0.22484754025936127, "rewards/accuracy_reward_stage2": 0.42093732953071594, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 135 }, { "completion_length": 10.296875, "epoch": 0.023830383739267567, "grad_norm": 26.21797926317752, "kl": 0.027099609375, "learning_rate": 9.763448396705799e-07, "loss": 0.0109, "reward": 1.248408555984497, "reward_std": 0.2923775911331177, "rewards/accuracy_reward_stage2": 0.24840857088565826, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 136 }, { "completion_length": 11.21875, "epoch": 0.02400560714911512, "grad_norm": 18.800406173940722, "kl": 0.051025390625, "learning_rate": 9.761696162607324e-07, "loss": 0.0205, "reward": 1.3072917461395264, "reward_std": 0.19727420806884766, "rewards/accuracy_reward_stage2": 0.4322916567325592, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 137 }, { "completion_length": 10.359375, "epoch": 0.02418083055896268, "grad_norm": 25.398987769188867, "kl": 0.052978515625, "learning_rate": 9.759943928508849e-07, "loss": 0.0212, "reward": 1.514993667602539, "reward_std": 0.3276137113571167, "rewards/accuracy_reward_stage2": 0.5149936676025391, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 138 }, { "completion_length": 9.8125, "epoch": 0.024356053968810232, "grad_norm": 24.531852080551108, "kl": 0.053466796875, "learning_rate": 9.758191694410374e-07, "loss": 0.0214, "reward": 1.5364623069763184, "reward_std": 0.19039994478225708, "rewards/accuracy_reward_stage2": 0.5364623665809631, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 139 }, { "completion_length": 9.125, "epoch": 0.02453127737865779, "grad_norm": 40.319697076275226, "kl": 0.2138671875, "learning_rate": 9.756439460311896e-07, "loss": 0.0853, "reward": 1.2618070840835571, "reward_std": 0.18684542179107666, "rewards/accuracy_reward_stage2": 0.38680708408355713, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 140 }, { "completion_length": 11.734375, "epoch": 0.024706500788505344, "grad_norm": 39.53407951213018, "kl": 0.080078125, "learning_rate": 9.754687226213421e-07, "loss": 0.0321, "reward": 1.3220620155334473, "reward_std": 0.20267724990844727, "rewards/accuracy_reward_stage2": 0.3220618963241577, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 141 }, { "completion_length": 14.375, "epoch": 0.0248817241983529, "grad_norm": 16.356615395919924, "kl": 0.031005859375, "learning_rate": 9.752934992114946e-07, "loss": 0.0124, "reward": 1.3732510805130005, "reward_std": 0.09298402070999146, "rewards/accuracy_reward_stage2": 0.4982510209083557, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 142 }, { "completion_length": 14.5, "epoch": 0.025056947608200455, "grad_norm": 20.485881481845645, "kl": 0.06591796875, "learning_rate": 9.751182758016471e-07, "loss": 0.0264, "reward": 1.42328941822052, "reward_std": 0.1043790802359581, "rewards/accuracy_reward_stage2": 0.42328938841819763, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 143 }, { "completion_length": 13.03125, "epoch": 0.025232171018048012, "grad_norm": 692.1862884429493, "kl": 1.8828125, "learning_rate": 9.749430523917996e-07, "loss": 0.7519, "reward": 1.450892686843872, "reward_std": 0.22691306471824646, "rewards/accuracy_reward_stage2": 0.5758926868438721, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 144 }, { "completion_length": 6.90625, "epoch": 0.025407394427895566, "grad_norm": 18.93751606206503, "kl": 0.01904296875, "learning_rate": 9.74767828981952e-07, "loss": 0.0076, "reward": 1.556060552597046, "reward_std": 0.1706404983997345, "rewards/accuracy_reward_stage2": 0.5560606122016907, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 145 }, { "completion_length": 9.671875, "epoch": 0.025582617837743123, "grad_norm": 17.819420207093465, "kl": 0.0478515625, "learning_rate": 9.745926055721044e-07, "loss": 0.0191, "reward": 1.4579105377197266, "reward_std": 0.08065672963857651, "rewards/accuracy_reward_stage2": 0.4579104781150818, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 146 }, { "completion_length": 8.5, "epoch": 0.025757841247590677, "grad_norm": 22.182762747653655, "kl": 0.041259765625, "learning_rate": 9.744173821622569e-07, "loss": 0.0165, "reward": 1.6844103336334229, "reward_std": 0.13641130924224854, "rewards/accuracy_reward_stage2": 0.6844102144241333, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 147 }, { "completion_length": 13.921875, "epoch": 0.025933064657438235, "grad_norm": 16.81373447634611, "kl": 0.1357421875, "learning_rate": 9.742421587524092e-07, "loss": 0.0544, "reward": 1.344390869140625, "reward_std": 0.13242456316947937, "rewards/accuracy_reward_stage2": 0.46939074993133545, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 148 }, { "completion_length": 15.359375, "epoch": 0.02610828806728579, "grad_norm": 60.27386553230432, "kl": 0.6953125, "learning_rate": 9.740669353425617e-07, "loss": 0.2776, "reward": 1.1980289220809937, "reward_std": 0.04343012720346451, "rewards/accuracy_reward_stage2": 0.4480289816856384, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 149 }, { "completion_length": 18.3125, "epoch": 0.026283511477133346, "grad_norm": 24.141913410064443, "kl": 0.06884765625, "learning_rate": 9.738917119327141e-07, "loss": 0.0275, "reward": 1.2728391885757446, "reward_std": 0.19992250204086304, "rewards/accuracy_reward_stage2": 0.39783918857574463, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 150 }, { "completion_length": 11.3125, "epoch": 0.0264587348869809, "grad_norm": 24.36720305750165, "kl": 0.10791015625, "learning_rate": 9.737164885228666e-07, "loss": 0.0433, "reward": 1.547379970550537, "reward_std": 0.1736781895160675, "rewards/accuracy_reward_stage2": 0.6723799109458923, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 151 }, { "completion_length": 14.453125, "epoch": 0.026633958296828457, "grad_norm": 22.95786679226241, "kl": 0.06494140625, "learning_rate": 9.735412651130191e-07, "loss": 0.0259, "reward": 1.5407392978668213, "reward_std": 0.15379472076892853, "rewards/accuracy_reward_stage2": 0.5407392978668213, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 152 }, { "completion_length": 8.1875, "epoch": 0.02680918170667601, "grad_norm": 21.91910468296093, "kl": 0.287109375, "learning_rate": 9.733660417031714e-07, "loss": 0.1147, "reward": 1.4202229976654053, "reward_std": 0.07398553192615509, "rewards/accuracy_reward_stage2": 0.5452229976654053, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 153 }, { "completion_length": 11.21875, "epoch": 0.026984405116523568, "grad_norm": 22.202942153600965, "kl": 0.03173828125, "learning_rate": 9.73190818293324e-07, "loss": 0.0127, "reward": 1.4828336238861084, "reward_std": 0.1846798062324524, "rewards/accuracy_reward_stage2": 0.4828336834907532, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 154 }, { "completion_length": 14.796875, "epoch": 0.027159628526371122, "grad_norm": 16.491804505949233, "kl": 0.031005859375, "learning_rate": 9.730155948834764e-07, "loss": -0.0318, "reward": 1.6083829402923584, "reward_std": 0.1673525720834732, "rewards/accuracy_reward_stage2": 0.6240079402923584, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 155 }, { "completion_length": 7.953125, "epoch": 0.02733485193621868, "grad_norm": 19.67446432653083, "kl": 0.01806640625, "learning_rate": 9.728403714736289e-07, "loss": -0.0369, "reward": 1.6960554122924805, "reward_std": 0.167361319065094, "rewards/accuracy_reward_stage2": 0.7116804718971252, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 156 }, { "completion_length": 8.796875, "epoch": 0.027510075346066233, "grad_norm": 17.68099975023594, "kl": 0.1435546875, "learning_rate": 9.726651480637814e-07, "loss": 0.0572, "reward": 1.2521253824234009, "reward_std": 0.09048113971948624, "rewards/accuracy_reward_stage2": 0.5021253824234009, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 157 }, { "completion_length": 9.84375, "epoch": 0.02768529875591379, "grad_norm": 19.589334117115808, "kl": 0.02685546875, "learning_rate": 9.724899246539337e-07, "loss": 0.0107, "reward": 1.4374645948410034, "reward_std": 0.22100204229354858, "rewards/accuracy_reward_stage2": 0.5624645352363586, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 158 }, { "completion_length": 7.90625, "epoch": 0.027860522165761344, "grad_norm": 16.88466818293425, "kl": 0.1435546875, "learning_rate": 9.723147012440862e-07, "loss": 0.0575, "reward": 1.286747694015503, "reward_std": 0.07868210971355438, "rewards/accuracy_reward_stage2": 0.41174769401550293, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 159 }, { "completion_length": 9.859375, "epoch": 0.0280357455756089, "grad_norm": 24.05845918121416, "kl": 0.287109375, "learning_rate": 9.721394778342387e-07, "loss": 0.0704, "reward": 1.385817289352417, "reward_std": 0.24782794713974, "rewards/accuracy_reward_stage2": 0.5420673489570618, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 160 }, { "completion_length": 11.796875, "epoch": 0.028210968985456455, "grad_norm": 24.903654926382757, "kl": 0.2236328125, "learning_rate": 9.71964254424391e-07, "loss": 0.0894, "reward": 1.41621732711792, "reward_std": 0.21123412251472473, "rewards/accuracy_reward_stage2": 0.5412173271179199, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 161 }, { "completion_length": 5.984375, "epoch": 0.028386192395304013, "grad_norm": 18.93121586861874, "kl": 0.08984375, "learning_rate": 9.717890310145434e-07, "loss": 0.0143, "reward": 1.6027777194976807, "reward_std": 0.15436765551567078, "rewards/accuracy_reward_stage2": 0.7434027791023254, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 162 }, { "completion_length": 12.09375, "epoch": 0.028561415805151567, "grad_norm": 21.863912604715342, "kl": 0.035888671875, "learning_rate": 9.71613807604696e-07, "loss": 0.0143, "reward": 1.4885514974594116, "reward_std": 0.197072833776474, "rewards/accuracy_reward_stage2": 0.4885514974594116, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 163 }, { "completion_length": 9.609375, "epoch": 0.028736639214999124, "grad_norm": 12.925116224769734, "kl": 0.0289306640625, "learning_rate": 9.714385841948484e-07, "loss": -0.0258, "reward": 1.3930555582046509, "reward_std": 0.1789308786392212, "rewards/accuracy_reward_stage2": 0.5336805582046509, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 164 }, { "completion_length": 10.921875, "epoch": 0.028911862624846678, "grad_norm": 23.33889593451729, "kl": 0.12255859375, "learning_rate": 9.71263360785001e-07, "loss": 0.049, "reward": 1.521234154701233, "reward_std": 0.2500312328338623, "rewards/accuracy_reward_stage2": 0.5212341547012329, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 165 }, { "completion_length": 17.75, "epoch": 0.029087086034694235, "grad_norm": 27.358894272466987, "kl": 0.038818359375, "learning_rate": 9.710881373751532e-07, "loss": 0.0156, "reward": 1.204958200454712, "reward_std": 0.1550707370042801, "rewards/accuracy_reward_stage2": 0.7049582004547119, "rewards/format_reward_stage1_pointerpad": 0.5, "scores/accuracy_reward_stage2": 0.5, "step": 166 }, { "completion_length": 11.21875, "epoch": 0.029262309444541793, "grad_norm": 469.49636178742867, "kl": 1.5078125, "learning_rate": 9.709129139653057e-07, "loss": 0.569, "reward": 1.3361520767211914, "reward_std": 0.29897385835647583, "rewards/accuracy_reward_stage2": 0.6017770767211914, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 167 }, { "completion_length": 12.328125, "epoch": 0.029437532854389346, "grad_norm": 239.23585360135849, "kl": 0.828125, "learning_rate": 9.707376905554582e-07, "loss": 0.2873, "reward": 1.2365100383758545, "reward_std": 0.15128737688064575, "rewards/accuracy_reward_stage2": 0.5021350383758545, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 168 }, { "completion_length": 8.453125, "epoch": 0.029612756264236904, "grad_norm": 24.00686702492152, "kl": 0.07861328125, "learning_rate": 9.705624671456107e-07, "loss": 0.0315, "reward": 1.5308879613876343, "reward_std": 0.12414561212062836, "rewards/accuracy_reward_stage2": 0.6558879613876343, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 169 }, { "completion_length": 10.9375, "epoch": 0.029787979674084458, "grad_norm": 16.30119126944678, "kl": 0.1953125, "learning_rate": 9.703872437357632e-07, "loss": 0.0782, "reward": 1.1700856685638428, "reward_std": 0.08398524671792984, "rewards/accuracy_reward_stage2": 0.42008569836616516, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 170 }, { "completion_length": 11.578125, "epoch": 0.029963203083932015, "grad_norm": 19.33276262182998, "kl": 0.0791015625, "learning_rate": 9.702120203259154e-07, "loss": 0.0316, "reward": 1.5743356943130493, "reward_std": 0.08988235145807266, "rewards/accuracy_reward_stage2": 0.6993356943130493, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 171 }, { "completion_length": 7.609375, "epoch": 0.03013842649377957, "grad_norm": 17.819788413421964, "kl": 0.024169921875, "learning_rate": 9.70036796916068e-07, "loss": 0.0097, "reward": 1.546875, "reward_std": 0.19044628739356995, "rewards/accuracy_reward_stage2": 0.546875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 172 }, { "completion_length": 8.59375, "epoch": 0.030313649903627126, "grad_norm": 17.311509596973274, "kl": 0.017578125, "learning_rate": 9.698615735062204e-07, "loss": 0.007, "reward": 1.609375, "reward_std": 0.23144522309303284, "rewards/accuracy_reward_stage2": 0.609375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 173 }, { "completion_length": 15.359375, "epoch": 0.03048887331347468, "grad_norm": 18.7946667032825, "kl": 0.2890625, "learning_rate": 9.696863500963727e-07, "loss": 0.1151, "reward": 1.5292927026748657, "reward_std": 0.15632925927639008, "rewards/accuracy_reward_stage2": 0.6542927026748657, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 174 }, { "completion_length": 9.875, "epoch": 0.030664096723322237, "grad_norm": 23.606318369291483, "kl": 0.0322265625, "learning_rate": 9.695111266865252e-07, "loss": 0.0129, "reward": 1.5562366247177124, "reward_std": 0.22413820028305054, "rewards/accuracy_reward_stage2": 0.5562366247177124, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 175 }, { "completion_length": 6.15625, "epoch": 0.03083932013316979, "grad_norm": 20.00259671512067, "kl": 0.051025390625, "learning_rate": 9.693359032766777e-07, "loss": 0.0204, "reward": 1.7697513103485107, "reward_std": 0.18249884247779846, "rewards/accuracy_reward_stage2": 0.7697513103485107, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 176 }, { "completion_length": 13.484375, "epoch": 0.03101454354301735, "grad_norm": 20.382631126127297, "kl": 0.08544921875, "learning_rate": 9.691606798668302e-07, "loss": 0.0341, "reward": 1.5370434522628784, "reward_std": 0.2536548376083374, "rewards/accuracy_reward_stage2": 0.5370435118675232, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 177 }, { "completion_length": 7.84375, "epoch": 0.031189766952864902, "grad_norm": 15.427236203193937, "kl": 0.03759765625, "learning_rate": 9.689854564569827e-07, "loss": 0.0151, "reward": 1.4042786359786987, "reward_std": 0.19416913390159607, "rewards/accuracy_reward_stage2": 0.5292786359786987, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 178 }, { "completion_length": 5.234375, "epoch": 0.03136499036271246, "grad_norm": 16.107007209106428, "kl": 0.0294189453125, "learning_rate": 9.68810233047135e-07, "loss": 0.0118, "reward": 1.453125, "reward_std": 0.12255740165710449, "rewards/accuracy_reward_stage2": 0.453125, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 179 }, { "completion_length": 8.0, "epoch": 0.03154021377256001, "grad_norm": 29.971484972401523, "kl": 0.146484375, "learning_rate": 9.686350096372874e-07, "loss": 0.0584, "reward": 1.516639232635498, "reward_std": 0.3135683536529541, "rewards/accuracy_reward_stage2": 0.5166392922401428, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 180 }, { "completion_length": 18.203125, "epoch": 0.03171543718240757, "grad_norm": 20.36688864200073, "kl": 0.56640625, "learning_rate": 9.6845978622744e-07, "loss": 0.227, "reward": 1.384101390838623, "reward_std": 0.19463613629341125, "rewards/accuracy_reward_stage2": 0.5091014504432678, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 181 }, { "completion_length": 9.65625, "epoch": 0.03189066059225513, "grad_norm": 24.92217917167458, "kl": 0.314453125, "learning_rate": 9.682845628175924e-07, "loss": 0.0966, "reward": 1.561603307723999, "reward_std": 0.21581391990184784, "rewards/accuracy_reward_stage2": 0.702228307723999, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 182 }, { "completion_length": 7.84375, "epoch": 0.03206588400210268, "grad_norm": 17.739746197602084, "kl": 0.2294921875, "learning_rate": 9.68109339407745e-07, "loss": 0.0914, "reward": 1.410539150238037, "reward_std": 0.17888331413269043, "rewards/accuracy_reward_stage2": 0.5355392098426819, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 183 }, { "completion_length": 6.40625, "epoch": 0.032241107411950236, "grad_norm": 21.32780493688504, "kl": 0.017333984375, "learning_rate": 9.679341159978974e-07, "loss": 0.0069, "reward": 1.5488324165344238, "reward_std": 0.17159403860569, "rewards/accuracy_reward_stage2": 0.5488324165344238, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 184 }, { "completion_length": 8.65625, "epoch": 0.03241633082179779, "grad_norm": 10.965259085064249, "kl": 0.0186767578125, "learning_rate": 9.677588925880497e-07, "loss": 0.0075, "reward": 1.53125, "reward_std": 0.10888782143592834, "rewards/accuracy_reward_stage2": 0.53125, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 185 }, { "completion_length": 6.46875, "epoch": 0.03259155423164535, "grad_norm": 31.56035135634499, "kl": 0.025146484375, "learning_rate": 9.675836691782022e-07, "loss": 0.01, "reward": 1.390625, "reward_std": 0.2688094973564148, "rewards/accuracy_reward_stage2": 0.390625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 186 }, { "completion_length": 9.453125, "epoch": 0.032766777641492904, "grad_norm": 21.307798882002583, "kl": 0.0859375, "learning_rate": 9.674084457683545e-07, "loss": 0.0344, "reward": 1.7048611640930176, "reward_std": 0.1740472912788391, "rewards/accuracy_reward_stage2": 0.7048612236976624, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 187 }, { "completion_length": 12.828125, "epoch": 0.03294200105134046, "grad_norm": 27.47034209459053, "kl": 0.5703125, "learning_rate": 9.67233222358507e-07, "loss": 0.2284, "reward": 1.5460162162780762, "reward_std": 0.13096138834953308, "rewards/accuracy_reward_stage2": 0.671016275882721, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 188 }, { "completion_length": 11.078125, "epoch": 0.03311722446118801, "grad_norm": 613.5396132456912, "kl": 0.87109375, "learning_rate": 9.670579989486595e-07, "loss": 0.3496, "reward": 1.3907642364501953, "reward_std": 0.11250603199005127, "rewards/accuracy_reward_stage2": 0.5157641768455505, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 189 }, { "completion_length": 11.125, "epoch": 0.03329244787103557, "grad_norm": 22.822592319874765, "kl": 0.06884765625, "learning_rate": 9.66882775538812e-07, "loss": 0.0274, "reward": 1.5781188011169434, "reward_std": 0.17464013397693634, "rewards/accuracy_reward_stage2": 0.5781188011169434, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 190 }, { "completion_length": 12.0, "epoch": 0.03346767128088313, "grad_norm": 23.259903802071136, "kl": 0.07763671875, "learning_rate": 9.667075521289644e-07, "loss": 0.031, "reward": 1.4189984798431396, "reward_std": 0.2558039128780365, "rewards/accuracy_reward_stage2": 0.4189985394477844, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 191 }, { "completion_length": 9.234375, "epoch": 0.03364289469073068, "grad_norm": 28.16087420682577, "kl": 0.04296875, "learning_rate": 9.66532328719117e-07, "loss": 0.0172, "reward": 1.5628974437713623, "reward_std": 0.2809803783893585, "rewards/accuracy_reward_stage2": 0.5628974437713623, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 192 }, { "completion_length": 8.625, "epoch": 0.033818118100578234, "grad_norm": 18.956653677151603, "kl": 0.099609375, "learning_rate": 9.663571053092692e-07, "loss": 0.0399, "reward": 1.7109836339950562, "reward_std": 0.09243927150964737, "rewards/accuracy_reward_stage2": 0.7109836339950562, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 193 }, { "completion_length": 7.90625, "epoch": 0.033993341510425795, "grad_norm": 20.13826896551377, "kl": 0.04833984375, "learning_rate": 9.661818818994217e-07, "loss": 0.0194, "reward": 1.592308759689331, "reward_std": 0.09325343370437622, "rewards/accuracy_reward_stage2": 0.592308759689331, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 194 }, { "completion_length": 13.1875, "epoch": 0.03416856492027335, "grad_norm": 29.30599734850387, "kl": 0.03173828125, "learning_rate": 9.660066584895742e-07, "loss": 0.0127, "reward": 1.53125, "reward_std": 0.28566449880599976, "rewards/accuracy_reward_stage2": 0.65625, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 195 }, { "completion_length": 14.375, "epoch": 0.0343437883301209, "grad_norm": 47.13706959580855, "kl": 0.2236328125, "learning_rate": 9.658314350797267e-07, "loss": 0.0496, "reward": 1.4006702899932861, "reward_std": 0.13923153281211853, "rewards/accuracy_reward_stage2": 0.5412952303886414, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 196 }, { "completion_length": 8.3125, "epoch": 0.03451901173996846, "grad_norm": 22.833902373519475, "kl": 0.1318359375, "learning_rate": 9.656562116698792e-07, "loss": 0.0524, "reward": 1.2781250476837158, "reward_std": 0.12562815845012665, "rewards/accuracy_reward_stage2": 0.528124988079071, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 197 }, { "completion_length": 16.59375, "epoch": 0.03469423514981602, "grad_norm": 665.8480304252839, "kl": 3.71875, "learning_rate": 9.654809882600315e-07, "loss": 1.4827, "reward": 1.6354167461395264, "reward_std": 0.1997472047805786, "rewards/accuracy_reward_stage2": 0.8854166865348816, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 198 }, { "completion_length": 10.0, "epoch": 0.03486945855966357, "grad_norm": 24.530226882177114, "kl": 0.115234375, "learning_rate": 9.65305764850184e-07, "loss": 0.0461, "reward": 1.4895833730697632, "reward_std": 0.1462521106004715, "rewards/accuracy_reward_stage2": 0.6145833730697632, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 199 }, { "completion_length": 7.5, "epoch": 0.035044681969511125, "grad_norm": 16.836862354365994, "kl": 0.0072021484375, "learning_rate": 9.651305414403364e-07, "loss": 0.0029, "reward": 1.490378499031067, "reward_std": 0.11636392772197723, "rewards/accuracy_reward_stage2": 0.6153784990310669, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 200 }, { "completion_length": 11.125, "epoch": 0.03521990537935868, "grad_norm": 22.060189107320923, "kl": 0.054931640625, "learning_rate": 9.649553180304887e-07, "loss": 0.022, "reward": 1.3880213499069214, "reward_std": 0.16327911615371704, "rewards/accuracy_reward_stage2": 0.3880213499069214, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 201 }, { "completion_length": 8.25, "epoch": 0.03539512878920624, "grad_norm": 19.505067645334403, "kl": 0.039306640625, "learning_rate": 9.647800946206412e-07, "loss": -0.0284, "reward": 1.773182988166809, "reward_std": 0.1636386662721634, "rewards/accuracy_reward_stage2": 0.7888079881668091, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 202 }, { "completion_length": 11.125, "epoch": 0.035570352199053794, "grad_norm": 26.28569376820353, "kl": 0.0673828125, "learning_rate": 9.646048712107937e-07, "loss": 0.0269, "reward": 1.1831471920013428, "reward_std": 0.20850315690040588, "rewards/accuracy_reward_stage2": 0.43314728140830994, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 203 }, { "completion_length": 10.1875, "epoch": 0.03574557560890135, "grad_norm": 22.462333146273004, "kl": 0.06640625, "learning_rate": 9.644296478009462e-07, "loss": 0.0265, "reward": 1.3284682035446167, "reward_std": 0.23116865754127502, "rewards/accuracy_reward_stage2": 0.3284682035446167, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 204 }, { "completion_length": 23.359375, "epoch": 0.0359207990187489, "grad_norm": 43.353709695881726, "kl": 0.040283203125, "learning_rate": 9.642544243910987e-07, "loss": 0.0162, "reward": 1.2655521631240845, "reward_std": 0.29400354623794556, "rewards/accuracy_reward_stage2": 0.26555219292640686, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 205 }, { "completion_length": 8.25, "epoch": 0.03609602242859646, "grad_norm": 17.987108141910905, "kl": 0.04248046875, "learning_rate": 9.64079200981251e-07, "loss": 0.0171, "reward": 1.8145318031311035, "reward_std": 0.21234536170959473, "rewards/accuracy_reward_stage2": 0.8145317435264587, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 206 }, { "completion_length": 10.703125, "epoch": 0.036271245838444016, "grad_norm": 27.1027412842586, "kl": 0.412109375, "learning_rate": 9.639039775714035e-07, "loss": 0.1648, "reward": 1.2671058177947998, "reward_std": 0.24343198537826538, "rewards/accuracy_reward_stage2": 0.5171056985855103, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 207 }, { "completion_length": 11.21875, "epoch": 0.03644646924829157, "grad_norm": 11.33940008261332, "kl": 0.02978515625, "learning_rate": 9.63728754161556e-07, "loss": 0.0119, "reward": 1.3489539623260498, "reward_std": 0.0795942097902298, "rewards/accuracy_reward_stage2": 0.3489539623260498, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 208 }, { "completion_length": 7.265625, "epoch": 0.03662169265813913, "grad_norm": 18.64581413706451, "kl": 0.00799560546875, "learning_rate": 9.635535307517085e-07, "loss": -0.041, "reward": 1.5224037170410156, "reward_std": 0.18349644541740417, "rewards/accuracy_reward_stage2": 0.5380287170410156, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 209 }, { "completion_length": 10.203125, "epoch": 0.036796916067986685, "grad_norm": 26.531838880659922, "kl": 0.06494140625, "learning_rate": 9.63378307341861e-07, "loss": 0.026, "reward": 1.6403162479400635, "reward_std": 0.36213648319244385, "rewards/accuracy_reward_stage2": 0.640316367149353, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 210 }, { "completion_length": 11.421875, "epoch": 0.03697213947783424, "grad_norm": 19.67305372025666, "kl": 0.142578125, "learning_rate": 9.632030839320132e-07, "loss": 0.0571, "reward": 1.446283221244812, "reward_std": 0.11086312681436539, "rewards/accuracy_reward_stage2": 0.5712832808494568, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 211 }, { "completion_length": 7.921875, "epoch": 0.03714736288768179, "grad_norm": 18.73395276268249, "kl": 0.0216064453125, "learning_rate": 9.630278605221657e-07, "loss": 0.0087, "reward": 1.742701530456543, "reward_std": 0.164242684841156, "rewards/accuracy_reward_stage2": 0.7427014708518982, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 212 }, { "completion_length": 15.359375, "epoch": 0.03732258629752935, "grad_norm": 18.465353708149525, "kl": 0.045654296875, "learning_rate": 9.628526371123182e-07, "loss": 0.0183, "reward": 1.1487271785736084, "reward_std": 0.11971971392631531, "rewards/accuracy_reward_stage2": 0.2737271785736084, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 213 }, { "completion_length": 10.8125, "epoch": 0.03749780970737691, "grad_norm": 17.321449052890124, "kl": 0.02392578125, "learning_rate": 9.626774137024705e-07, "loss": 0.0096, "reward": 1.5291376113891602, "reward_std": 0.15100376307964325, "rewards/accuracy_reward_stage2": 0.5291374921798706, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 214 }, { "completion_length": 7.75, "epoch": 0.03767303311722446, "grad_norm": 21.689215854097338, "kl": 0.12109375, "learning_rate": 9.62502190292623e-07, "loss": 0.0485, "reward": 1.4410247802734375, "reward_std": 0.3011726140975952, "rewards/accuracy_reward_stage2": 0.5660248398780823, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 215 }, { "completion_length": 8.359375, "epoch": 0.037848256527072015, "grad_norm": 23.09518268911407, "kl": 0.0869140625, "learning_rate": 9.623269668827755e-07, "loss": -0.0094, "reward": 1.4675501585006714, "reward_std": 0.1814957708120346, "rewards/accuracy_reward_stage2": 0.483175128698349, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 216 }, { "completion_length": 12.734375, "epoch": 0.038023479936919576, "grad_norm": 19.483904450481027, "kl": 0.11962890625, "learning_rate": 9.62151743472928e-07, "loss": 0.0477, "reward": 1.4069256782531738, "reward_std": 0.17002731561660767, "rewards/accuracy_reward_stage2": 0.531925618648529, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 217 }, { "completion_length": 6.53125, "epoch": 0.03819870334676713, "grad_norm": 38.87952022905316, "kl": 0.09521484375, "learning_rate": 9.619765200630805e-07, "loss": -0.006, "reward": 1.8126009702682495, "reward_std": 0.19280743598937988, "rewards/accuracy_reward_stage2": 0.8282259702682495, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 218 }, { "completion_length": 9.546875, "epoch": 0.03837392675661468, "grad_norm": 18.565108749119865, "kl": 0.1416015625, "learning_rate": 9.618012966532327e-07, "loss": 0.0565, "reward": 1.5609869956970215, "reward_std": 0.1407136768102646, "rewards/accuracy_reward_stage2": 0.8109869956970215, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 219 }, { "completion_length": 8.65625, "epoch": 0.03854915016646224, "grad_norm": 15.210793388574377, "kl": 0.01385498046875, "learning_rate": 9.616260732433852e-07, "loss": 0.0055, "reward": 1.5824074745178223, "reward_std": 0.16188913583755493, "rewards/accuracy_reward_stage2": 0.5824074149131775, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 220 }, { "completion_length": 8.984375, "epoch": 0.0387243735763098, "grad_norm": 21.186984746548617, "kl": 0.056396484375, "learning_rate": 9.614508498335377e-07, "loss": 0.0225, "reward": 1.4985287189483643, "reward_std": 0.12607437372207642, "rewards/accuracy_reward_stage2": 0.49852871894836426, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 221 }, { "completion_length": 11.15625, "epoch": 0.03889959698615735, "grad_norm": 16.93524040273681, "kl": 0.060546875, "learning_rate": 9.612756264236902e-07, "loss": 0.0243, "reward": 1.6816024780273438, "reward_std": 0.13517498970031738, "rewards/accuracy_reward_stage2": 0.6816024780273438, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 222 }, { "completion_length": 10.109375, "epoch": 0.039074820396004906, "grad_norm": 23.577441458972622, "kl": 0.07275390625, "learning_rate": 9.611004030138427e-07, "loss": 0.0292, "reward": 1.5530338287353516, "reward_std": 0.18569764494895935, "rewards/accuracy_reward_stage2": 0.5530339479446411, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 223 }, { "completion_length": 15.015625, "epoch": 0.03925004380585246, "grad_norm": 6141.6201343970815, "kl": 15.6875, "learning_rate": 9.60925179603995e-07, "loss": 6.2679, "reward": 1.1519737243652344, "reward_std": 0.23728567361831665, "rewards/accuracy_reward_stage2": 0.417598694562912, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 224 }, { "completion_length": 10.21875, "epoch": 0.03942526721570002, "grad_norm": 21.489482009145725, "kl": 0.0240478515625, "learning_rate": 9.607499561941475e-07, "loss": 0.0096, "reward": 1.7004191875457764, "reward_std": 0.15592418611049652, "rewards/accuracy_reward_stage2": 0.7004191875457764, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 225 }, { "completion_length": 9.953125, "epoch": 0.039600490625547574, "grad_norm": 16.51799687820737, "kl": 0.0250244140625, "learning_rate": 9.605747327843e-07, "loss": 0.01, "reward": 1.6579914093017578, "reward_std": 0.16353479027748108, "rewards/accuracy_reward_stage2": 0.657991349697113, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 226 }, { "completion_length": 36.53125, "epoch": 0.03977571403539513, "grad_norm": 20.3875835203367, "kl": 0.018310546875, "learning_rate": 9.603995093744523e-07, "loss": 0.0073, "reward": 1.589333176612854, "reward_std": 0.1167701929807663, "rewards/accuracy_reward_stage2": 0.589333176612854, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 227 }, { "completion_length": 9.78125, "epoch": 0.03995093744524268, "grad_norm": 20.26695770586334, "kl": 0.061279296875, "learning_rate": 9.602242859646048e-07, "loss": -0.0197, "reward": 1.712099552154541, "reward_std": 0.15879106521606445, "rewards/accuracy_reward_stage2": 0.727724552154541, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 228 }, { "completion_length": 10.984375, "epoch": 0.04012616085509024, "grad_norm": 21.453556659540432, "kl": 0.0291748046875, "learning_rate": 9.600490625547573e-07, "loss": -0.0278, "reward": 1.6276353597640991, "reward_std": 0.25666865706443787, "rewards/accuracy_reward_stage2": 0.6432603001594543, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 229 }, { "completion_length": 8.453125, "epoch": 0.040301384264937797, "grad_norm": 24.431506061785814, "kl": 0.193359375, "learning_rate": 9.598738391449097e-07, "loss": 0.0777, "reward": 1.531674861907959, "reward_std": 0.14275917410850525, "rewards/accuracy_reward_stage2": 0.6566749811172485, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 230 }, { "completion_length": 10.921875, "epoch": 0.04047660767478535, "grad_norm": 17.59455777114902, "kl": 0.12353515625, "learning_rate": 9.596986157350622e-07, "loss": 0.0052, "reward": 1.4566197395324707, "reward_std": 0.21315625309944153, "rewards/accuracy_reward_stage2": 0.47224482893943787, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 231 }, { "completion_length": 11.5625, "epoch": 0.040651831084632904, "grad_norm": 16.538614976828548, "kl": 0.0478515625, "learning_rate": 9.595233923252145e-07, "loss": 0.0192, "reward": 1.4099462032318115, "reward_std": 0.165542870759964, "rewards/accuracy_reward_stage2": 0.4099462628364563, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 232 }, { "completion_length": 19.921875, "epoch": 0.040827054494480465, "grad_norm": 15.825866121650826, "kl": 0.035888671875, "learning_rate": 9.59348168915367e-07, "loss": 0.0144, "reward": 1.3888517618179321, "reward_std": 0.10451022535562515, "rewards/accuracy_reward_stage2": 0.38885173201560974, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 233 }, { "completion_length": 8.59375, "epoch": 0.04100227790432802, "grad_norm": 29.579691766926054, "kl": 0.0294189453125, "learning_rate": 9.591729455055195e-07, "loss": 0.0118, "reward": 1.328125, "reward_std": 0.22673699259757996, "rewards/accuracy_reward_stage2": 0.453125, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 234 }, { "completion_length": 10.96875, "epoch": 0.04117750131417557, "grad_norm": 20.924805494763422, "kl": 0.0791015625, "learning_rate": 9.58997722095672e-07, "loss": 0.0316, "reward": 1.5717337131500244, "reward_std": 0.08275075256824493, "rewards/accuracy_reward_stage2": 0.5717335939407349, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 235 }, { "completion_length": 9.875, "epoch": 0.04135272472402313, "grad_norm": 23.14742741754885, "kl": 0.03076171875, "learning_rate": 9.588224986858245e-07, "loss": 0.0123, "reward": 1.7143429517745972, "reward_std": 0.18178583681583405, "rewards/accuracy_reward_stage2": 0.7143428921699524, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 236 }, { "completion_length": 10.59375, "epoch": 0.04152794813387069, "grad_norm": 16.969631438431694, "kl": 0.07666015625, "learning_rate": 9.586472752759768e-07, "loss": 0.0306, "reward": 1.559175729751587, "reward_std": 0.13105592131614685, "rewards/accuracy_reward_stage2": 0.5591757297515869, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 237 }, { "completion_length": 11.328125, "epoch": 0.04170317154371824, "grad_norm": 21.50047187718676, "kl": 0.0458984375, "learning_rate": 9.584720518661293e-07, "loss": 0.0183, "reward": 1.4090485572814941, "reward_std": 0.19534313678741455, "rewards/accuracy_reward_stage2": 0.4090486168861389, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 238 }, { "completion_length": 12.5625, "epoch": 0.041878394953565795, "grad_norm": 13.322354332909555, "kl": 0.023681640625, "learning_rate": 9.582968284562818e-07, "loss": 0.0095, "reward": 1.4014296531677246, "reward_std": 0.14201299846172333, "rewards/accuracy_reward_stage2": 0.4014296531677246, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 239 }, { "completion_length": 8.84375, "epoch": 0.04205361836341335, "grad_norm": 105.06945037302584, "kl": 0.14453125, "learning_rate": 9.58121605046434e-07, "loss": 0.0242, "reward": 1.6751351356506348, "reward_std": 0.24619035422801971, "rewards/accuracy_reward_stage2": 0.6907602548599243, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 240 }, { "completion_length": 8.578125, "epoch": 0.04222884177326091, "grad_norm": 25.160030176481186, "kl": 0.035888671875, "learning_rate": 9.579463816365865e-07, "loss": 0.0144, "reward": 1.5575356483459473, "reward_std": 0.26918134093284607, "rewards/accuracy_reward_stage2": 0.6825356483459473, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 241 }, { "completion_length": 10.609375, "epoch": 0.042404065183108464, "grad_norm": 20.556650961134515, "kl": 0.099609375, "learning_rate": 9.57771158226739e-07, "loss": 0.0398, "reward": 1.4830882549285889, "reward_std": 0.19809089601039886, "rewards/accuracy_reward_stage2": 0.48308834433555603, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 242 }, { "completion_length": 8.453125, "epoch": 0.04257928859295602, "grad_norm": 30.14382376308136, "kl": 0.201171875, "learning_rate": 9.575959348168915e-07, "loss": 0.0806, "reward": 1.6129651069641113, "reward_std": 0.27128833532333374, "rewards/accuracy_reward_stage2": 0.6129651665687561, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 243 }, { "completion_length": 11.328125, "epoch": 0.04275451200280357, "grad_norm": 324.7190701022341, "kl": 1.8359375, "learning_rate": 9.57420711407044e-07, "loss": 0.6912, "reward": 1.7415674924850464, "reward_std": 0.18711799383163452, "rewards/accuracy_reward_stage2": 0.8821924328804016, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 244 }, { "completion_length": 9.265625, "epoch": 0.04292973541265113, "grad_norm": 24.608966123142295, "kl": 0.2734375, "learning_rate": 9.572454879971965e-07, "loss": 0.1092, "reward": 1.4464468955993652, "reward_std": 0.3318367302417755, "rewards/accuracy_reward_stage2": 0.5714468955993652, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 245 }, { "completion_length": 8.546875, "epoch": 0.043104958822498686, "grad_norm": 19.36548616027605, "kl": 0.0284423828125, "learning_rate": 9.570702645873488e-07, "loss": 0.0114, "reward": 1.5153286457061768, "reward_std": 0.23770207166671753, "rewards/accuracy_reward_stage2": 0.515328586101532, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 246 }, { "completion_length": 14.15625, "epoch": 0.04328018223234624, "grad_norm": 22.541003255182787, "kl": 0.0223388671875, "learning_rate": 9.568950411775013e-07, "loss": 0.009, "reward": 1.5925946235656738, "reward_std": 0.2566668391227722, "rewards/accuracy_reward_stage2": 0.5925946235656738, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 247 }, { "completion_length": 10.71875, "epoch": 0.043455405642193794, "grad_norm": 22.651546161743862, "kl": 0.052734375, "learning_rate": 9.567198177676538e-07, "loss": 0.021, "reward": 1.6145833730697632, "reward_std": 0.21749193966388702, "rewards/accuracy_reward_stage2": 0.6145833730697632, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 248 }, { "completion_length": 5.984375, "epoch": 0.043630629052041354, "grad_norm": 25.60105051795061, "kl": 0.042724609375, "learning_rate": 9.565445943578063e-07, "loss": 0.0171, "reward": 1.5998629331588745, "reward_std": 0.181601881980896, "rewards/accuracy_reward_stage2": 0.5998629331588745, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 249 }, { "completion_length": 12.25, "epoch": 0.04380585246188891, "grad_norm": 26.37616634616468, "kl": 0.23046875, "learning_rate": 9.563693709479585e-07, "loss": 0.0926, "reward": 1.4369782209396362, "reward_std": 0.2544781565666199, "rewards/accuracy_reward_stage2": 0.5619782209396362, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 250 }, { "completion_length": 11.46875, "epoch": 0.04398107587173646, "grad_norm": 19.591330163004343, "kl": 0.05517578125, "learning_rate": 9.56194147538111e-07, "loss": 0.0221, "reward": 1.517921805381775, "reward_std": 0.14252689480781555, "rewards/accuracy_reward_stage2": 0.5179218053817749, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 251 }, { "completion_length": 13.09375, "epoch": 0.04415629928158402, "grad_norm": 15.21166450844923, "kl": 0.059814453125, "learning_rate": 9.560189241282635e-07, "loss": 0.0239, "reward": 1.4374809265136719, "reward_std": 0.12924231588840485, "rewards/accuracy_reward_stage2": 0.4374809265136719, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 252 }, { "completion_length": 9.578125, "epoch": 0.04433152269143158, "grad_norm": 15.65910478320339, "kl": 0.037841796875, "learning_rate": 9.55843700718416e-07, "loss": -0.0266, "reward": 1.6806310415267944, "reward_std": 0.13534963130950928, "rewards/accuracy_reward_stage2": 0.6962560415267944, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 253 }, { "completion_length": 12.265625, "epoch": 0.04450674610127913, "grad_norm": 13.699123733968174, "kl": 0.060791015625, "learning_rate": 9.556684773085683e-07, "loss": 0.0243, "reward": 1.563589096069336, "reward_std": 0.07810796797275543, "rewards/accuracy_reward_stage2": 0.5635892152786255, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 254 }, { "completion_length": 8.0625, "epoch": 0.044681969511126685, "grad_norm": 18.550385686776043, "kl": 0.0732421875, "learning_rate": 9.554932538987208e-07, "loss": 0.0293, "reward": 1.4762643575668335, "reward_std": 0.1577366590499878, "rewards/accuracy_reward_stage2": 0.4762643277645111, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 255 }, { "completion_length": 8.3125, "epoch": 0.044857192920974245, "grad_norm": 26.99970659587625, "kl": 0.0869140625, "learning_rate": 9.553180304888733e-07, "loss": -0.1223, "reward": 1.3463577032089233, "reward_std": 0.3971595764160156, "rewards/accuracy_reward_stage2": 0.40885767340660095, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 256 }, { "completion_length": 11.203125, "epoch": 0.0450324163308218, "grad_norm": 22.516253290469027, "kl": 0.039794921875, "learning_rate": 9.551428070790258e-07, "loss": 0.0159, "reward": 1.410148024559021, "reward_std": 0.1681969165802002, "rewards/accuracy_reward_stage2": 0.4101479947566986, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 257 }, { "completion_length": 7.203125, "epoch": 0.04520763974066935, "grad_norm": 19.288619552831214, "kl": 0.1025390625, "learning_rate": 9.549675836691783e-07, "loss": -0.0266, "reward": 1.5533853769302368, "reward_std": 0.14150744676589966, "rewards/accuracy_reward_stage2": 0.7096354365348816, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 258 }, { "completion_length": 6.078125, "epoch": 0.04538286315051691, "grad_norm": 15.499767988058947, "kl": 0.09619140625, "learning_rate": 9.547923602593305e-07, "loss": 0.002, "reward": 1.5709052085876465, "reward_std": 0.1841139942407608, "rewards/accuracy_reward_stage2": 0.5865301489830017, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 259 }, { "completion_length": 11.859375, "epoch": 0.04555808656036447, "grad_norm": 341.9481715113229, "kl": 0.58984375, "learning_rate": 9.54617136849483e-07, "loss": 0.1918, "reward": 1.475005865097046, "reward_std": 0.13841910660266876, "rewards/accuracy_reward_stage2": 0.4906309247016907, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 260 }, { "completion_length": 15.5, "epoch": 0.04573330997021202, "grad_norm": 18.643239096535567, "kl": 0.1357421875, "learning_rate": 9.544419134396355e-07, "loss": 0.0539, "reward": 1.3988699913024902, "reward_std": 0.15246807038784027, "rewards/accuracy_reward_stage2": 0.523870050907135, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 261 }, { "completion_length": 11.671875, "epoch": 0.045908533380059575, "grad_norm": 23.82966694525005, "kl": 0.0654296875, "learning_rate": 9.54266690029788e-07, "loss": 0.0262, "reward": 1.5144448280334473, "reward_std": 0.19817392528057098, "rewards/accuracy_reward_stage2": 0.5144447684288025, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 262 }, { "completion_length": 8.046875, "epoch": 0.04608375678990713, "grad_norm": 13.796449286878177, "kl": 0.08203125, "learning_rate": 9.540914666199403e-07, "loss": 0.0329, "reward": 1.663055419921875, "reward_std": 0.05650103837251663, "rewards/accuracy_reward_stage2": 0.663055419921875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 263 }, { "completion_length": 10.6875, "epoch": 0.04625898019975469, "grad_norm": 21.257988314253687, "kl": 0.0673828125, "learning_rate": 9.539162432100928e-07, "loss": 0.0269, "reward": 1.5275135040283203, "reward_std": 0.2669033110141754, "rewards/accuracy_reward_stage2": 0.6525135040283203, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 264 }, { "completion_length": 8.796875, "epoch": 0.046434203609602244, "grad_norm": 26.189523445618033, "kl": 0.0301513671875, "learning_rate": 9.537410198002453e-07, "loss": -0.0308, "reward": 1.8123893737792969, "reward_std": 0.16830343008041382, "rewards/accuracy_reward_stage2": 0.8280143737792969, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 265 }, { "completion_length": 6.578125, "epoch": 0.0466094270194498, "grad_norm": 17.083602252194122, "kl": 0.0172119140625, "learning_rate": 9.535657963903977e-07, "loss": 0.0069, "reward": 1.4092938899993896, "reward_std": 0.16730165481567383, "rewards/accuracy_reward_stage2": 0.4092938303947449, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 266 }, { "completion_length": 10.3125, "epoch": 0.04678465042929735, "grad_norm": 37.184142614310694, "kl": 0.028564453125, "learning_rate": 9.533905729805502e-07, "loss": -0.0328, "reward": 1.5248304605484009, "reward_std": 0.3038603663444519, "rewards/accuracy_reward_stage2": 0.5404554605484009, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 267 }, { "completion_length": 10.609375, "epoch": 0.04695987383914491, "grad_norm": 28.95806658900415, "kl": 0.25390625, "learning_rate": 9.532153495707026e-07, "loss": 0.0159, "reward": 1.2882633209228516, "reward_std": 0.23559540510177612, "rewards/accuracy_reward_stage2": 0.44451335072517395, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 268 }, { "completion_length": 11.484375, "epoch": 0.047135097248992466, "grad_norm": 26.488973390928386, "kl": 0.10888671875, "learning_rate": 9.53040126160855e-07, "loss": -0.0298, "reward": 1.598239541053772, "reward_std": 0.2586630582809448, "rewards/accuracy_reward_stage2": 0.6294894814491272, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 269 }, { "completion_length": 8.484375, "epoch": 0.04731032065884002, "grad_norm": 20.04576924598462, "kl": 0.072265625, "learning_rate": 9.528649027510075e-07, "loss": 0.029, "reward": 1.3794233798980713, "reward_std": 0.20582807064056396, "rewards/accuracy_reward_stage2": 0.5044234395027161, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 270 }, { "completion_length": 11.609375, "epoch": 0.047485544068687574, "grad_norm": 20.371874049564635, "kl": 0.08740234375, "learning_rate": 9.526896793411599e-07, "loss": 0.035, "reward": 1.446256399154663, "reward_std": 0.20240430533885956, "rewards/accuracy_reward_stage2": 0.5712563395500183, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 271 }, { "completion_length": 5.703125, "epoch": 0.047660767478535135, "grad_norm": 21.311882099524407, "kl": 0.0888671875, "learning_rate": 9.525144559313124e-07, "loss": 0.0356, "reward": 1.736009120941162, "reward_std": 0.17594566941261292, "rewards/accuracy_reward_stage2": 0.7360091209411621, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 272 }, { "completion_length": 9.15625, "epoch": 0.04783599088838269, "grad_norm": 29.67754488993763, "kl": 0.0179443359375, "learning_rate": 9.523392325214649e-07, "loss": 0.0072, "reward": 1.641929268836975, "reward_std": 0.17333316802978516, "rewards/accuracy_reward_stage2": 0.6419292688369751, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 273 }, { "completion_length": 9.21875, "epoch": 0.04801121429823024, "grad_norm": 16.305746612398746, "kl": 0.09521484375, "learning_rate": 9.521640091116173e-07, "loss": 0.0382, "reward": 1.406163215637207, "reward_std": 0.15640440583229065, "rewards/accuracy_reward_stage2": 0.531163215637207, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 274 }, { "completion_length": 18.671875, "epoch": 0.048186437708077796, "grad_norm": 19.104494178824076, "kl": 0.361328125, "learning_rate": 9.519887857017697e-07, "loss": 0.1008, "reward": 1.6140856742858887, "reward_std": 0.1861516237258911, "rewards/accuracy_reward_stage2": 0.7547106742858887, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 275 }, { "completion_length": 6.765625, "epoch": 0.04836166111792536, "grad_norm": 19.587912810004013, "kl": 0.061767578125, "learning_rate": 9.518135622919221e-07, "loss": 0.0247, "reward": 1.8026357889175415, "reward_std": 0.24802812933921814, "rewards/accuracy_reward_stage2": 0.8026357293128967, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 276 }, { "completion_length": 6.953125, "epoch": 0.04853688452777291, "grad_norm": 16.27617765583656, "kl": 0.055908203125, "learning_rate": 9.516383388820746e-07, "loss": -0.01, "reward": 1.5396757125854492, "reward_std": 0.21801412105560303, "rewards/accuracy_reward_stage2": 0.5553005933761597, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 277 }, { "completion_length": 12.9375, "epoch": 0.048712107937620465, "grad_norm": 15.60792995659862, "kl": 0.11328125, "learning_rate": 9.514631154722271e-07, "loss": 0.0011, "reward": 1.3102458715438843, "reward_std": 0.157129168510437, "rewards/accuracy_reward_stage2": 0.5758708715438843, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 278 }, { "completion_length": 5.65625, "epoch": 0.04888733134746802, "grad_norm": 21.31301576597148, "kl": 0.04931640625, "learning_rate": 9.512878920623794e-07, "loss": 0.0197, "reward": 1.7604167461395264, "reward_std": 0.257610946893692, "rewards/accuracy_reward_stage2": 0.7604166865348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 279 }, { "completion_length": 8.125, "epoch": 0.04906255475731558, "grad_norm": 17.03041073737609, "kl": 0.048095703125, "learning_rate": 9.511126686525319e-07, "loss": 0.0192, "reward": 1.6696319580078125, "reward_std": 0.06764136254787445, "rewards/accuracy_reward_stage2": 0.669631838798523, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 280 }, { "completion_length": 9.75, "epoch": 0.04923777816716313, "grad_norm": 22.55785616992177, "kl": 0.076171875, "learning_rate": 9.509374452426844e-07, "loss": 0.0304, "reward": 1.5457661151885986, "reward_std": 0.2752734422683716, "rewards/accuracy_reward_stage2": 0.6707661747932434, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 281 }, { "completion_length": 9.234375, "epoch": 0.04941300157701069, "grad_norm": 21.99612719765442, "kl": 0.07421875, "learning_rate": 9.507622218328368e-07, "loss": 0.0297, "reward": 1.654970407485962, "reward_std": 0.24402377009391785, "rewards/accuracy_reward_stage2": 0.6549703478813171, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 282 }, { "completion_length": 11.203125, "epoch": 0.04958822498685824, "grad_norm": 20.447129133637134, "kl": 0.0947265625, "learning_rate": 9.505869984229893e-07, "loss": 0.0379, "reward": 1.3550448417663574, "reward_std": 0.2543000876903534, "rewards/accuracy_reward_stage2": 0.4800449013710022, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 283 }, { "completion_length": 7.03125, "epoch": 0.0497634483967058, "grad_norm": 15.994507580460478, "kl": 0.0341796875, "learning_rate": 9.504117750131417e-07, "loss": 0.0137, "reward": 1.5851523876190186, "reward_std": 0.13426676392555237, "rewards/accuracy_reward_stage2": 0.5851523876190186, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 284 }, { "completion_length": 10.75, "epoch": 0.049938671806553356, "grad_norm": 16.657668769693082, "kl": 0.0703125, "learning_rate": 9.502365516032942e-07, "loss": 0.0281, "reward": 1.5128419399261475, "reward_std": 0.18941733241081238, "rewards/accuracy_reward_stage2": 0.5128419399261475, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 285 }, { "completion_length": 7.71875, "epoch": 0.05011389521640091, "grad_norm": 11.758117565473336, "kl": 0.044189453125, "learning_rate": 9.500613281934467e-07, "loss": 0.0176, "reward": 1.5194578170776367, "reward_std": 0.08947868645191193, "rewards/accuracy_reward_stage2": 0.5194578170776367, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 286 }, { "completion_length": 9.109375, "epoch": 0.050289118626248464, "grad_norm": 20.8562775866761, "kl": 0.0498046875, "learning_rate": 9.498861047835991e-07, "loss": 0.0199, "reward": 1.7933104038238525, "reward_std": 0.12623247504234314, "rewards/accuracy_reward_stage2": 0.793310284614563, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 287 }, { "completion_length": 6.546875, "epoch": 0.050464342036096024, "grad_norm": 18.39024977557091, "kl": 0.051025390625, "learning_rate": 9.497108813737515e-07, "loss": 0.0204, "reward": 1.5014456510543823, "reward_std": 0.20475083589553833, "rewards/accuracy_reward_stage2": 0.5014456510543823, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 288 }, { "completion_length": 15.171875, "epoch": 0.05063956544594358, "grad_norm": 18.14671972942763, "kl": 0.080078125, "learning_rate": 9.495356579639038e-07, "loss": 0.032, "reward": 1.4318616390228271, "reward_std": 0.1176854595541954, "rewards/accuracy_reward_stage2": 0.5568615794181824, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 289 }, { "completion_length": 13.765625, "epoch": 0.05081478885579113, "grad_norm": 20.710006975987632, "kl": 0.0537109375, "learning_rate": 9.493604345540563e-07, "loss": 0.0215, "reward": 1.6023633480072021, "reward_std": 0.2683194875717163, "rewards/accuracy_reward_stage2": 0.6023632884025574, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 290 }, { "completion_length": 8.984375, "epoch": 0.050990012265638686, "grad_norm": 49.22708752391607, "kl": 0.2353515625, "learning_rate": 9.491852111442088e-07, "loss": 0.0759, "reward": 1.523097038269043, "reward_std": 0.23898158967494965, "rewards/accuracy_reward_stage2": 0.5387219190597534, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 291 }, { "completion_length": 8.109375, "epoch": 0.05116523567548625, "grad_norm": 20.121543362765728, "kl": 0.057861328125, "learning_rate": 9.490099877343612e-07, "loss": 0.0232, "reward": 1.5405619144439697, "reward_std": 0.1676030457019806, "rewards/accuracy_reward_stage2": 0.6655619144439697, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 292 }, { "completion_length": 13.5625, "epoch": 0.0513404590853338, "grad_norm": 24.313452223867653, "kl": 0.054443359375, "learning_rate": 9.488347643245137e-07, "loss": -0.0331, "reward": 1.5306828022003174, "reward_std": 0.23595470190048218, "rewards/accuracy_reward_stage2": 0.6869328022003174, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 293 }, { "completion_length": 26.109375, "epoch": 0.051515682495181354, "grad_norm": 23.43936822314985, "kl": 0.0164794921875, "learning_rate": 9.486595409146662e-07, "loss": 0.0066, "reward": 1.4214849472045898, "reward_std": 0.15756914019584656, "rewards/accuracy_reward_stage2": 0.42148491740226746, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 294 }, { "completion_length": 12.265625, "epoch": 0.051690905905028915, "grad_norm": 22.691844775763478, "kl": 0.146484375, "learning_rate": 9.484843175048186e-07, "loss": 0.0378, "reward": 1.3974359035491943, "reward_std": 0.29218339920043945, "rewards/accuracy_reward_stage2": 0.5380609035491943, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 295 }, { "completion_length": 8.65625, "epoch": 0.05186612931487647, "grad_norm": 20.479831261545776, "kl": 0.031982421875, "learning_rate": 9.483090940949711e-07, "loss": -0.0613, "reward": 1.5576322078704834, "reward_std": 0.31971651315689087, "rewards/accuracy_reward_stage2": 0.5888821482658386, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 296 }, { "completion_length": 23.4375, "epoch": 0.05204135272472402, "grad_norm": 22.74094957435536, "kl": 0.0230712890625, "learning_rate": 9.481338706851235e-07, "loss": -0.035, "reward": 1.1994527578353882, "reward_std": 0.17752505838871002, "rewards/accuracy_reward_stage2": 0.4650777578353882, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 297 }, { "completion_length": 12.359375, "epoch": 0.05221657613457158, "grad_norm": 21.441989920997887, "kl": 0.04248046875, "learning_rate": 9.47958647275276e-07, "loss": 0.017, "reward": 1.5468885898590088, "reward_std": 0.1514553427696228, "rewards/accuracy_reward_stage2": 0.6718886494636536, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 298 }, { "completion_length": 12.609375, "epoch": 0.05239179954441914, "grad_norm": 4830.82411347626, "kl": 24.5, "learning_rate": 9.477834238654284e-07, "loss": 9.7416, "reward": 1.2803688049316406, "reward_std": 0.22106263041496277, "rewards/accuracy_reward_stage2": 0.5303688049316406, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 299 }, { "completion_length": 15.171875, "epoch": 0.05256702295426669, "grad_norm": 66.7580675171776, "kl": 0.609375, "learning_rate": 9.476082004555808e-07, "loss": 0.1995, "reward": 1.222217321395874, "reward_std": 0.054408349096775055, "rewards/accuracy_reward_stage2": 0.487842321395874, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 300 }, { "completion_length": 14.90625, "epoch": 0.052742246364114245, "grad_norm": 13.78642449839868, "kl": 0.0250244140625, "learning_rate": 9.474329770457332e-07, "loss": -0.0342, "reward": 1.2080720663070679, "reward_std": 0.16576477885246277, "rewards/accuracy_reward_stage2": 0.34869706630706787, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 301 }, { "completion_length": 15.796875, "epoch": 0.0529174697739618, "grad_norm": 17.396925885377456, "kl": 0.049072265625, "learning_rate": 9.472577536358857e-07, "loss": 0.0196, "reward": 1.5201051235198975, "reward_std": 0.056752100586891174, "rewards/accuracy_reward_stage2": 0.5201051235198975, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 302 }, { "completion_length": 12.078125, "epoch": 0.05309269318380936, "grad_norm": 24.39287992746118, "kl": 0.1259765625, "learning_rate": 9.470825302260381e-07, "loss": 0.0503, "reward": 1.6268336772918701, "reward_std": 0.25979822874069214, "rewards/accuracy_reward_stage2": 0.6268336772918701, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 303 }, { "completion_length": 8.984375, "epoch": 0.053267916593656914, "grad_norm": 26.97048476925272, "kl": 0.1552734375, "learning_rate": 9.469073068161906e-07, "loss": 0.062, "reward": 1.4804831743240356, "reward_std": 0.30746644735336304, "rewards/accuracy_reward_stage2": 0.6054832339286804, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 304 }, { "completion_length": 8.828125, "epoch": 0.05344314000350447, "grad_norm": 30.65849433100143, "kl": 0.1064453125, "learning_rate": 9.46732083406343e-07, "loss": 0.0426, "reward": 1.2948065996170044, "reward_std": 0.18823125958442688, "rewards/accuracy_reward_stage2": 0.5448065996170044, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 305 }, { "completion_length": 10.09375, "epoch": 0.05361836341335202, "grad_norm": 17.884322716284345, "kl": 0.026123046875, "learning_rate": 9.465568599964955e-07, "loss": -0.0088, "reward": 1.3229167461395264, "reward_std": 0.16480545699596405, "rewards/accuracy_reward_stage2": 0.3385416865348816, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 306 }, { "completion_length": 10.921875, "epoch": 0.05379358682319958, "grad_norm": 17.82060901728658, "kl": 0.3359375, "learning_rate": 9.46381636586648e-07, "loss": 0.1344, "reward": 1.2772254943847656, "reward_std": 0.1089072972536087, "rewards/accuracy_reward_stage2": 0.4022255539894104, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 307 }, { "completion_length": 8.46875, "epoch": 0.053968810233047136, "grad_norm": 16.952050026411197, "kl": 0.05029296875, "learning_rate": 9.462064131768004e-07, "loss": 0.02, "reward": 1.5841929912567139, "reward_std": 0.1349220871925354, "rewards/accuracy_reward_stage2": 0.5841929316520691, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 308 }, { "completion_length": 14.390625, "epoch": 0.05414403364289469, "grad_norm": 19.823545204054554, "kl": 0.67578125, "learning_rate": 9.460311897669528e-07, "loss": 0.2257, "reward": 1.2973132133483887, "reward_std": 0.160364031791687, "rewards/accuracy_reward_stage2": 0.5629382729530334, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 309 }, { "completion_length": 12.109375, "epoch": 0.054319257052742244, "grad_norm": 19.287387696238504, "kl": 0.0361328125, "learning_rate": 9.458559663571053e-07, "loss": -0.0218, "reward": 1.4892075061798096, "reward_std": 0.202066108584404, "rewards/accuracy_reward_stage2": 0.5048325061798096, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 310 }, { "completion_length": 19.59375, "epoch": 0.054494480462589805, "grad_norm": 23.88832240645103, "kl": 0.0458984375, "learning_rate": 9.456807429472577e-07, "loss": 0.0183, "reward": 1.44734525680542, "reward_std": 0.27426010370254517, "rewards/accuracy_reward_stage2": 0.4473453164100647, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 311 }, { "completion_length": 16.0625, "epoch": 0.05466970387243736, "grad_norm": 12.613548315395109, "kl": 0.039794921875, "learning_rate": 9.455055195374102e-07, "loss": 0.0159, "reward": 1.5471065044403076, "reward_std": 0.14003218710422516, "rewards/accuracy_reward_stage2": 0.5471064448356628, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 312 }, { "completion_length": 8.640625, "epoch": 0.05484492728228491, "grad_norm": 30.149557832680912, "kl": 0.07470703125, "learning_rate": 9.453302961275626e-07, "loss": 0.0297, "reward": 1.7912230491638184, "reward_std": 0.20346251130104065, "rewards/accuracy_reward_stage2": 0.7912230491638184, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 313 }, { "completion_length": 13.71875, "epoch": 0.055020150692132466, "grad_norm": 22.805893325483133, "kl": 0.72265625, "learning_rate": 9.45155072717715e-07, "loss": 0.2892, "reward": 1.4171767234802246, "reward_std": 0.16180284321308136, "rewards/accuracy_reward_stage2": 0.5421766638755798, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 314 }, { "completion_length": 11.3125, "epoch": 0.05519537410198003, "grad_norm": 31.939022622697205, "kl": 0.059814453125, "learning_rate": 9.449798493078675e-07, "loss": 0.024, "reward": 1.6428941488265991, "reward_std": 0.279352605342865, "rewards/accuracy_reward_stage2": 0.6428941488265991, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 315 }, { "completion_length": 15.921875, "epoch": 0.05537059751182758, "grad_norm": 31.633316769847468, "kl": 0.1171875, "learning_rate": 9.448046258980199e-07, "loss": 0.0469, "reward": 1.643958568572998, "reward_std": 0.3066912591457367, "rewards/accuracy_reward_stage2": 0.643958568572998, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 316 }, { "completion_length": 10.5625, "epoch": 0.055545820921675135, "grad_norm": 22.638846768381686, "kl": 0.030517578125, "learning_rate": 9.446294024881724e-07, "loss": 0.0122, "reward": 1.6205029487609863, "reward_std": 0.18559257686138153, "rewards/accuracy_reward_stage2": 0.6205028295516968, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 317 }, { "completion_length": 10.1875, "epoch": 0.05572104433152269, "grad_norm": 16.301947212185045, "kl": 0.0123291015625, "learning_rate": 9.444541790783249e-07, "loss": 0.0049, "reward": 1.2974097728729248, "reward_std": 0.1253841519355774, "rewards/accuracy_reward_stage2": 0.29740971326828003, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 318 }, { "completion_length": 8.4375, "epoch": 0.05589626774137025, "grad_norm": 16.27137921980481, "kl": 0.0771484375, "learning_rate": 9.442789556684772e-07, "loss": 0.0308, "reward": 1.131199836730957, "reward_std": 0.13367994129657745, "rewards/accuracy_reward_stage2": 0.13119982182979584, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 319 }, { "completion_length": 17.5625, "epoch": 0.0560714911512178, "grad_norm": 23.532132630398923, "kl": 0.064453125, "learning_rate": 9.441037322586297e-07, "loss": 0.0259, "reward": 1.3425979614257812, "reward_std": 0.18130367994308472, "rewards/accuracy_reward_stage2": 0.342598021030426, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 320 }, { "completion_length": 7.59375, "epoch": 0.05624671456106536, "grad_norm": 19.109109165832518, "kl": 0.0546875, "learning_rate": 9.439285088487821e-07, "loss": -0.0223, "reward": 1.7482370138168335, "reward_std": 0.240419402718544, "rewards/accuracy_reward_stage2": 0.7638620138168335, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 321 }, { "completion_length": 13.734375, "epoch": 0.05642193797091291, "grad_norm": 14.806289223451508, "kl": 0.04541015625, "learning_rate": 9.437532854389346e-07, "loss": 0.0182, "reward": 1.4349133968353271, "reward_std": 0.15795834362506866, "rewards/accuracy_reward_stage2": 0.5599132776260376, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 322 }, { "completion_length": 9.90625, "epoch": 0.05659716138076047, "grad_norm": 17.926016568620387, "kl": 0.059326171875, "learning_rate": 9.435780620290871e-07, "loss": 0.0238, "reward": 1.3682993650436401, "reward_std": 0.14938199520111084, "rewards/accuracy_reward_stage2": 0.36829936504364014, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 323 }, { "completion_length": 6.109375, "epoch": 0.056772384790608026, "grad_norm": 15.622725365840726, "kl": 0.0301513671875, "learning_rate": 9.434028386192395e-07, "loss": 0.0121, "reward": 1.6096426248550415, "reward_std": 0.09859603643417358, "rewards/accuracy_reward_stage2": 0.6096425652503967, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 324 }, { "completion_length": 22.296875, "epoch": 0.05694760820045558, "grad_norm": 19.94432851352937, "kl": 0.515625, "learning_rate": 9.43227615209392e-07, "loss": 0.207, "reward": 1.4566096067428589, "reward_std": 0.2075975239276886, "rewards/accuracy_reward_stage2": 0.5816096067428589, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 325 }, { "completion_length": 10.359375, "epoch": 0.05712283161030313, "grad_norm": 17.786476616869685, "kl": 0.0927734375, "learning_rate": 9.430523917995444e-07, "loss": -0.0072, "reward": 1.559728741645813, "reward_std": 0.19865679740905762, "rewards/accuracy_reward_stage2": 0.575353741645813, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 326 }, { "completion_length": 13.203125, "epoch": 0.057298055020150694, "grad_norm": 22.973658958195003, "kl": 0.06298828125, "learning_rate": 9.428771683896968e-07, "loss": -0.0036, "reward": 1.4841365814208984, "reward_std": 0.25597521662712097, "rewards/accuracy_reward_stage2": 0.4997614920139313, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 327 }, { "completion_length": 26.09375, "epoch": 0.05747327842999825, "grad_norm": 17.487533600335915, "kl": 0.61328125, "learning_rate": 9.427019449798493e-07, "loss": 0.2008, "reward": 1.189457654953003, "reward_std": 0.17859560251235962, "rewards/accuracy_reward_stage2": 0.3300827145576477, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 328 }, { "completion_length": 14.28125, "epoch": 0.0576485018398458, "grad_norm": 21.218654010279625, "kl": 0.609375, "learning_rate": 9.425267215700016e-07, "loss": 0.1987, "reward": 1.3703992366790771, "reward_std": 0.24765318632125854, "rewards/accuracy_reward_stage2": 0.6360243558883667, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 329 }, { "completion_length": 11.234375, "epoch": 0.057823725249693356, "grad_norm": 21.651742707988532, "kl": 0.031005859375, "learning_rate": 9.423514981601541e-07, "loss": 0.0124, "reward": 1.5708177089691162, "reward_std": 0.18964162468910217, "rewards/accuracy_reward_stage2": 0.5708176493644714, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 330 }, { "completion_length": 9.140625, "epoch": 0.057998948659540916, "grad_norm": 35.78706914854555, "kl": 0.050048828125, "learning_rate": 9.421762747503066e-07, "loss": 0.02, "reward": 1.3698612451553345, "reward_std": 0.20854762196540833, "rewards/accuracy_reward_stage2": 0.49486127495765686, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 331 }, { "completion_length": 10.171875, "epoch": 0.05817417206938847, "grad_norm": 17.065846478050958, "kl": 0.0625, "learning_rate": 9.42001051340459e-07, "loss": 0.025, "reward": 1.7728002071380615, "reward_std": 0.1718043088912964, "rewards/accuracy_reward_stage2": 0.772800087928772, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 332 }, { "completion_length": 7.828125, "epoch": 0.058349395479236024, "grad_norm": 21.44285617956672, "kl": 0.055908203125, "learning_rate": 9.418258279306115e-07, "loss": 0.0007, "reward": 1.5830440521240234, "reward_std": 0.10806939750909805, "rewards/accuracy_reward_stage2": 0.5986689925193787, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 333 }, { "completion_length": 9.5, "epoch": 0.058524618889083585, "grad_norm": 19.21932855206352, "kl": 0.09326171875, "learning_rate": 9.41650604520764e-07, "loss": -0.0019, "reward": 1.4759433269500732, "reward_std": 0.20894023776054382, "rewards/accuracy_reward_stage2": 0.49156832695007324, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 334 }, { "completion_length": 6.234375, "epoch": 0.05869984229893114, "grad_norm": 17.213037529404584, "kl": 0.0179443359375, "learning_rate": 9.414753811109164e-07, "loss": 0.0072, "reward": 1.8309895992279053, "reward_std": 0.12448026239871979, "rewards/accuracy_reward_stage2": 0.8309895992279053, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 335 }, { "completion_length": 10.0, "epoch": 0.05887506570877869, "grad_norm": 27.770533441967974, "kl": 0.072265625, "learning_rate": 9.413001577010689e-07, "loss": 0.0289, "reward": 1.7088299989700317, "reward_std": 0.23482096195220947, "rewards/accuracy_reward_stage2": 0.7088299989700317, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 336 }, { "completion_length": 12.90625, "epoch": 0.05905028911862625, "grad_norm": 18.69296153324102, "kl": 0.00616455078125, "learning_rate": 9.411249342912213e-07, "loss": 0.0025, "reward": 1.7130486965179443, "reward_std": 0.1294686496257782, "rewards/accuracy_reward_stage2": 0.7130487561225891, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 337 }, { "completion_length": 11.625, "epoch": 0.05922551252847381, "grad_norm": 21.091997815143415, "kl": 0.08642578125, "learning_rate": 9.409497108813738e-07, "loss": 0.0346, "reward": 1.5273933410644531, "reward_std": 0.21970880031585693, "rewards/accuracy_reward_stage2": 0.5273933410644531, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 338 }, { "completion_length": 13.03125, "epoch": 0.05940073593832136, "grad_norm": 22.056625658238307, "kl": 0.083984375, "learning_rate": 9.407744874715261e-07, "loss": 0.0335, "reward": 1.563867449760437, "reward_std": 0.1931018829345703, "rewards/accuracy_reward_stage2": 0.563867449760437, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 339 }, { "completion_length": 19.734375, "epoch": 0.059575959348168915, "grad_norm": 25.210516531222016, "kl": 0.283203125, "learning_rate": 9.405992640616785e-07, "loss": 0.1131, "reward": 1.47365403175354, "reward_std": 0.2418113648891449, "rewards/accuracy_reward_stage2": 0.59865403175354, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 340 }, { "completion_length": 7.671875, "epoch": 0.05975118275801647, "grad_norm": 17.408056064153456, "kl": 0.205078125, "learning_rate": 9.40424040651831e-07, "loss": 0.0379, "reward": 1.2369791269302368, "reward_std": 0.19781196117401123, "rewards/accuracy_reward_stage2": 0.3776041567325592, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 341 }, { "completion_length": 11.375, "epoch": 0.05992640616786403, "grad_norm": 26.883149428221756, "kl": 0.04345703125, "learning_rate": 9.402488172419835e-07, "loss": -0.0268, "reward": 1.6457767486572266, "reward_std": 0.25722378492355347, "rewards/accuracy_reward_stage2": 0.6614017486572266, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 342 }, { "completion_length": 7.5625, "epoch": 0.060101629577711584, "grad_norm": 11.1916374569426, "kl": 0.00958251953125, "learning_rate": 9.400735938321359e-07, "loss": 0.0038, "reward": 1.6783901453018188, "reward_std": 0.01817590743303299, "rewards/accuracy_reward_stage2": 0.6783901453018188, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 343 }, { "completion_length": 9.625, "epoch": 0.06027685298755914, "grad_norm": 26.840387014062035, "kl": 0.028076171875, "learning_rate": 9.398983704222884e-07, "loss": 0.0112, "reward": 1.472599983215332, "reward_std": 0.14326484501361847, "rewards/accuracy_reward_stage2": 0.47259995341300964, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 344 }, { "completion_length": 8.78125, "epoch": 0.06045207639740669, "grad_norm": 30.13359702730389, "kl": 0.0167236328125, "learning_rate": 9.397231470124408e-07, "loss": -0.0375, "reward": 1.7736797332763672, "reward_std": 0.2013120949268341, "rewards/accuracy_reward_stage2": 0.7893046736717224, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 345 }, { "completion_length": 11.734375, "epoch": 0.06062729980725425, "grad_norm": 30.760314151166217, "kl": 0.126953125, "learning_rate": 9.395479236025933e-07, "loss": 0.0507, "reward": 1.2022664546966553, "reward_std": 0.29217326641082764, "rewards/accuracy_reward_stage2": 0.4522664546966553, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 346 }, { "completion_length": 23.3125, "epoch": 0.060802523217101806, "grad_norm": 17.509360599599663, "kl": 0.04345703125, "learning_rate": 9.393727001927458e-07, "loss": 0.0174, "reward": 1.3837076425552368, "reward_std": 0.17910131812095642, "rewards/accuracy_reward_stage2": 0.3837076425552368, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 347 }, { "completion_length": 12.0, "epoch": 0.06097774662694936, "grad_norm": 24.40584268231279, "kl": 0.06298828125, "learning_rate": 9.391974767828981e-07, "loss": 0.0252, "reward": 1.7270774841308594, "reward_std": 0.21028944849967957, "rewards/accuracy_reward_stage2": 0.7270774841308594, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 348 }, { "completion_length": 14.609375, "epoch": 0.061152970036796914, "grad_norm": 22.108695434224188, "kl": 0.06103515625, "learning_rate": 9.390222533730506e-07, "loss": 0.0245, "reward": 1.4529306888580322, "reward_std": 0.16741645336151123, "rewards/accuracy_reward_stage2": 0.4529306888580322, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 349 }, { "completion_length": 9.171875, "epoch": 0.061328193446644474, "grad_norm": 17.749474904467053, "kl": 0.0181884765625, "learning_rate": 9.388470299632031e-07, "loss": -0.0369, "reward": 1.59375, "reward_std": 0.19727617502212524, "rewards/accuracy_reward_stage2": 0.609375, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 350 }, { "completion_length": 17.65625, "epoch": 0.06150341685649203, "grad_norm": 23.483625219058727, "kl": 0.04443359375, "learning_rate": 9.386718065533555e-07, "loss": -0.0655, "reward": 1.537459373474121, "reward_std": 0.2557547390460968, "rewards/accuracy_reward_stage2": 0.5843343734741211, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 351 }, { "completion_length": 15.296875, "epoch": 0.06167864026633958, "grad_norm": 159.74195424359493, "kl": 0.2001953125, "learning_rate": 9.384965831435079e-07, "loss": 0.0358, "reward": 1.5681722164154053, "reward_std": 0.21072784066200256, "rewards/accuracy_reward_stage2": 0.58379727602005, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 352 }, { "completion_length": 10.3125, "epoch": 0.061853863676187136, "grad_norm": 18.886814832728522, "kl": 0.07958984375, "learning_rate": 9.383213597336603e-07, "loss": 0.0103, "reward": 1.270545482635498, "reward_std": 0.19417250156402588, "rewards/accuracy_reward_stage2": 0.41117042303085327, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 353 }, { "completion_length": 9.359375, "epoch": 0.0620290870860347, "grad_norm": 18.37524567034427, "kl": 0.0299072265625, "learning_rate": 9.381461363238128e-07, "loss": 0.0119, "reward": 1.5955801010131836, "reward_std": 0.19248944520950317, "rewards/accuracy_reward_stage2": 0.7205801010131836, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 354 }, { "completion_length": 14.125, "epoch": 0.06220431049588225, "grad_norm": 20.849395826639736, "kl": 0.046630859375, "learning_rate": 9.379709129139653e-07, "loss": 0.0187, "reward": 1.4106394052505493, "reward_std": 0.1208919808268547, "rewards/accuracy_reward_stage2": 0.5356393456459045, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 355 }, { "completion_length": 10.203125, "epoch": 0.062379533905729805, "grad_norm": 19.576848945135378, "kl": 0.0257568359375, "learning_rate": 9.377956895041177e-07, "loss": 0.0103, "reward": 1.6396290063858032, "reward_std": 0.12200456112623215, "rewards/accuracy_reward_stage2": 0.6396290063858032, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 356 }, { "completion_length": 10.21875, "epoch": 0.06255475731557736, "grad_norm": 25.56816645699675, "kl": 0.041748046875, "learning_rate": 9.376204660942702e-07, "loss": 0.0167, "reward": 1.7575688362121582, "reward_std": 0.1697702407836914, "rewards/accuracy_reward_stage2": 0.757568895816803, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 357 }, { "completion_length": 9.375, "epoch": 0.06272998072542492, "grad_norm": 15.342675032171575, "kl": 0.0167236328125, "learning_rate": 9.374452426844227e-07, "loss": 0.0067, "reward": 1.46875, "reward_std": 0.24511480331420898, "rewards/accuracy_reward_stage2": 0.46875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 358 }, { "completion_length": 9.046875, "epoch": 0.06290520413527247, "grad_norm": 17.824968671605863, "kl": 0.0791015625, "learning_rate": 9.37270019274575e-07, "loss": 0.006, "reward": 1.485837697982788, "reward_std": 0.20334991812705994, "rewards/accuracy_reward_stage2": 0.5014628171920776, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 359 }, { "completion_length": 17.453125, "epoch": 0.06308042754512003, "grad_norm": 21233.154628289303, "kl": 744.0, "learning_rate": 9.370947958647275e-07, "loss": 298.3744, "reward": 1.3596529960632324, "reward_std": 0.16530917584896088, "rewards/accuracy_reward_stage2": 0.6252779364585876, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 360 }, { "completion_length": 16.34375, "epoch": 0.06325565095496759, "grad_norm": 23.684309225602988, "kl": 0.09716796875, "learning_rate": 9.369195724548799e-07, "loss": 0.0389, "reward": 1.4965417385101318, "reward_std": 0.23690475523471832, "rewards/accuracy_reward_stage2": 0.6215417981147766, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 361 }, { "completion_length": 8.265625, "epoch": 0.06343087436481513, "grad_norm": 19.810862008446907, "kl": 0.0830078125, "learning_rate": 9.367443490450324e-07, "loss": -0.0109, "reward": 1.4895833730697632, "reward_std": 0.3171301484107971, "rewards/accuracy_reward_stage2": 0.6302083134651184, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 362 }, { "completion_length": 10.140625, "epoch": 0.0636060977746627, "grad_norm": 18.18713950785703, "kl": 0.07275390625, "learning_rate": 9.365691256351849e-07, "loss": 0.029, "reward": 1.6478146314620972, "reward_std": 0.14585444331169128, "rewards/accuracy_reward_stage2": 0.6478146910667419, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 363 }, { "completion_length": 9.140625, "epoch": 0.06378132118451026, "grad_norm": 20.067637774027975, "kl": 0.049072265625, "learning_rate": 9.363939022253373e-07, "loss": -0.0214, "reward": 1.633749008178711, "reward_std": 0.17472119629383087, "rewards/accuracy_reward_stage2": 0.6493740081787109, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 364 }, { "completion_length": 5.875, "epoch": 0.0639565445943578, "grad_norm": 19.42945311409606, "kl": 0.052001953125, "learning_rate": 9.362186788154897e-07, "loss": 0.0209, "reward": 1.6205922365188599, "reward_std": 0.23425078392028809, "rewards/accuracy_reward_stage2": 0.6205922365188599, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 365 }, { "completion_length": 8.4375, "epoch": 0.06413176800420536, "grad_norm": 19.86294029189364, "kl": 0.0830078125, "learning_rate": 9.360434554056421e-07, "loss": -0.0057, "reward": 1.3854167461395264, "reward_std": 0.22538167238235474, "rewards/accuracy_reward_stage2": 0.4166666567325592, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 366 }, { "completion_length": 10.796875, "epoch": 0.06430699141405291, "grad_norm": 17.291194693608265, "kl": 0.06591796875, "learning_rate": 9.358682319957946e-07, "loss": -0.0179, "reward": 1.5046108961105347, "reward_std": 0.16259142756462097, "rewards/accuracy_reward_stage2": 0.5202358961105347, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 367 }, { "completion_length": 11.625, "epoch": 0.06448221482390047, "grad_norm": 19.514220261583535, "kl": 0.0230712890625, "learning_rate": 9.35693008585947e-07, "loss": 0.0092, "reward": 1.6788980960845947, "reward_std": 0.18751531839370728, "rewards/accuracy_reward_stage2": 0.67889803647995, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 368 }, { "completion_length": 10.71875, "epoch": 0.06465743823374803, "grad_norm": 18.037938987127585, "kl": 0.10791015625, "learning_rate": 9.355177851760994e-07, "loss": -0.0012, "reward": 1.3620235919952393, "reward_std": 0.16237865388393402, "rewards/accuracy_reward_stage2": 0.37764859199523926, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 369 }, { "completion_length": 9.265625, "epoch": 0.06483266164359558, "grad_norm": 19.647145326309186, "kl": 0.046875, "learning_rate": 9.353425617662519e-07, "loss": -0.0101, "reward": 1.6776671409606934, "reward_std": 0.31430885195732117, "rewards/accuracy_reward_stage2": 0.6932921409606934, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 370 }, { "completion_length": 7.234375, "epoch": 0.06500788505344314, "grad_norm": 19.609023289214672, "kl": 0.0194091796875, "learning_rate": 9.351673383564044e-07, "loss": 0.0078, "reward": 1.6241884231567383, "reward_std": 0.24334561824798584, "rewards/accuracy_reward_stage2": 0.6241884827613831, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 371 }, { "completion_length": 9.203125, "epoch": 0.0651831084632907, "grad_norm": 21.693746313134152, "kl": 0.0830078125, "learning_rate": 9.349921149465568e-07, "loss": 0.0332, "reward": 1.557944893836975, "reward_std": 0.14944539964199066, "rewards/accuracy_reward_stage2": 0.6829449534416199, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 372 }, { "completion_length": 7.5, "epoch": 0.06535833187313825, "grad_norm": 20.281466370040626, "kl": 0.08984375, "learning_rate": 9.348168915367093e-07, "loss": 0.036, "reward": 1.3900837898254395, "reward_std": 0.13699333369731903, "rewards/accuracy_reward_stage2": 0.39008378982543945, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 373 }, { "completion_length": 12.078125, "epoch": 0.06553355528298581, "grad_norm": 21.899711129212353, "kl": 0.326171875, "learning_rate": 9.346416681268617e-07, "loss": 0.1302, "reward": 1.506643533706665, "reward_std": 0.21940842270851135, "rewards/accuracy_reward_stage2": 0.6316434741020203, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 374 }, { "completion_length": 10.171875, "epoch": 0.06570877869283337, "grad_norm": 18.654545463446304, "kl": 0.078125, "learning_rate": 9.344664447170142e-07, "loss": 0.0312, "reward": 1.4036774635314941, "reward_std": 0.1636413335800171, "rewards/accuracy_reward_stage2": 0.40367743372917175, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 375 }, { "completion_length": 8.109375, "epoch": 0.06588400210268092, "grad_norm": 17.667479564188806, "kl": 0.03662109375, "learning_rate": 9.342912213071667e-07, "loss": 0.0146, "reward": 1.6772925853729248, "reward_std": 0.11799340695142746, "rewards/accuracy_reward_stage2": 0.6772925853729248, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 376 }, { "completion_length": 7.0, "epoch": 0.06605922551252848, "grad_norm": 22.152832316564922, "kl": 0.01324462890625, "learning_rate": 9.34115997897319e-07, "loss": 0.0053, "reward": 1.5811469554901123, "reward_std": 0.16719821095466614, "rewards/accuracy_reward_stage2": 0.5811468958854675, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 377 }, { "completion_length": 12.15625, "epoch": 0.06623444892237602, "grad_norm": 19.79514474324505, "kl": 0.05126953125, "learning_rate": 9.339407744874714e-07, "loss": -0.0115, "reward": 1.5288242101669312, "reward_std": 0.2464839220046997, "rewards/accuracy_reward_stage2": 0.5444492101669312, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 378 }, { "completion_length": 10.625, "epoch": 0.06640967233222358, "grad_norm": 15.80397044871378, "kl": 0.03076171875, "learning_rate": 9.337655510776239e-07, "loss": 0.0123, "reward": 1.7461693286895752, "reward_std": 0.1648191660642624, "rewards/accuracy_reward_stage2": 0.7461693286895752, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 379 }, { "completion_length": 15.578125, "epoch": 0.06658489574207115, "grad_norm": 39.80712653417323, "kl": 0.50390625, "learning_rate": 9.335903276677763e-07, "loss": 0.1679, "reward": 1.3081011772155762, "reward_std": 0.25190237164497375, "rewards/accuracy_reward_stage2": 0.46435117721557617, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 380 }, { "completion_length": 11.78125, "epoch": 0.06676011915191869, "grad_norm": 16.030393941102684, "kl": 0.5703125, "learning_rate": 9.334151042579288e-07, "loss": 0.227, "reward": 1.43631911277771, "reward_std": 0.10598289966583252, "rewards/accuracy_reward_stage2": 0.6863189935684204, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 381 }, { "completion_length": 13.875, "epoch": 0.06693534256176625, "grad_norm": 21.087588784066092, "kl": 0.099609375, "learning_rate": 9.332398808480812e-07, "loss": 0.04, "reward": 1.2481482028961182, "reward_std": 0.18241316080093384, "rewards/accuracy_reward_stage2": 0.24814806878566742, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 382 }, { "completion_length": 10.734375, "epoch": 0.06711056597161381, "grad_norm": 20.186397153368166, "kl": 0.037353515625, "learning_rate": 9.330646574382337e-07, "loss": 0.015, "reward": 1.5044660568237305, "reward_std": 0.1688692569732666, "rewards/accuracy_reward_stage2": 0.6294660568237305, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 383 }, { "completion_length": 9.90625, "epoch": 0.06728578938146136, "grad_norm": 25.32946244457715, "kl": 0.0341796875, "learning_rate": 9.328894340283862e-07, "loss": 0.0137, "reward": 1.4899933338165283, "reward_std": 0.23859579861164093, "rewards/accuracy_reward_stage2": 0.4899933338165283, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 384 }, { "completion_length": 13.328125, "epoch": 0.06746101279130892, "grad_norm": 26.81316686056031, "kl": 0.57421875, "learning_rate": 9.327142106185386e-07, "loss": 0.2292, "reward": 1.4508566856384277, "reward_std": 0.2757464051246643, "rewards/accuracy_reward_stage2": 0.5758566856384277, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 385 }, { "completion_length": 9.28125, "epoch": 0.06763623620115647, "grad_norm": 19.521819857840825, "kl": 0.048583984375, "learning_rate": 9.325389872086911e-07, "loss": -0.0711, "reward": 1.7960493564605713, "reward_std": 0.20955920219421387, "rewards/accuracy_reward_stage2": 0.8429244160652161, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 386 }, { "completion_length": 10.046875, "epoch": 0.06781145961100403, "grad_norm": 19.68601688056184, "kl": 0.03759765625, "learning_rate": 9.323637637988436e-07, "loss": 0.0151, "reward": 1.511404037475586, "reward_std": 0.18920361995697021, "rewards/accuracy_reward_stage2": 0.5114039778709412, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 387 }, { "completion_length": 10.078125, "epoch": 0.06798668302085159, "grad_norm": 27.240554115165228, "kl": 0.031982421875, "learning_rate": 9.321885403889959e-07, "loss": 0.0127, "reward": 1.5362706184387207, "reward_std": 0.20674368739128113, "rewards/accuracy_reward_stage2": 0.6612705588340759, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 388 }, { "completion_length": 15.53125, "epoch": 0.06816190643069914, "grad_norm": 31.545316926959348, "kl": 0.2109375, "learning_rate": 9.320133169791484e-07, "loss": 0.0843, "reward": 1.3991228342056274, "reward_std": 0.323274701833725, "rewards/accuracy_reward_stage2": 0.5241228342056274, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 389 }, { "completion_length": 8.34375, "epoch": 0.0683371298405467, "grad_norm": 16.876944193659174, "kl": 0.08642578125, "learning_rate": 9.318380935693007e-07, "loss": -0.0036, "reward": 1.538655400276184, "reward_std": 0.16559931635856628, "rewards/accuracy_reward_stage2": 0.5542804598808289, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 390 }, { "completion_length": 9.5625, "epoch": 0.06851235325039426, "grad_norm": 19.364008138324078, "kl": 0.052734375, "learning_rate": 9.316628701594532e-07, "loss": 0.0211, "reward": 1.417523741722107, "reward_std": 0.19266514480113983, "rewards/accuracy_reward_stage2": 0.41752374172210693, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 391 }, { "completion_length": 11.015625, "epoch": 0.0686875766602418, "grad_norm": 20.188722390380764, "kl": 0.0198974609375, "learning_rate": 9.314876467496057e-07, "loss": 0.008, "reward": 1.3770326375961304, "reward_std": 0.15552134811878204, "rewards/accuracy_reward_stage2": 0.37703263759613037, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 392 }, { "completion_length": 9.34375, "epoch": 0.06886280007008937, "grad_norm": 20.005146906885646, "kl": 0.044921875, "learning_rate": 9.313124233397581e-07, "loss": 0.018, "reward": 1.3810763359069824, "reward_std": 0.21949338912963867, "rewards/accuracy_reward_stage2": 0.3810763955116272, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 393 }, { "completion_length": 10.21875, "epoch": 0.06903802347993691, "grad_norm": 15.891615345327356, "kl": 0.051025390625, "learning_rate": 9.311371999299106e-07, "loss": 0.0204, "reward": 1.5088541507720947, "reward_std": 0.17497307062149048, "rewards/accuracy_reward_stage2": 0.5088541507720947, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 394 }, { "completion_length": 11.125, "epoch": 0.06921324688978447, "grad_norm": 23.512648748907708, "kl": 0.06640625, "learning_rate": 9.309619765200631e-07, "loss": 0.0266, "reward": 1.5210347175598145, "reward_std": 0.2370174527168274, "rewards/accuracy_reward_stage2": 0.6460347175598145, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 395 }, { "completion_length": 10.46875, "epoch": 0.06938847029963204, "grad_norm": 21.42132475023691, "kl": 0.04052734375, "learning_rate": 9.307867531102155e-07, "loss": -0.052, "reward": 1.5896108150482178, "reward_std": 0.26329123973846436, "rewards/accuracy_reward_stage2": 0.6208608150482178, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 396 }, { "completion_length": 17.03125, "epoch": 0.06956369370947958, "grad_norm": 14.95198588883508, "kl": 0.031494140625, "learning_rate": 9.30611529700368e-07, "loss": -0.0316, "reward": 1.5074900388717651, "reward_std": 0.06422868371009827, "rewards/accuracy_reward_stage2": 0.6481150984764099, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 397 }, { "completion_length": 9.703125, "epoch": 0.06973891711932714, "grad_norm": 20.44367679889482, "kl": 0.07958984375, "learning_rate": 9.304363062905203e-07, "loss": 0.0318, "reward": 1.5729174613952637, "reward_std": 0.09998870640993118, "rewards/accuracy_reward_stage2": 0.5729174613952637, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 398 }, { "completion_length": 24.828125, "epoch": 0.0699141405291747, "grad_norm": 19.893244845989614, "kl": 0.10009765625, "learning_rate": 9.302610828806728e-07, "loss": 0.0401, "reward": 1.135704517364502, "reward_std": 0.16893689334392548, "rewards/accuracy_reward_stage2": 0.26070448756217957, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 399 }, { "completion_length": 13.765625, "epoch": 0.07008936393902225, "grad_norm": 4169.748724851853, "kl": 21.125, "learning_rate": 9.300858594708253e-07, "loss": 8.4703, "reward": 1.3594677448272705, "reward_std": 0.1398237645626068, "rewards/accuracy_reward_stage2": 0.4844678044319153, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 400 }, { "completion_length": 9.5, "epoch": 0.07026458734886981, "grad_norm": 21.00576080891472, "kl": 0.08984375, "learning_rate": 9.299106360609777e-07, "loss": 0.0069, "reward": 1.8014570474624634, "reward_std": 0.22386255860328674, "rewards/accuracy_reward_stage2": 0.8170820474624634, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 401 }, { "completion_length": 10.234375, "epoch": 0.07043981075871736, "grad_norm": 14.49412707044264, "kl": 0.0576171875, "learning_rate": 9.297354126511302e-07, "loss": 0.0231, "reward": 1.4158527851104736, "reward_std": 0.09953659772872925, "rewards/accuracy_reward_stage2": 0.4158529043197632, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 402 }, { "completion_length": 10.25, "epoch": 0.07061503416856492, "grad_norm": 32.36561156113094, "kl": 0.10693359375, "learning_rate": 9.295601892412826e-07, "loss": 0.0139, "reward": 1.72810697555542, "reward_std": 0.2724772095680237, "rewards/accuracy_reward_stage2": 0.7437319159507751, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 403 }, { "completion_length": 18.609375, "epoch": 0.07079025757841248, "grad_norm": 24.080748226483227, "kl": 0.345703125, "learning_rate": 9.29384965831435e-07, "loss": 0.1378, "reward": 1.2732133865356445, "reward_std": 0.15048734843730927, "rewards/accuracy_reward_stage2": 0.39821332693099976, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 404 }, { "completion_length": 20.515625, "epoch": 0.07096548098826003, "grad_norm": 264.3265949278815, "kl": 1.984375, "learning_rate": 9.292097424215875e-07, "loss": 0.7961, "reward": 1.4931068420410156, "reward_std": 0.14453980326652527, "rewards/accuracy_reward_stage2": 0.6181067824363708, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 405 }, { "completion_length": 10.375, "epoch": 0.07114070439810759, "grad_norm": 16.95554663885095, "kl": 0.0277099609375, "learning_rate": 9.290345190117399e-07, "loss": 0.0111, "reward": 1.7291667461395264, "reward_std": 0.1907956451177597, "rewards/accuracy_reward_stage2": 0.7291666269302368, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 406 }, { "completion_length": 10.703125, "epoch": 0.07131592780795515, "grad_norm": 20.95046609185987, "kl": 0.0272216796875, "learning_rate": 9.288592956018924e-07, "loss": 0.0109, "reward": 1.4488990306854248, "reward_std": 0.25809937715530396, "rewards/accuracy_reward_stage2": 0.4488990306854248, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 407 }, { "completion_length": 10.15625, "epoch": 0.0714911512178027, "grad_norm": 18.718637839739095, "kl": 0.036376953125, "learning_rate": 9.286840721920448e-07, "loss": 0.0146, "reward": 1.5787631273269653, "reward_std": 0.16580891609191895, "rewards/accuracy_reward_stage2": 0.5787630677223206, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 408 }, { "completion_length": 7.21875, "epoch": 0.07166637462765026, "grad_norm": 21.22667824518068, "kl": 0.1162109375, "learning_rate": 9.285088487821972e-07, "loss": 0.0464, "reward": 1.5071234703063965, "reward_std": 0.18095630407333374, "rewards/accuracy_reward_stage2": 0.5071234703063965, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 409 }, { "completion_length": 10.390625, "epoch": 0.0718415980374978, "grad_norm": 23.411944773725754, "kl": 0.06787109375, "learning_rate": 9.283336253723497e-07, "loss": -0.0387, "reward": 1.7464570999145508, "reward_std": 0.20300233364105225, "rewards/accuracy_reward_stage2": 0.7777070999145508, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 410 }, { "completion_length": 12.8125, "epoch": 0.07201682144734536, "grad_norm": 26.486173925924497, "kl": 0.06103515625, "learning_rate": 9.281584019625022e-07, "loss": 0.0244, "reward": 1.3693530559539795, "reward_std": 0.24731206893920898, "rewards/accuracy_reward_stage2": 0.49435311555862427, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 411 }, { "completion_length": 14.84375, "epoch": 0.07219204485719292, "grad_norm": 25.114732676456597, "kl": 0.1298828125, "learning_rate": 9.279831785526546e-07, "loss": 0.0518, "reward": 1.5222240686416626, "reward_std": 0.22694925963878632, "rewards/accuracy_reward_stage2": 0.5222241282463074, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 412 }, { "completion_length": 13.5, "epoch": 0.07236726826704047, "grad_norm": 20.31226399381922, "kl": 0.01409912109375, "learning_rate": 9.278079551428071e-07, "loss": 0.0056, "reward": 1.2919033765792847, "reward_std": 0.1674969494342804, "rewards/accuracy_reward_stage2": 0.29190340638160706, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 413 }, { "completion_length": 8.125, "epoch": 0.07254249167688803, "grad_norm": 11.704296994412068, "kl": 0.04150390625, "learning_rate": 9.276327317329595e-07, "loss": 0.0166, "reward": 1.6002380847930908, "reward_std": 0.10007701814174652, "rewards/accuracy_reward_stage2": 0.6002380847930908, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 414 }, { "completion_length": 8.859375, "epoch": 0.0727177150867356, "grad_norm": 17.94825639927456, "kl": 0.10205078125, "learning_rate": 9.27457508323112e-07, "loss": 0.0408, "reward": 1.4635004997253418, "reward_std": 0.22981159389019012, "rewards/accuracy_reward_stage2": 0.4635005295276642, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 415 }, { "completion_length": 8.40625, "epoch": 0.07289293849658314, "grad_norm": 22.852074729973108, "kl": 0.03271484375, "learning_rate": 9.272822849132644e-07, "loss": -0.0311, "reward": 1.9208898544311523, "reward_std": 0.15941961109638214, "rewards/accuracy_reward_stage2": 0.9365148544311523, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 416 }, { "completion_length": 20.765625, "epoch": 0.0730681619064307, "grad_norm": 15.219734361543404, "kl": 0.0341796875, "learning_rate": 9.271070615034167e-07, "loss": 0.0137, "reward": 1.288794994354248, "reward_std": 0.07320894300937653, "rewards/accuracy_reward_stage2": 0.41379502415657043, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 417 }, { "completion_length": 9.953125, "epoch": 0.07324338531627826, "grad_norm": 18.93618282416882, "kl": 0.10009765625, "learning_rate": 9.269318380935692e-07, "loss": -0.0041, "reward": 1.4674339294433594, "reward_std": 0.21226957440376282, "rewards/accuracy_reward_stage2": 0.483058899641037, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 418 }, { "completion_length": 12.03125, "epoch": 0.07341860872612581, "grad_norm": 16.897489413117984, "kl": 0.0244140625, "learning_rate": 9.267566146837217e-07, "loss": 0.0098, "reward": 1.4678539037704468, "reward_std": 0.16594135761260986, "rewards/accuracy_reward_stage2": 0.4678539037704468, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 419 }, { "completion_length": 15.53125, "epoch": 0.07359383213597337, "grad_norm": 24.996218069156303, "kl": 0.37890625, "learning_rate": 9.265813912738741e-07, "loss": 0.1515, "reward": 1.382279634475708, "reward_std": 0.15624842047691345, "rewards/accuracy_reward_stage2": 0.5072795748710632, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 420 }, { "completion_length": 9.75, "epoch": 0.07376905554582092, "grad_norm": 20.6302936890133, "kl": 0.057373046875, "learning_rate": 9.264061678640266e-07, "loss": 0.023, "reward": 1.6946684122085571, "reward_std": 0.241998553276062, "rewards/accuracy_reward_stage2": 0.6946684718132019, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 421 }, { "completion_length": 13.046875, "epoch": 0.07394427895566848, "grad_norm": 18.246674428454657, "kl": 0.09765625, "learning_rate": 9.26230944454179e-07, "loss": 0.01, "reward": 1.5222277641296387, "reward_std": 0.2169165462255478, "rewards/accuracy_reward_stage2": 0.5378527641296387, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 422 }, { "completion_length": 10.734375, "epoch": 0.07411950236551604, "grad_norm": 19.198742373526642, "kl": 0.65625, "learning_rate": 9.260557210443315e-07, "loss": 0.2619, "reward": 1.5240931510925293, "reward_std": 0.22133302688598633, "rewards/accuracy_reward_stage2": 0.7740931510925293, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 423 }, { "completion_length": 12.75, "epoch": 0.07429472577536358, "grad_norm": 29.79110571390762, "kl": 0.10302734375, "learning_rate": 9.25880497634484e-07, "loss": -0.003, "reward": 1.3967804908752441, "reward_std": 0.30737680196762085, "rewards/accuracy_reward_stage2": 0.4124056100845337, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 424 }, { "completion_length": 14.046875, "epoch": 0.07446994918521115, "grad_norm": 18.602900239899586, "kl": 0.59765625, "learning_rate": 9.257052742246364e-07, "loss": 0.1944, "reward": 1.226088285446167, "reward_std": 0.21388447284698486, "rewards/accuracy_reward_stage2": 0.36671334505081177, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 425 }, { "completion_length": 5.5625, "epoch": 0.0746451725950587, "grad_norm": 23.651939037577094, "kl": 0.03759765625, "learning_rate": 9.255300508147889e-07, "loss": 0.015, "reward": 1.5, "reward_std": 0.1293872892856598, "rewards/accuracy_reward_stage2": 0.5, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 426 }, { "completion_length": 11.671875, "epoch": 0.07482039600490625, "grad_norm": 24.861135751373336, "kl": 0.087890625, "learning_rate": 9.253548274049414e-07, "loss": 0.0036, "reward": 1.570847511291504, "reward_std": 0.3195498585700989, "rewards/accuracy_reward_stage2": 0.5864725112915039, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 427 }, { "completion_length": 9.78125, "epoch": 0.07499561941475381, "grad_norm": 20.03380077267981, "kl": 0.115234375, "learning_rate": 9.251796039950936e-07, "loss": 0.0461, "reward": 1.5266039371490479, "reward_std": 0.1536373347043991, "rewards/accuracy_reward_stage2": 0.6516038179397583, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 428 }, { "completion_length": 12.78125, "epoch": 0.07517084282460136, "grad_norm": 65.42975776154991, "kl": 0.6875, "learning_rate": 9.250043805852461e-07, "loss": 0.2751, "reward": 1.4417392015457153, "reward_std": 0.29339122772216797, "rewards/accuracy_reward_stage2": 0.6917392015457153, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 429 }, { "completion_length": 9.421875, "epoch": 0.07534606623444892, "grad_norm": 20.85029400969406, "kl": 0.0250244140625, "learning_rate": 9.248291571753985e-07, "loss": 0.01, "reward": 1.3675525188446045, "reward_std": 0.1472133994102478, "rewards/accuracy_reward_stage2": 0.3675525486469269, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 430 }, { "completion_length": 9.3125, "epoch": 0.07552128964429648, "grad_norm": 17.12020230180985, "kl": 0.0150146484375, "learning_rate": 9.24653933765551e-07, "loss": 0.006, "reward": 1.7181713581085205, "reward_std": 0.13219161331653595, "rewards/accuracy_reward_stage2": 0.7181712985038757, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 431 }, { "completion_length": 9.453125, "epoch": 0.07569651305414403, "grad_norm": 19.27576364331227, "kl": 0.0634765625, "learning_rate": 9.244787103557035e-07, "loss": 0.0254, "reward": 1.5303231477737427, "reward_std": 0.11499994993209839, "rewards/accuracy_reward_stage2": 0.5303231477737427, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 432 }, { "completion_length": 12.71875, "epoch": 0.07587173646399159, "grad_norm": 12.586077511720019, "kl": 0.0157470703125, "learning_rate": 9.243034869458559e-07, "loss": 0.0063, "reward": 1.4600911140441895, "reward_std": 0.0769738256931305, "rewards/accuracy_reward_stage2": 0.4600910544395447, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 433 }, { "completion_length": 14.078125, "epoch": 0.07604695987383915, "grad_norm": 18.27881518277824, "kl": 0.126953125, "learning_rate": 9.241282635360084e-07, "loss": 0.0507, "reward": 1.414088487625122, "reward_std": 0.14201867580413818, "rewards/accuracy_reward_stage2": 0.5390884280204773, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 434 }, { "completion_length": 14.734375, "epoch": 0.0762221832836867, "grad_norm": 22.462968714576906, "kl": 0.1640625, "learning_rate": 9.239530401261609e-07, "loss": 0.0218, "reward": 1.7298152446746826, "reward_std": 0.18984611332416534, "rewards/accuracy_reward_stage2": 0.7454402446746826, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 435 }, { "completion_length": 26.984375, "epoch": 0.07639740669353426, "grad_norm": 37.120742747022334, "kl": 0.5625, "learning_rate": 9.237778167163133e-07, "loss": 0.2252, "reward": 1.5777366161346436, "reward_std": 0.23983044922351837, "rewards/accuracy_reward_stage2": 0.7027365565299988, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 436 }, { "completion_length": 13.609375, "epoch": 0.0765726301033818, "grad_norm": 21.32126027829951, "kl": 0.291015625, "learning_rate": 9.236025933064658e-07, "loss": 0.0866, "reward": 1.5508217811584473, "reward_std": 0.11196690797805786, "rewards/accuracy_reward_stage2": 0.691446840763092, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 437 }, { "completion_length": 13.125, "epoch": 0.07674785351322937, "grad_norm": 21.206277419580225, "kl": 0.0830078125, "learning_rate": 9.234273698966181e-07, "loss": -0.0022, "reward": 1.3115644454956055, "reward_std": 0.2376328855752945, "rewards/accuracy_reward_stage2": 0.45218944549560547, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 438 }, { "completion_length": 15.359375, "epoch": 0.07692307692307693, "grad_norm": 26.441305499162887, "kl": 0.69921875, "learning_rate": 9.232521464867706e-07, "loss": 0.2789, "reward": 1.5329861640930176, "reward_std": 0.2549724876880646, "rewards/accuracy_reward_stage2": 0.6579861044883728, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 439 }, { "completion_length": 7.375, "epoch": 0.07709830033292447, "grad_norm": 14.831592080269788, "kl": 0.0184326171875, "learning_rate": 9.230769230769231e-07, "loss": 0.0074, "reward": 1.5251660346984863, "reward_std": 0.14086659252643585, "rewards/accuracy_reward_stage2": 0.5251659154891968, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 440 }, { "completion_length": 10.125, "epoch": 0.07727352374277204, "grad_norm": 18.685402304111808, "kl": 0.078125, "learning_rate": 9.229016996670754e-07, "loss": 0.0314, "reward": 1.652919054031372, "reward_std": 0.17189809679985046, "rewards/accuracy_reward_stage2": 0.6529191136360168, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 441 }, { "completion_length": 9.203125, "epoch": 0.0774487471526196, "grad_norm": 26.496965759362393, "kl": 0.01141357421875, "learning_rate": 9.227264762572279e-07, "loss": 0.0046, "reward": 1.6230113506317139, "reward_std": 0.20974057912826538, "rewards/accuracy_reward_stage2": 0.6230113506317139, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 442 }, { "completion_length": 7.078125, "epoch": 0.07762397056246714, "grad_norm": 17.947336239229962, "kl": 0.01226806640625, "learning_rate": 9.225512528473803e-07, "loss": 0.0049, "reward": 1.6121759414672852, "reward_std": 0.21027937531471252, "rewards/accuracy_reward_stage2": 0.6121759414672852, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 443 }, { "completion_length": 10.046875, "epoch": 0.0777991939723147, "grad_norm": 20.823151026736113, "kl": 0.0206298828125, "learning_rate": 9.223760294375328e-07, "loss": -0.0359, "reward": 1.5672154426574707, "reward_std": 0.18561364710330963, "rewards/accuracy_reward_stage2": 0.5828403234481812, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 444 }, { "completion_length": 18.609375, "epoch": 0.07797441738216225, "grad_norm": 20.451095462177015, "kl": 0.08203125, "learning_rate": 9.222008060276853e-07, "loss": 0.0329, "reward": 1.4407696723937988, "reward_std": 0.23721659183502197, "rewards/accuracy_reward_stage2": 0.44076964259147644, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 445 }, { "completion_length": 9.75, "epoch": 0.07814964079200981, "grad_norm": 16.20511513569212, "kl": 0.0439453125, "learning_rate": 9.220255826178377e-07, "loss": 0.0176, "reward": 1.6019539833068848, "reward_std": 0.21476775407791138, "rewards/accuracy_reward_stage2": 0.6019538640975952, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 446 }, { "completion_length": 8.578125, "epoch": 0.07832486420185737, "grad_norm": 21.53926182512, "kl": 0.061279296875, "learning_rate": 9.218503592079901e-07, "loss": -0.0197, "reward": 1.5372408628463745, "reward_std": 0.29406264424324036, "rewards/accuracy_reward_stage2": 0.5528658628463745, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 447 }, { "completion_length": 8.3125, "epoch": 0.07850008761170492, "grad_norm": 20.87221627271646, "kl": 0.041259765625, "learning_rate": 9.216751357981426e-07, "loss": -0.0151, "reward": 1.622206449508667, "reward_std": 0.32090505957603455, "rewards/accuracy_reward_stage2": 0.637831449508667, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 448 }, { "completion_length": 9.15625, "epoch": 0.07867531102155248, "grad_norm": 16.58816774301801, "kl": 0.041748046875, "learning_rate": 9.21499912388295e-07, "loss": 0.0166, "reward": 1.6170015335083008, "reward_std": 0.19486872851848602, "rewards/accuracy_reward_stage2": 0.617001473903656, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 449 }, { "completion_length": 12.109375, "epoch": 0.07885053443140004, "grad_norm": 17.47196711067821, "kl": 0.5703125, "learning_rate": 9.213246889784475e-07, "loss": 0.2271, "reward": 1.5230088233947754, "reward_std": 0.13385896384716034, "rewards/accuracy_reward_stage2": 0.6480089426040649, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 450 }, { "completion_length": 8.046875, "epoch": 0.07902575784124759, "grad_norm": 14.006565039229681, "kl": 0.041748046875, "learning_rate": 9.211494655685999e-07, "loss": -0.0167, "reward": 1.8111279010772705, "reward_std": 0.2181154489517212, "rewards/accuracy_reward_stage2": 0.8267529010772705, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 451 }, { "completion_length": 10.4375, "epoch": 0.07920098125109515, "grad_norm": 13.338040288753435, "kl": 0.04833984375, "learning_rate": 9.209742421587524e-07, "loss": 0.0194, "reward": 1.522031307220459, "reward_std": 0.11794281750917435, "rewards/accuracy_reward_stage2": 0.5220313668251038, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 452 }, { "completion_length": 14.875, "epoch": 0.0793762046609427, "grad_norm": 18.92124589768796, "kl": 0.4453125, "learning_rate": 9.207990187489049e-07, "loss": 0.1449, "reward": 1.508528232574463, "reward_std": 0.17721973359584808, "rewards/accuracy_reward_stage2": 0.6491532325744629, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 453 }, { "completion_length": 7.0625, "epoch": 0.07955142807079026, "grad_norm": 11.358183463925144, "kl": 0.028564453125, "learning_rate": 9.206237953390572e-07, "loss": 0.0114, "reward": 1.5925219058990479, "reward_std": 0.06521537899971008, "rewards/accuracy_reward_stage2": 0.5925219058990479, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 454 }, { "completion_length": 13.53125, "epoch": 0.07972665148063782, "grad_norm": 19.019904562195457, "kl": 0.08203125, "learning_rate": 9.204485719292097e-07, "loss": -0.011, "reward": 1.2978602647781372, "reward_std": 0.13462838530540466, "rewards/accuracy_reward_stage2": 0.3134852647781372, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 455 }, { "completion_length": 11.421875, "epoch": 0.07990187489048536, "grad_norm": 23.626297967066854, "kl": 0.1171875, "learning_rate": 9.202733485193622e-07, "loss": -0.0206, "reward": 1.5690983533859253, "reward_std": 0.20688967406749725, "rewards/accuracy_reward_stage2": 0.6003483533859253, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 456 }, { "completion_length": 12.046875, "epoch": 0.08007709830033292, "grad_norm": 22.91529648896093, "kl": 0.47265625, "learning_rate": 9.200981251095145e-07, "loss": 0.1892, "reward": 1.5967607498168945, "reward_std": 0.2147434800863266, "rewards/accuracy_reward_stage2": 0.7217606902122498, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 457 }, { "completion_length": 13.40625, "epoch": 0.08025232171018049, "grad_norm": 23.42855017991445, "kl": 0.09228515625, "learning_rate": 9.19922901699667e-07, "loss": 0.0368, "reward": 1.515872836112976, "reward_std": 0.1746286153793335, "rewards/accuracy_reward_stage2": 0.5158728361129761, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 458 }, { "completion_length": 8.53125, "epoch": 0.08042754512002803, "grad_norm": 19.36224437018435, "kl": 0.07177734375, "learning_rate": 9.197476782898194e-07, "loss": 0.0287, "reward": 1.6671662330627441, "reward_std": 0.20749810338020325, "rewards/accuracy_reward_stage2": 0.6671661734580994, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 459 }, { "completion_length": 9.484375, "epoch": 0.08060276852987559, "grad_norm": 18.931505006258217, "kl": 0.037109375, "learning_rate": 9.195724548799719e-07, "loss": 0.0148, "reward": 1.524126648902893, "reward_std": 0.16327914595603943, "rewards/accuracy_reward_stage2": 0.5241267085075378, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 460 }, { "completion_length": 8.859375, "epoch": 0.08077799193972315, "grad_norm": 21.76679733035785, "kl": 0.072265625, "learning_rate": 9.193972314701244e-07, "loss": 0.0289, "reward": 1.5988078117370605, "reward_std": 0.2698323130607605, "rewards/accuracy_reward_stage2": 0.5988078713417053, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 461 }, { "completion_length": 12.75, "epoch": 0.0809532153495707, "grad_norm": 259.33214865484473, "kl": 0.5390625, "learning_rate": 9.192220080602768e-07, "loss": 0.1409, "reward": 1.3012222051620483, "reward_std": 0.2742873728275299, "rewards/accuracy_reward_stage2": 0.4574722647666931, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 462 }, { "completion_length": 10.4375, "epoch": 0.08112843875941826, "grad_norm": 15.7184925250878, "kl": 0.052978515625, "learning_rate": 9.190467846504293e-07, "loss": 0.0212, "reward": 1.3793818950653076, "reward_std": 0.14053119719028473, "rewards/accuracy_reward_stage2": 0.5043818950653076, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 463 }, { "completion_length": 8.359375, "epoch": 0.08130366216926581, "grad_norm": 18.581197943613866, "kl": 0.032470703125, "learning_rate": 9.188715612405818e-07, "loss": 0.013, "reward": 1.5649161338806152, "reward_std": 0.12751111388206482, "rewards/accuracy_reward_stage2": 0.5649161338806152, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 464 }, { "completion_length": 9.296875, "epoch": 0.08147888557911337, "grad_norm": 13.875201754235798, "kl": 0.09765625, "learning_rate": 9.186963378307342e-07, "loss": 0.039, "reward": 1.5525639057159424, "reward_std": 0.12691347301006317, "rewards/accuracy_reward_stage2": 0.5525639653205872, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 465 }, { "completion_length": 14.25, "epoch": 0.08165410898896093, "grad_norm": 18.189817374897615, "kl": 0.0244140625, "learning_rate": 9.185211144208866e-07, "loss": 0.0098, "reward": 1.468153476715088, "reward_std": 0.0900755375623703, "rewards/accuracy_reward_stage2": 0.5931534767150879, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 466 }, { "completion_length": 23.375, "epoch": 0.08182933239880848, "grad_norm": 26.444868554171173, "kl": 0.05419921875, "learning_rate": 9.183458910110389e-07, "loss": 0.0217, "reward": 1.593287467956543, "reward_std": 0.23012332618236542, "rewards/accuracy_reward_stage2": 0.5932875871658325, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 467 }, { "completion_length": 27.75, "epoch": 0.08200455580865604, "grad_norm": 8966.745368887247, "kl": 55.75, "learning_rate": 9.181706676011914e-07, "loss": 22.3474, "reward": 1.203883171081543, "reward_std": 0.20250242948532104, "rewards/accuracy_reward_stage2": 0.32888320088386536, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 468 }, { "completion_length": 9.78125, "epoch": 0.0821797792185036, "grad_norm": 11.436562594290804, "kl": 0.0133056640625, "learning_rate": 9.179954441913439e-07, "loss": 0.0053, "reward": 1.7812447547912598, "reward_std": 0.04915858805179596, "rewards/accuracy_reward_stage2": 0.9062446355819702, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 469 }, { "completion_length": 10.078125, "epoch": 0.08235500262835115, "grad_norm": 21.126468460943666, "kl": 0.06103515625, "learning_rate": 9.178202207814963e-07, "loss": -0.0045, "reward": 1.70155930519104, "reward_std": 0.2857385575771332, "rewards/accuracy_reward_stage2": 0.7171843647956848, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 470 }, { "completion_length": 14.890625, "epoch": 0.0825302260381987, "grad_norm": 19.202675304680323, "kl": 0.07958984375, "learning_rate": 9.176449973716488e-07, "loss": 0.0027, "reward": 1.693793773651123, "reward_std": 0.1584872305393219, "rewards/accuracy_reward_stage2": 0.709418773651123, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 471 }, { "completion_length": 9.6875, "epoch": 0.08270544944804625, "grad_norm": 16.471679971821743, "kl": 0.057861328125, "learning_rate": 9.174697739618013e-07, "loss": 0.0231, "reward": 1.8260822296142578, "reward_std": 0.0860002413392067, "rewards/accuracy_reward_stage2": 0.8260822892189026, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 472 }, { "completion_length": 11.15625, "epoch": 0.08288067285789381, "grad_norm": 23.791810653654068, "kl": 0.287109375, "learning_rate": 9.172945505519537e-07, "loss": 0.1152, "reward": 1.402888536453247, "reward_std": 0.2622658908367157, "rewards/accuracy_reward_stage2": 0.5278885364532471, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 473 }, { "completion_length": 17.8125, "epoch": 0.08305589626774137, "grad_norm": 25.791584456588865, "kl": 0.115234375, "learning_rate": 9.171193271421062e-07, "loss": 0.046, "reward": 1.5935009717941284, "reward_std": 0.1915404051542282, "rewards/accuracy_reward_stage2": 0.5935010313987732, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 474 }, { "completion_length": 10.90625, "epoch": 0.08323111967758892, "grad_norm": 24.400165974590752, "kl": 0.09716796875, "learning_rate": 9.169441037322586e-07, "loss": 0.0389, "reward": 1.6456031799316406, "reward_std": 0.29340386390686035, "rewards/accuracy_reward_stage2": 0.6456031799316406, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 475 }, { "completion_length": 11.03125, "epoch": 0.08340634308743648, "grad_norm": 22.10942774313132, "kl": 0.08544921875, "learning_rate": 9.167688803224111e-07, "loss": 0.0342, "reward": 1.499727487564087, "reward_std": 0.16826727986335754, "rewards/accuracy_reward_stage2": 0.49972739815711975, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 476 }, { "completion_length": 16.078125, "epoch": 0.08358156649728404, "grad_norm": 25.856498433525953, "kl": 0.322265625, "learning_rate": 9.165936569125636e-07, "loss": 0.129, "reward": 1.40625, "reward_std": 0.1293872892856598, "rewards/accuracy_reward_stage2": 0.53125, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 477 }, { "completion_length": 11.671875, "epoch": 0.08375678990713159, "grad_norm": 22.729798625639337, "kl": 0.07177734375, "learning_rate": 9.164184335027159e-07, "loss": -0.0026, "reward": 1.5009512901306152, "reward_std": 0.3437076210975647, "rewards/accuracy_reward_stage2": 0.51657634973526, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 478 }, { "completion_length": 21.171875, "epoch": 0.08393201331697915, "grad_norm": 118.09404479510592, "kl": 1.09375, "learning_rate": 9.162432100928683e-07, "loss": 0.4047, "reward": 1.2600040435791016, "reward_std": 0.1994476616382599, "rewards/accuracy_reward_stage2": 0.525628924369812, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 479 }, { "completion_length": 13.625, "epoch": 0.0841072367268267, "grad_norm": 22.19379717741815, "kl": 0.06103515625, "learning_rate": 9.160679866830208e-07, "loss": -0.0089, "reward": 1.350834846496582, "reward_std": 0.29887160658836365, "rewards/accuracy_reward_stage2": 0.36645978689193726, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 480 }, { "completion_length": 12.46875, "epoch": 0.08428246013667426, "grad_norm": 17.494509829942675, "kl": 0.0361328125, "learning_rate": 9.158927632731732e-07, "loss": -0.0732, "reward": 1.7071616649627686, "reward_std": 0.26007020473480225, "rewards/accuracy_reward_stage2": 0.7384116053581238, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 481 }, { "completion_length": 9.5625, "epoch": 0.08445768354652182, "grad_norm": 26.620563297769827, "kl": 0.1201171875, "learning_rate": 9.157175398633257e-07, "loss": -0.0193, "reward": 1.4905469417572021, "reward_std": 0.2239820957183838, "rewards/accuracy_reward_stage2": 0.5217969417572021, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 482 }, { "completion_length": 8.921875, "epoch": 0.08463290695636937, "grad_norm": 20.51588760061051, "kl": 0.05322265625, "learning_rate": 9.155423164534781e-07, "loss": 0.0213, "reward": 1.6132653951644897, "reward_std": 0.21259036660194397, "rewards/accuracy_reward_stage2": 0.6132654547691345, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 483 }, { "completion_length": 15.375, "epoch": 0.08480813036621693, "grad_norm": 20.733659007741288, "kl": 0.57421875, "learning_rate": 9.153670930436306e-07, "loss": 0.2298, "reward": 1.191416621208191, "reward_std": 0.15625491738319397, "rewards/accuracy_reward_stage2": 0.44141659140586853, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 484 }, { "completion_length": 9.234375, "epoch": 0.08498335377606449, "grad_norm": 17.187443879963023, "kl": 0.06884765625, "learning_rate": 9.151918696337831e-07, "loss": 0.0276, "reward": 1.4177746772766113, "reward_std": 0.16705992817878723, "rewards/accuracy_reward_stage2": 0.5427746772766113, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 485 }, { "completion_length": 12.234375, "epoch": 0.08515857718591203, "grad_norm": 25.971877155932173, "kl": 0.11669921875, "learning_rate": 9.150166462239355e-07, "loss": 0.0468, "reward": 1.5548030138015747, "reward_std": 0.32056811451911926, "rewards/accuracy_reward_stage2": 0.6798031330108643, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 486 }, { "completion_length": 10.265625, "epoch": 0.0853338005957596, "grad_norm": 21.44425590514259, "kl": 0.023681640625, "learning_rate": 9.148414228140879e-07, "loss": 0.0095, "reward": 1.6888264417648315, "reward_std": 0.18528994917869568, "rewards/accuracy_reward_stage2": 0.6888264417648315, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 487 }, { "completion_length": 11.0625, "epoch": 0.08550902400560714, "grad_norm": 15.23099896578219, "kl": 0.6015625, "learning_rate": 9.146661994042404e-07, "loss": 0.2397, "reward": 1.6460518836975098, "reward_std": 0.07517996430397034, "rewards/accuracy_reward_stage2": 0.7710518836975098, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 488 }, { "completion_length": 12.421875, "epoch": 0.0856842474154547, "grad_norm": 18.419604037862406, "kl": 0.06494140625, "learning_rate": 9.144909759943928e-07, "loss": 0.026, "reward": 1.7371560335159302, "reward_std": 0.2151593118906021, "rewards/accuracy_reward_stage2": 0.7371560335159302, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 489 }, { "completion_length": 12.578125, "epoch": 0.08585947082530226, "grad_norm": 18.257266241360327, "kl": 0.0213623046875, "learning_rate": 9.143157525845453e-07, "loss": 0.0085, "reward": 1.5157694816589355, "reward_std": 0.23306876420974731, "rewards/accuracy_reward_stage2": 0.5157694220542908, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 490 }, { "completion_length": 15.25, "epoch": 0.08603469423514981, "grad_norm": 13.805727437190503, "kl": 0.01123046875, "learning_rate": 9.141405291746977e-07, "loss": 0.0045, "reward": 1.2439332008361816, "reward_std": 0.136602520942688, "rewards/accuracy_reward_stage2": 0.36893314123153687, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 491 }, { "completion_length": 11.859375, "epoch": 0.08620991764499737, "grad_norm": 17.94928563968683, "kl": 0.032958984375, "learning_rate": 9.139653057648501e-07, "loss": 0.0131, "reward": 1.6556003093719482, "reward_std": 0.11049705743789673, "rewards/accuracy_reward_stage2": 0.655600368976593, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 492 }, { "completion_length": 11.578125, "epoch": 0.08638514105484493, "grad_norm": 16.562608575277626, "kl": 0.09228515625, "learning_rate": 9.137900823550026e-07, "loss": 0.037, "reward": 1.4523134231567383, "reward_std": 0.13723313808441162, "rewards/accuracy_reward_stage2": 0.5773133635520935, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 493 }, { "completion_length": 7.03125, "epoch": 0.08656036446469248, "grad_norm": 16.621714925514897, "kl": 0.044189453125, "learning_rate": 9.13614858945155e-07, "loss": -0.0917, "reward": 1.780239224433899, "reward_std": 0.18271209299564362, "rewards/accuracy_reward_stage2": 0.8271142244338989, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 494 }, { "completion_length": 11.453125, "epoch": 0.08673558787454004, "grad_norm": 26.540387912232912, "kl": 0.05908203125, "learning_rate": 9.134396355353075e-07, "loss": 0.0236, "reward": 1.4195480346679688, "reward_std": 0.12336726486682892, "rewards/accuracy_reward_stage2": 0.544547975063324, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 495 }, { "completion_length": 9.90625, "epoch": 0.08691081128438759, "grad_norm": 21.874119279086194, "kl": 0.0732421875, "learning_rate": 9.1326441212546e-07, "loss": 0.0293, "reward": 1.704089641571045, "reward_std": 0.1853123903274536, "rewards/accuracy_reward_stage2": 0.7040896415710449, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 496 }, { "completion_length": 17.984375, "epoch": 0.08708603469423515, "grad_norm": 16.270959865875156, "kl": 0.0712890625, "learning_rate": 9.130891887156123e-07, "loss": -0.0157, "reward": 1.3348397016525269, "reward_std": 0.14666268229484558, "rewards/accuracy_reward_stage2": 0.47546470165252686, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 497 }, { "completion_length": 8.953125, "epoch": 0.08726125810408271, "grad_norm": 20.903866923336594, "kl": 0.09326171875, "learning_rate": 9.129139653057648e-07, "loss": -0.0069, "reward": 1.4990664720535278, "reward_std": 0.2881520092487335, "rewards/accuracy_reward_stage2": 0.6396914720535278, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 498 }, { "completion_length": 14.921875, "epoch": 0.08743648151393026, "grad_norm": 17.22315948392657, "kl": 0.1201171875, "learning_rate": 9.127387418959172e-07, "loss": 0.0478, "reward": 1.448919653892517, "reward_std": 0.1353299617767334, "rewards/accuracy_reward_stage2": 0.5739197134971619, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 499 }, { "completion_length": 7.46875, "epoch": 0.08761170492377782, "grad_norm": 16.0429762085806, "kl": 0.02734375, "learning_rate": 9.125635184860697e-07, "loss": -0.0225, "reward": 1.3645137548446655, "reward_std": 0.20214247703552246, "rewards/accuracy_reward_stage2": 0.38013872504234314, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 500 }, { "completion_length": 7.109375, "epoch": 0.08778692833362538, "grad_norm": 13.494480051498577, "kl": 0.01904296875, "learning_rate": 9.123882950762222e-07, "loss": 0.0076, "reward": 1.6657228469848633, "reward_std": 0.10837189853191376, "rewards/accuracy_reward_stage2": 0.6657228469848633, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 501 }, { "completion_length": 11.078125, "epoch": 0.08796215174347292, "grad_norm": 20.820818788522654, "kl": 0.041748046875, "learning_rate": 9.122130716663746e-07, "loss": -0.0113, "reward": 1.6971039772033691, "reward_std": 0.17723074555397034, "rewards/accuracy_reward_stage2": 0.7127288579940796, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 502 }, { "completion_length": 8.9375, "epoch": 0.08813737515332049, "grad_norm": 20.591576189544814, "kl": 0.0859375, "learning_rate": 9.120378482565271e-07, "loss": 0.0345, "reward": 1.5953072309494019, "reward_std": 0.14238935708999634, "rewards/accuracy_reward_stage2": 0.5953072309494019, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 503 }, { "completion_length": 10.0, "epoch": 0.08831259856316805, "grad_norm": 21.333154774702535, "kl": 0.057861328125, "learning_rate": 9.118626248466796e-07, "loss": 0.0232, "reward": 1.5268161296844482, "reward_std": 0.17292073369026184, "rewards/accuracy_reward_stage2": 0.6518160700798035, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 504 }, { "completion_length": 6.71875, "epoch": 0.08848782197301559, "grad_norm": 15.041117262517627, "kl": 0.041259765625, "learning_rate": 9.116874014368319e-07, "loss": 0.0166, "reward": 1.7544504404067993, "reward_std": 0.07580053806304932, "rewards/accuracy_reward_stage2": 0.7544504404067993, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 505 }, { "completion_length": 7.59375, "epoch": 0.08866304538286315, "grad_norm": 26.12056744776605, "kl": 0.150390625, "learning_rate": 9.115121780269844e-07, "loss": 0.0602, "reward": 1.3547911643981934, "reward_std": 0.2755330204963684, "rewards/accuracy_reward_stage2": 0.47979116439819336, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 506 }, { "completion_length": 9.953125, "epoch": 0.0888382687927107, "grad_norm": 29.23205546489329, "kl": 0.042724609375, "learning_rate": 9.113369546171367e-07, "loss": 0.0171, "reward": 1.3176989555358887, "reward_std": 0.16319842636585236, "rewards/accuracy_reward_stage2": 0.44269901514053345, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 507 }, { "completion_length": 9.015625, "epoch": 0.08901349220255826, "grad_norm": 20.57669358054299, "kl": 0.0478515625, "learning_rate": 9.111617312072892e-07, "loss": 0.0191, "reward": 1.525465965270996, "reward_std": 0.12524645030498505, "rewards/accuracy_reward_stage2": 0.5254659652709961, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 508 }, { "completion_length": 12.09375, "epoch": 0.08918871561240582, "grad_norm": 1464.9969971244047, "kl": 2.078125, "learning_rate": 9.109865077974417e-07, "loss": 0.8816, "reward": 1.3966069221496582, "reward_std": 0.1975216120481491, "rewards/accuracy_reward_stage2": 0.5216069221496582, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 509 }, { "completion_length": 15.703125, "epoch": 0.08936393902225337, "grad_norm": 36.949477081771, "kl": 0.47265625, "learning_rate": 9.108112843875941e-07, "loss": 0.1891, "reward": 1.4456298351287842, "reward_std": 0.17112982273101807, "rewards/accuracy_reward_stage2": 0.570629894733429, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 510 }, { "completion_length": 11.75, "epoch": 0.08953916243210093, "grad_norm": 33.871343877158345, "kl": 0.21484375, "learning_rate": 9.106360609777466e-07, "loss": 0.0186, "reward": 1.18915593624115, "reward_std": 0.30198466777801514, "rewards/accuracy_reward_stage2": 0.4704058766365051, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 511 }, { "completion_length": 15.96875, "epoch": 0.08971438584194849, "grad_norm": 20.09570532536062, "kl": 0.07470703125, "learning_rate": 9.10460837567899e-07, "loss": 0.0298, "reward": 1.4191932678222656, "reward_std": 0.1313377022743225, "rewards/accuracy_reward_stage2": 0.41919323801994324, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 512 }, { "completion_length": 8.046875, "epoch": 0.08988960925179604, "grad_norm": 22.475214131805384, "kl": 0.09375, "learning_rate": 9.102856141580515e-07, "loss": 0.0108, "reward": 1.6266400814056396, "reward_std": 0.2724280059337616, "rewards/accuracy_reward_stage2": 0.6422651410102844, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 513 }, { "completion_length": 15.125, "epoch": 0.0900648326616436, "grad_norm": 19.065789675442435, "kl": 0.5703125, "learning_rate": 9.10110390748204e-07, "loss": 0.2284, "reward": 1.4352792501449585, "reward_std": 0.20429880917072296, "rewards/accuracy_reward_stage2": 0.5602791905403137, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 514 }, { "completion_length": 7.03125, "epoch": 0.09024005607149115, "grad_norm": 41.06172188165604, "kl": 0.298828125, "learning_rate": 9.099351673383564e-07, "loss": 0.0861, "reward": 1.4946836233139038, "reward_std": 0.3347545266151428, "rewards/accuracy_reward_stage2": 0.5103086233139038, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 515 }, { "completion_length": 16.71875, "epoch": 0.0904152794813387, "grad_norm": 21.74849083261524, "kl": 0.021728515625, "learning_rate": 9.097599439285089e-07, "loss": -0.0355, "reward": 1.4019594192504883, "reward_std": 0.2302931249141693, "rewards/accuracy_reward_stage2": 0.5425843000411987, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 516 }, { "completion_length": 7.921875, "epoch": 0.09059050289118627, "grad_norm": 20.46413969828866, "kl": 0.072265625, "learning_rate": 9.095847205186612e-07, "loss": 0.0288, "reward": 1.7358605861663818, "reward_std": 0.22160489857196808, "rewards/accuracy_reward_stage2": 0.7358605265617371, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 517 }, { "completion_length": 8.359375, "epoch": 0.09076572630103381, "grad_norm": 19.461831091625143, "kl": 0.0400390625, "learning_rate": 9.094094971088136e-07, "loss": 0.0159, "reward": 1.5917770862579346, "reward_std": 0.20575006306171417, "rewards/accuracy_reward_stage2": 0.5917772054672241, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 518 }, { "completion_length": 23.453125, "epoch": 0.09094094971088137, "grad_norm": 28.252730679984083, "kl": 0.486328125, "learning_rate": 9.092342736989661e-07, "loss": 0.1949, "reward": 1.470628261566162, "reward_std": 0.19762389361858368, "rewards/accuracy_reward_stage2": 0.5956283211708069, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 519 }, { "completion_length": 7.578125, "epoch": 0.09111617312072894, "grad_norm": 20.000340904813825, "kl": 0.04638671875, "learning_rate": 9.090590502891185e-07, "loss": 0.0185, "reward": 1.4284979104995728, "reward_std": 0.12627477943897247, "rewards/accuracy_reward_stage2": 0.428497850894928, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 520 }, { "completion_length": 10.96875, "epoch": 0.09129139653057648, "grad_norm": 19.089316113977233, "kl": 0.109375, "learning_rate": 9.08883826879271e-07, "loss": 0.0438, "reward": 1.519882321357727, "reward_std": 0.12701007723808289, "rewards/accuracy_reward_stage2": 0.644882321357727, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 521 }, { "completion_length": 10.65625, "epoch": 0.09146661994042404, "grad_norm": 17.661386673905188, "kl": 0.051025390625, "learning_rate": 9.087086034694235e-07, "loss": 0.0204, "reward": 1.499420404434204, "reward_std": 0.16511984169483185, "rewards/accuracy_reward_stage2": 0.4994203746318817, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 522 }, { "completion_length": 21.40625, "epoch": 0.09164184335027159, "grad_norm": 16.986774971855642, "kl": 0.0859375, "learning_rate": 9.085333800595759e-07, "loss": 0.0344, "reward": 1.2771297693252563, "reward_std": 0.15670299530029297, "rewards/accuracy_reward_stage2": 0.40212973952293396, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 523 }, { "completion_length": 6.5625, "epoch": 0.09181706676011915, "grad_norm": 13.33918575428173, "kl": 0.06689453125, "learning_rate": 9.083581566497284e-07, "loss": 0.0268, "reward": 1.2362689971923828, "reward_std": 0.01895066723227501, "rewards/accuracy_reward_stage2": 0.36126893758773804, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 524 }, { "completion_length": 8.15625, "epoch": 0.09199229016996671, "grad_norm": 18.44676368745436, "kl": 0.0888671875, "learning_rate": 9.081829332398809e-07, "loss": 0.0357, "reward": 1.4270833730697632, "reward_std": 0.2298484742641449, "rewards/accuracy_reward_stage2": 0.4270833730697632, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 525 }, { "completion_length": 11.53125, "epoch": 0.09216751357981426, "grad_norm": 24.122259978679597, "kl": 0.0791015625, "learning_rate": 9.080077098300333e-07, "loss": 0.0026, "reward": 1.4625226259231567, "reward_std": 0.24698016047477722, "rewards/accuracy_reward_stage2": 0.603147566318512, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 526 }, { "completion_length": 7.9375, "epoch": 0.09234273698966182, "grad_norm": 24.15433980688675, "kl": 0.03173828125, "learning_rate": 9.078324864201857e-07, "loss": 0.0127, "reward": 1.3791133165359497, "reward_std": 0.21480947732925415, "rewards/accuracy_reward_stage2": 0.37911322712898254, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 527 }, { "completion_length": 11.390625, "epoch": 0.09251796039950938, "grad_norm": 24.953994379295434, "kl": 0.1103515625, "learning_rate": 9.076572630103381e-07, "loss": 0.0344, "reward": 1.471587896347046, "reward_std": 0.2744523882865906, "rewards/accuracy_reward_stage2": 0.5965878963470459, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 528 }, { "completion_length": 10.671875, "epoch": 0.09269318380935693, "grad_norm": 27.436250870990296, "kl": 0.09521484375, "learning_rate": 9.074820396004906e-07, "loss": 0.038, "reward": 1.618800401687622, "reward_std": 0.2890097498893738, "rewards/accuracy_reward_stage2": 0.6188005208969116, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 529 }, { "completion_length": 8.40625, "epoch": 0.09286840721920449, "grad_norm": 23.698633010265233, "kl": 0.09130859375, "learning_rate": 9.07306816190643e-07, "loss": -0.0077, "reward": 1.6275365352630615, "reward_std": 0.30690711736679077, "rewards/accuracy_reward_stage2": 0.7681615352630615, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 530 }, { "completion_length": 13.046875, "epoch": 0.09304363062905203, "grad_norm": 21.264761614556896, "kl": 0.06787109375, "learning_rate": 9.071315927807954e-07, "loss": 0.0272, "reward": 1.5593863725662231, "reward_std": 0.16855500638484955, "rewards/accuracy_reward_stage2": 0.5593863725662231, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 531 }, { "completion_length": 8.796875, "epoch": 0.0932188540388996, "grad_norm": 19.428518945294773, "kl": 0.0859375, "learning_rate": 9.069563693709479e-07, "loss": 0.0344, "reward": 1.565098762512207, "reward_std": 0.3714370131492615, "rewards/accuracy_reward_stage2": 0.5650988221168518, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 532 }, { "completion_length": 10.15625, "epoch": 0.09339407744874716, "grad_norm": 20.04251433090847, "kl": 0.019287109375, "learning_rate": 9.067811459611004e-07, "loss": -0.0364, "reward": 1.355189323425293, "reward_std": 0.1609458327293396, "rewards/accuracy_reward_stage2": 0.37081438302993774, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 533 }, { "completion_length": 6.796875, "epoch": 0.0935693008585947, "grad_norm": 20.797059105787657, "kl": 0.039306640625, "learning_rate": 9.066059225512528e-07, "loss": -0.0054, "reward": 1.5364583730697632, "reward_std": 0.2956216037273407, "rewards/accuracy_reward_stage2": 0.5520833730697632, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 534 }, { "completion_length": 14.875, "epoch": 0.09374452426844226, "grad_norm": 24.920972156704856, "kl": 0.69140625, "learning_rate": 9.064306991414053e-07, "loss": 0.2752, "reward": 1.4657204151153564, "reward_std": 0.2909356355667114, "rewards/accuracy_reward_stage2": 0.5907202959060669, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 535 }, { "completion_length": 5.765625, "epoch": 0.09391974767828982, "grad_norm": 11.554153440169287, "kl": 0.0172119140625, "learning_rate": 9.062554757315576e-07, "loss": 0.0069, "reward": 1.587594747543335, "reward_std": 0.11471574753522873, "rewards/accuracy_reward_stage2": 0.7125946879386902, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 536 }, { "completion_length": 14.5, "epoch": 0.09409497108813737, "grad_norm": 18.408333407487323, "kl": 0.06005859375, "learning_rate": 9.060802523217101e-07, "loss": 0.0241, "reward": 1.280574083328247, "reward_std": 0.1755804419517517, "rewards/accuracy_reward_stage2": 0.40557414293289185, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 537 }, { "completion_length": 8.171875, "epoch": 0.09427019449798493, "grad_norm": 24.09543191878335, "kl": 0.2158203125, "learning_rate": 9.059050289118626e-07, "loss": 0.042, "reward": 1.2124801874160767, "reward_std": 0.25265681743621826, "rewards/accuracy_reward_stage2": 0.47810518741607666, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 538 }, { "completion_length": 29.546875, "epoch": 0.09444541790783248, "grad_norm": 21.332602397727992, "kl": 0.0361328125, "learning_rate": 9.05729805502015e-07, "loss": 0.0144, "reward": 1.2393457889556885, "reward_std": 0.20028795301914215, "rewards/accuracy_reward_stage2": 0.36434581875801086, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 539 }, { "completion_length": 19.875, "epoch": 0.09462064131768004, "grad_norm": 23.21808555459532, "kl": 0.7265625, "learning_rate": 9.055545820921675e-07, "loss": 0.292, "reward": 1.217187523841858, "reward_std": 0.20525680482387543, "rewards/accuracy_reward_stage2": 0.3421875238418579, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 540 }, { "completion_length": 18.359375, "epoch": 0.0947958647275276, "grad_norm": 25.148256796049036, "kl": 0.1875, "learning_rate": 9.0537935868232e-07, "loss": 0.0747, "reward": 1.2867825031280518, "reward_std": 0.20961745083332062, "rewards/accuracy_reward_stage2": 0.411782443523407, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 541 }, { "completion_length": 11.375, "epoch": 0.09497108813737515, "grad_norm": 23.908511785902895, "kl": 0.062255859375, "learning_rate": 9.052041352724724e-07, "loss": 0.0249, "reward": 1.2482308149337769, "reward_std": 0.21626482903957367, "rewards/accuracy_reward_stage2": 0.37323081493377686, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 542 }, { "completion_length": 11.96875, "epoch": 0.09514631154722271, "grad_norm": 17.570510934597706, "kl": 0.1220703125, "learning_rate": 9.050289118626248e-07, "loss": 0.0136, "reward": 1.5263545513153076, "reward_std": 0.18224114179611206, "rewards/accuracy_reward_stage2": 0.5419795513153076, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 543 }, { "completion_length": 13.421875, "epoch": 0.09532153495707027, "grad_norm": 22.0167136435298, "kl": 0.1083984375, "learning_rate": 9.048536884527772e-07, "loss": 0.0434, "reward": 1.5312836170196533, "reward_std": 0.2787465751171112, "rewards/accuracy_reward_stage2": 0.5312834978103638, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 544 }, { "completion_length": 7.484375, "epoch": 0.09549675836691782, "grad_norm": 20.119504546456255, "kl": 0.1279296875, "learning_rate": 9.046784650429297e-07, "loss": 0.0513, "reward": 1.6141095161437988, "reward_std": 0.19938796758651733, "rewards/accuracy_reward_stage2": 0.6141095161437988, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 545 }, { "completion_length": 9.328125, "epoch": 0.09567198177676538, "grad_norm": 21.585749591558706, "kl": 0.06982421875, "learning_rate": 9.045032416330821e-07, "loss": 0.028, "reward": 1.459768295288086, "reward_std": 0.1722957044839859, "rewards/accuracy_reward_stage2": 0.5847682356834412, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 546 }, { "completion_length": 20.6875, "epoch": 0.09584720518661294, "grad_norm": 20.96518673634591, "kl": 0.025634765625, "learning_rate": 9.043280182232345e-07, "loss": 0.0103, "reward": 1.3154137134552002, "reward_std": 0.2370438277721405, "rewards/accuracy_reward_stage2": 0.4404137134552002, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 547 }, { "completion_length": 11.328125, "epoch": 0.09602242859646049, "grad_norm": 19.927064137421482, "kl": 0.04931640625, "learning_rate": 9.04152794813387e-07, "loss": 0.0198, "reward": 1.6055917739868164, "reward_std": 0.11641772091388702, "rewards/accuracy_reward_stage2": 0.6055917739868164, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 548 }, { "completion_length": 8.296875, "epoch": 0.09619765200630805, "grad_norm": 21.137165253719424, "kl": 0.05712890625, "learning_rate": 9.039775714035395e-07, "loss": 0.0228, "reward": 1.746246576309204, "reward_std": 0.27798545360565186, "rewards/accuracy_reward_stage2": 0.7462465763092041, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 549 }, { "completion_length": 7.015625, "epoch": 0.09637287541615559, "grad_norm": 17.906357465189267, "kl": 0.10693359375, "learning_rate": 9.038023479936919e-07, "loss": 0.0429, "reward": 1.5435776710510254, "reward_std": 0.12182464450597763, "rewards/accuracy_reward_stage2": 0.5435777902603149, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 550 }, { "completion_length": 10.578125, "epoch": 0.09654809882600315, "grad_norm": 52.716369091651, "kl": 0.138671875, "learning_rate": 9.036271245838444e-07, "loss": 0.0555, "reward": 1.751212239265442, "reward_std": 0.3372414708137512, "rewards/accuracy_reward_stage2": 0.7512121796607971, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 551 }, { "completion_length": 9.796875, "epoch": 0.09672332223585071, "grad_norm": 19.819458175934273, "kl": 0.11279296875, "learning_rate": 9.034519011739968e-07, "loss": 0.0452, "reward": 1.5237207412719727, "reward_std": 0.17480897903442383, "rewards/accuracy_reward_stage2": 0.6487207412719727, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 552 }, { "completion_length": 11.109375, "epoch": 0.09689854564569826, "grad_norm": 16.12663207048521, "kl": 0.0517578125, "learning_rate": 9.032766777641493e-07, "loss": 0.0207, "reward": 1.7208008766174316, "reward_std": 0.1695939302444458, "rewards/accuracy_reward_stage2": 0.7208009362220764, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 553 }, { "completion_length": 14.015625, "epoch": 0.09707376905554582, "grad_norm": 23.486242967320905, "kl": 0.0634765625, "learning_rate": 9.031014543543018e-07, "loss": 0.0253, "reward": 1.3676196336746216, "reward_std": 0.23155313730239868, "rewards/accuracy_reward_stage2": 0.36761969327926636, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 554 }, { "completion_length": 17.125, "epoch": 0.09724899246539338, "grad_norm": 19.557327683653888, "kl": 0.0546875, "learning_rate": 9.029262309444542e-07, "loss": 0.0218, "reward": 1.3779256343841553, "reward_std": 0.14323818683624268, "rewards/accuracy_reward_stage2": 0.5029256343841553, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 555 }, { "completion_length": 17.546875, "epoch": 0.09742421587524093, "grad_norm": 17.80871699497735, "kl": 0.7734375, "learning_rate": 9.027510075346065e-07, "loss": 0.3101, "reward": 1.411677598953247, "reward_std": 0.13221172988414764, "rewards/accuracy_reward_stage2": 0.5366775989532471, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 556 }, { "completion_length": 11.46875, "epoch": 0.09759943928508849, "grad_norm": 15.659891701415704, "kl": 0.08642578125, "learning_rate": 9.02575784124759e-07, "loss": 0.0056, "reward": 1.859658122062683, "reward_std": 0.14201951026916504, "rewards/accuracy_reward_stage2": 0.8752831220626831, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 557 }, { "completion_length": 14.34375, "epoch": 0.09777466269493604, "grad_norm": 24.3519660210772, "kl": 0.0181884765625, "learning_rate": 9.024005607149114e-07, "loss": 0.0073, "reward": 1.4546735286712646, "reward_std": 0.31135839223861694, "rewards/accuracy_reward_stage2": 0.4546734690666199, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 558 }, { "completion_length": 11.953125, "epoch": 0.0979498861047836, "grad_norm": 20.241237658071395, "kl": 0.107421875, "learning_rate": 9.022253373050639e-07, "loss": 0.0429, "reward": 1.2426481246948242, "reward_std": 0.18194395303726196, "rewards/accuracy_reward_stage2": 0.3676481544971466, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 559 }, { "completion_length": 6.640625, "epoch": 0.09812510951463116, "grad_norm": 24.783421445628193, "kl": 0.10546875, "learning_rate": 9.020501138952163e-07, "loss": 0.0424, "reward": 1.6440285444259644, "reward_std": 0.3332129716873169, "rewards/accuracy_reward_stage2": 0.6440285444259644, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 560 }, { "completion_length": 18.84375, "epoch": 0.0983003329244787, "grad_norm": 18.366188266106377, "kl": 0.34765625, "learning_rate": 9.018748904853688e-07, "loss": 0.1393, "reward": 1.3278311491012573, "reward_std": 0.17044323682785034, "rewards/accuracy_reward_stage2": 0.4528311789035797, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 561 }, { "completion_length": 9.625, "epoch": 0.09847555633432627, "grad_norm": 23.72519010814911, "kl": 0.053466796875, "learning_rate": 9.016996670755213e-07, "loss": 0.0214, "reward": 1.7041335105895996, "reward_std": 0.27296823263168335, "rewards/accuracy_reward_stage2": 0.7041334509849548, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 562 }, { "completion_length": 14.109375, "epoch": 0.09865077974417383, "grad_norm": 15.513103779989715, "kl": 0.05224609375, "learning_rate": 9.015244436656737e-07, "loss": 0.0209, "reward": 1.4729877710342407, "reward_std": 0.1910639852285385, "rewards/accuracy_reward_stage2": 0.4729878604412079, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 563 }, { "completion_length": 10.921875, "epoch": 0.09882600315402137, "grad_norm": 24.6052578581256, "kl": 0.07470703125, "learning_rate": 9.013492202558262e-07, "loss": 0.0084, "reward": 1.7897675037384033, "reward_std": 0.2912678122520447, "rewards/accuracy_reward_stage2": 0.8053925037384033, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 564 }, { "completion_length": 12.8125, "epoch": 0.09900122656386894, "grad_norm": 23.816204925967753, "kl": 0.1689453125, "learning_rate": 9.011739968459787e-07, "loss": 0.0676, "reward": 1.3777607679367065, "reward_std": 0.19424016773700714, "rewards/accuracy_reward_stage2": 0.7527608275413513, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 565 }, { "completion_length": 11.1875, "epoch": 0.09917644997371648, "grad_norm": 23.472488849189013, "kl": 0.06591796875, "learning_rate": 9.00998773436131e-07, "loss": 0.0265, "reward": 1.4185214042663574, "reward_std": 0.1836758255958557, "rewards/accuracy_reward_stage2": 0.41852134466171265, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 566 }, { "completion_length": 10.03125, "epoch": 0.09935167338356404, "grad_norm": 21.483289325649206, "kl": 0.099609375, "learning_rate": 9.008235500262835e-07, "loss": 0.0398, "reward": 1.4853670597076416, "reward_std": 0.18764030933380127, "rewards/accuracy_reward_stage2": 0.7353670597076416, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 567 }, { "completion_length": 8.3125, "epoch": 0.0995268967934116, "grad_norm": 22.82355246060793, "kl": 0.08984375, "learning_rate": 9.006483266164358e-07, "loss": 0.0359, "reward": 1.4687992334365845, "reward_std": 0.3864898681640625, "rewards/accuracy_reward_stage2": 0.5937991738319397, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 568 }, { "completion_length": 9.296875, "epoch": 0.09970212020325915, "grad_norm": 16.1402694836104, "kl": 0.033935546875, "learning_rate": 9.004731032065883e-07, "loss": 0.0136, "reward": 1.5385760068893433, "reward_std": 0.17539136111736298, "rewards/accuracy_reward_stage2": 0.6635760068893433, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 569 }, { "completion_length": 8.828125, "epoch": 0.09987734361310671, "grad_norm": 24.375755946771584, "kl": 0.126953125, "learning_rate": 9.002978797967408e-07, "loss": 0.0509, "reward": 1.4903528690338135, "reward_std": 0.23065432906150818, "rewards/accuracy_reward_stage2": 0.6153527498245239, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 570 }, { "completion_length": 13.546875, "epoch": 0.10005256702295427, "grad_norm": 14.409670458769568, "kl": 0.026123046875, "learning_rate": 9.001226563868932e-07, "loss": 0.0105, "reward": 1.639979362487793, "reward_std": 0.05574566125869751, "rewards/accuracy_reward_stage2": 0.6399792432785034, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 571 }, { "completion_length": 8.1875, "epoch": 0.10022779043280182, "grad_norm": 21.203228039697116, "kl": 0.033447265625, "learning_rate": 8.999474329770457e-07, "loss": 0.0133, "reward": 1.7701388597488403, "reward_std": 0.15610602498054504, "rewards/accuracy_reward_stage2": 0.7701388597488403, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 572 }, { "completion_length": 13.515625, "epoch": 0.10040301384264938, "grad_norm": 248.60038049667256, "kl": 1.140625, "learning_rate": 8.997722095671982e-07, "loss": 0.4545, "reward": 1.1850402355194092, "reward_std": 0.3066478967666626, "rewards/accuracy_reward_stage2": 0.560040295124054, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 573 }, { "completion_length": 8.09375, "epoch": 0.10057823725249693, "grad_norm": 17.849366329021635, "kl": 0.05224609375, "learning_rate": 8.995969861573506e-07, "loss": 0.0209, "reward": 1.8501524925231934, "reward_std": 0.1443457454442978, "rewards/accuracy_reward_stage2": 0.8501523733139038, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 574 }, { "completion_length": 5.625, "epoch": 0.10075346066234449, "grad_norm": 15.276267483118692, "kl": 0.0654296875, "learning_rate": 8.994217627475031e-07, "loss": 0.0261, "reward": 1.4483295679092407, "reward_std": 0.11541729420423508, "rewards/accuracy_reward_stage2": 0.5733295679092407, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 575 }, { "completion_length": 37.859375, "epoch": 0.10092868407219205, "grad_norm": 3981.8869707992453, "kl": 23.375, "learning_rate": 8.992465393376554e-07, "loss": 9.357, "reward": 1.3813152313232422, "reward_std": 0.07771497964859009, "rewards/accuracy_reward_stage2": 0.7563152313232422, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 576 }, { "completion_length": 13.828125, "epoch": 0.1011039074820396, "grad_norm": 22.44950284758865, "kl": 0.06640625, "learning_rate": 8.990713159278079e-07, "loss": 0.0267, "reward": 1.568946123123169, "reward_std": 0.2095562368631363, "rewards/accuracy_reward_stage2": 0.568946123123169, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 577 }, { "completion_length": 7.9375, "epoch": 0.10127913089188716, "grad_norm": 26.53547380462697, "kl": 0.12353515625, "learning_rate": 8.988960925179604e-07, "loss": 0.0495, "reward": 1.6583962440490723, "reward_std": 0.29453662037849426, "rewards/accuracy_reward_stage2": 0.658396303653717, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 578 }, { "completion_length": 9.3125, "epoch": 0.10145435430173472, "grad_norm": 16.845420567269674, "kl": 0.030029296875, "learning_rate": 8.987208691081128e-07, "loss": 0.012, "reward": 1.513580560684204, "reward_std": 0.22537294030189514, "rewards/accuracy_reward_stage2": 0.5135806202888489, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 579 }, { "completion_length": 9.28125, "epoch": 0.10162957771158226, "grad_norm": 23.688189724072313, "kl": 0.08984375, "learning_rate": 8.985456456982653e-07, "loss": 0.0359, "reward": 1.4752018451690674, "reward_std": 0.3478262424468994, "rewards/accuracy_reward_stage2": 0.6002017855644226, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 580 }, { "completion_length": 10.578125, "epoch": 0.10180480112142982, "grad_norm": 21.007284172674584, "kl": 0.099609375, "learning_rate": 8.983704222884176e-07, "loss": 0.0397, "reward": 1.5977373123168945, "reward_std": 0.2676808834075928, "rewards/accuracy_reward_stage2": 0.5977373719215393, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 581 }, { "completion_length": 28.8125, "epoch": 0.10198002453127737, "grad_norm": 22.332576010592028, "kl": 0.322265625, "learning_rate": 8.981951988785701e-07, "loss": 0.1292, "reward": 1.1091969013214111, "reward_std": 0.1680152416229248, "rewards/accuracy_reward_stage2": 0.3591969609260559, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 582 }, { "completion_length": 9.671875, "epoch": 0.10215524794112493, "grad_norm": 18.801648588335883, "kl": 0.048828125, "learning_rate": 8.980199754687226e-07, "loss": 0.0196, "reward": 1.2164499759674072, "reward_std": 0.17073309421539307, "rewards/accuracy_reward_stage2": 0.21645000576972961, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 583 }, { "completion_length": 9.234375, "epoch": 0.1023304713509725, "grad_norm": 15.080559029811127, "kl": 0.0216064453125, "learning_rate": 8.97844752058875e-07, "loss": 0.0087, "reward": 1.6748721599578857, "reward_std": 0.123601995408535, "rewards/accuracy_reward_stage2": 0.6748720407485962, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 584 }, { "completion_length": 20.1875, "epoch": 0.10250569476082004, "grad_norm": 30.400426260278596, "kl": 0.53125, "learning_rate": 8.976695286490275e-07, "loss": 0.1686, "reward": 1.4335464239120483, "reward_std": 0.15213021636009216, "rewards/accuracy_reward_stage2": 0.5741714239120483, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 585 }, { "completion_length": 10.390625, "epoch": 0.1026809181706676, "grad_norm": 22.515022914890093, "kl": 0.1064453125, "learning_rate": 8.974943052391799e-07, "loss": -0.0016, "reward": 1.328930139541626, "reward_std": 0.270600289106369, "rewards/accuracy_reward_stage2": 0.469555139541626, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 586 }, { "completion_length": 9.484375, "epoch": 0.10285614158051516, "grad_norm": 22.364167132308026, "kl": 0.10986328125, "learning_rate": 8.973190818293323e-07, "loss": 0.0439, "reward": 1.5296134948730469, "reward_std": 0.2580759525299072, "rewards/accuracy_reward_stage2": 0.5296134948730469, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 587 }, { "completion_length": 18.15625, "epoch": 0.10303136499036271, "grad_norm": 21.189378057515125, "kl": 0.0927734375, "learning_rate": 8.971438584194848e-07, "loss": 0.0371, "reward": 1.5236037969589233, "reward_std": 0.26827800273895264, "rewards/accuracy_reward_stage2": 0.5236037969589233, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 588 }, { "completion_length": 10.703125, "epoch": 0.10320658840021027, "grad_norm": 17.39955052196623, "kl": 0.062255859375, "learning_rate": 8.969686350096372e-07, "loss": 0.0249, "reward": 1.7869575023651123, "reward_std": 0.19973281025886536, "rewards/accuracy_reward_stage2": 0.7869575619697571, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 589 }, { "completion_length": 14.03125, "epoch": 0.10338181181005783, "grad_norm": 25.503001638471407, "kl": 0.37109375, "learning_rate": 8.967934115997897e-07, "loss": 0.1483, "reward": 1.2879681587219238, "reward_std": 0.29135996103286743, "rewards/accuracy_reward_stage2": 0.5379682183265686, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 590 }, { "completion_length": 19.984375, "epoch": 0.10355703521990538, "grad_norm": 19.80642056865787, "kl": 0.3828125, "learning_rate": 8.966181881899422e-07, "loss": 0.1527, "reward": 1.4519280195236206, "reward_std": 0.10544341057538986, "rewards/accuracy_reward_stage2": 0.7019280195236206, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 591 }, { "completion_length": 9.0625, "epoch": 0.10373225862975294, "grad_norm": 17.164179326779152, "kl": 0.0208740234375, "learning_rate": 8.964429647800946e-07, "loss": -0.0669, "reward": 1.6845653057098389, "reward_std": 0.2696949243545532, "rewards/accuracy_reward_stage2": 0.8408153057098389, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 592 }, { "completion_length": 9.84375, "epoch": 0.10390748203960049, "grad_norm": 20.757377644711703, "kl": 0.03857421875, "learning_rate": 8.962677413702471e-07, "loss": 0.0154, "reward": 1.6968727111816406, "reward_std": 0.17491915822029114, "rewards/accuracy_reward_stage2": 0.6968726515769958, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 593 }, { "completion_length": 12.59375, "epoch": 0.10408270544944805, "grad_norm": 29.563787899550757, "kl": 0.044677734375, "learning_rate": 8.960925179603995e-07, "loss": 0.0403, "reward": 1.6349983215332031, "reward_std": 0.1502247452735901, "rewards/accuracy_reward_stage2": 0.7599983215332031, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 594 }, { "completion_length": 12.609375, "epoch": 0.1042579288592956, "grad_norm": 14.903961141826915, "kl": 0.0439453125, "learning_rate": 8.959172945505519e-07, "loss": -0.0343, "reward": 1.5937268733978271, "reward_std": 0.19302524626255035, "rewards/accuracy_reward_stage2": 0.6249768733978271, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 595 }, { "completion_length": 21.4375, "epoch": 0.10443315226914315, "grad_norm": 26.291996159246033, "kl": 0.038330078125, "learning_rate": 8.957420711407043e-07, "loss": -0.0162, "reward": 1.3569380044937134, "reward_std": 0.20827914774417877, "rewards/accuracy_reward_stage2": 0.3725629448890686, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 596 }, { "completion_length": 7.390625, "epoch": 0.10460837567899071, "grad_norm": 16.93122883453144, "kl": 0.052978515625, "learning_rate": 8.955668477308567e-07, "loss": 0.0212, "reward": 1.5769851207733154, "reward_std": 0.25848501920700073, "rewards/accuracy_reward_stage2": 0.7019850015640259, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 597 }, { "completion_length": 23.1875, "epoch": 0.10478359908883828, "grad_norm": 60.151052040310844, "kl": 0.81640625, "learning_rate": 8.953916243210092e-07, "loss": 0.2832, "reward": 1.166857123374939, "reward_std": 0.17063213884830475, "rewards/accuracy_reward_stage2": 0.30748212337493896, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 598 }, { "completion_length": 9.84375, "epoch": 0.10495882249868582, "grad_norm": 29.438033012180327, "kl": 0.1962890625, "learning_rate": 8.952164009111617e-07, "loss": 0.0351, "reward": 1.6112334728240967, "reward_std": 0.2669917345046997, "rewards/accuracy_reward_stage2": 0.6268585324287415, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 599 }, { "completion_length": 15.953125, "epoch": 0.10513404590853338, "grad_norm": 19.207758702593235, "kl": 0.0771484375, "learning_rate": 8.950411775013141e-07, "loss": -0.0133, "reward": 1.3625128269195557, "reward_std": 0.12542307376861572, "rewards/accuracy_reward_stage2": 0.37813782691955566, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 600 }, { "completion_length": 13.1875, "epoch": 0.10530926931838093, "grad_norm": 22.80070618758305, "kl": 0.0947265625, "learning_rate": 8.948659540914666e-07, "loss": 0.0107, "reward": 1.4847618341445923, "reward_std": 0.20820224285125732, "rewards/accuracy_reward_stage2": 0.5003868341445923, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 601 }, { "completion_length": 13.59375, "epoch": 0.10548449272822849, "grad_norm": 16.16392296114711, "kl": 0.08251953125, "learning_rate": 8.946907306816191e-07, "loss": -0.0004, "reward": 1.6386563777923584, "reward_std": 0.14577843248844147, "rewards/accuracy_reward_stage2": 0.6542813181877136, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 602 }, { "completion_length": 9.375, "epoch": 0.10565971613807605, "grad_norm": 22.11154070768946, "kl": 0.04150390625, "learning_rate": 8.945155072717715e-07, "loss": 0.0166, "reward": 1.7849338054656982, "reward_std": 0.2635495364665985, "rewards/accuracy_reward_stage2": 0.7849337458610535, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 603 }, { "completion_length": 21.484375, "epoch": 0.1058349395479236, "grad_norm": 25.437037649937153, "kl": 0.33203125, "learning_rate": 8.94340283861924e-07, "loss": 0.1333, "reward": 1.4886643886566162, "reward_std": 0.19219179451465607, "rewards/accuracy_reward_stage2": 0.6136643886566162, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 604 }, { "completion_length": 14.390625, "epoch": 0.10601016295777116, "grad_norm": 12.178068721041292, "kl": 0.00909423828125, "learning_rate": 8.941650604520764e-07, "loss": 0.0036, "reward": 1.5776515007019043, "reward_std": 0.1157275140285492, "rewards/accuracy_reward_stage2": 0.5776515007019043, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 605 }, { "completion_length": 13.109375, "epoch": 0.10618538636761872, "grad_norm": 20.590768959219737, "kl": 0.0311279296875, "learning_rate": 8.939898370422288e-07, "loss": -0.0317, "reward": 1.5676214694976807, "reward_std": 0.10006687045097351, "rewards/accuracy_reward_stage2": 0.5832464694976807, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 606 }, { "completion_length": 10.03125, "epoch": 0.10636060977746627, "grad_norm": 28.16540860038017, "kl": 0.224609375, "learning_rate": 8.938146136323812e-07, "loss": 0.0899, "reward": 1.3759479522705078, "reward_std": 0.07002376019954681, "rewards/accuracy_reward_stage2": 0.5009479522705078, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 607 }, { "completion_length": 9.515625, "epoch": 0.10653583318731383, "grad_norm": 15.000548669218713, "kl": 0.05859375, "learning_rate": 8.936393902225336e-07, "loss": 0.0234, "reward": 1.5940972566604614, "reward_std": 0.1635403335094452, "rewards/accuracy_reward_stage2": 0.7190971970558167, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 608 }, { "completion_length": 7.4375, "epoch": 0.10671105659716137, "grad_norm": 19.213288800692773, "kl": 0.043701171875, "learning_rate": 8.934641668126861e-07, "loss": 0.0175, "reward": 1.2412978410720825, "reward_std": 0.2668173313140869, "rewards/accuracy_reward_stage2": 0.3662978708744049, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 609 }, { "completion_length": 16.1875, "epoch": 0.10688628000700894, "grad_norm": 22.02974792763698, "kl": 0.55078125, "learning_rate": 8.932889434028386e-07, "loss": 0.2196, "reward": 1.4358563423156738, "reward_std": 0.2029583603143692, "rewards/accuracy_reward_stage2": 0.6858564615249634, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 610 }, { "completion_length": 8.765625, "epoch": 0.1070615034168565, "grad_norm": 27.041480284219045, "kl": 0.017578125, "learning_rate": 8.93113719992991e-07, "loss": 0.007, "reward": 1.6799907684326172, "reward_std": 0.11624743044376373, "rewards/accuracy_reward_stage2": 0.6799907088279724, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 611 }, { "completion_length": 9.875, "epoch": 0.10723672682670404, "grad_norm": 124.73618694081247, "kl": 0.037109375, "learning_rate": 8.929384965831435e-07, "loss": 0.0149, "reward": 1.5989768505096436, "reward_std": 0.1074318140745163, "rewards/accuracy_reward_stage2": 0.5989767909049988, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 612 }, { "completion_length": 8.09375, "epoch": 0.1074119502365516, "grad_norm": 21.868434969689023, "kl": 0.041259765625, "learning_rate": 8.927632731732959e-07, "loss": -0.0277, "reward": 1.1822917461395264, "reward_std": 0.16993504762649536, "rewards/accuracy_reward_stage2": 0.1979166716337204, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 613 }, { "completion_length": 10.25, "epoch": 0.10758717364639916, "grad_norm": 20.31274312952241, "kl": 0.061279296875, "learning_rate": 8.925880497634484e-07, "loss": -0.0486, "reward": 1.7639718055725098, "reward_std": 0.2529860734939575, "rewards/accuracy_reward_stage2": 0.9202218055725098, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 614 }, { "completion_length": 10.625, "epoch": 0.10776239705624671, "grad_norm": 22.26824541507161, "kl": 0.06103515625, "learning_rate": 8.924128263536009e-07, "loss": 0.0245, "reward": 1.5011794567108154, "reward_std": 0.23802436888217926, "rewards/accuracy_reward_stage2": 0.5011795163154602, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 615 }, { "completion_length": 12.109375, "epoch": 0.10793762046609427, "grad_norm": 17.995465269921365, "kl": 0.06689453125, "learning_rate": 8.922376029437532e-07, "loss": 0.0268, "reward": 1.575548529624939, "reward_std": 0.20315426588058472, "rewards/accuracy_reward_stage2": 0.700548529624939, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 616 }, { "completion_length": 13.984375, "epoch": 0.10811284387594182, "grad_norm": 19.57701236061785, "kl": 0.36328125, "learning_rate": 8.920623795339057e-07, "loss": 0.1457, "reward": 1.6030621528625488, "reward_std": 0.13488642871379852, "rewards/accuracy_reward_stage2": 0.7280622124671936, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 617 }, { "completion_length": 17.46875, "epoch": 0.10828806728578938, "grad_norm": 20.320395279142957, "kl": 0.046142578125, "learning_rate": 8.918871561240582e-07, "loss": 0.0184, "reward": 1.4128468036651611, "reward_std": 0.19677528738975525, "rewards/accuracy_reward_stage2": 0.41284680366516113, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 618 }, { "completion_length": 6.234375, "epoch": 0.10846329069563694, "grad_norm": 21.100346568277157, "kl": 0.03173828125, "learning_rate": 8.917119327142105e-07, "loss": 0.0126, "reward": 1.506620168685913, "reward_std": 0.19347314536571503, "rewards/accuracy_reward_stage2": 0.5066201686859131, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 619 }, { "completion_length": 10.03125, "epoch": 0.10863851410548449, "grad_norm": 20.38714158047014, "kl": 0.0306396484375, "learning_rate": 8.91536709304363e-07, "loss": 0.0122, "reward": 1.515625, "reward_std": 0.2109457552433014, "rewards/accuracy_reward_stage2": 0.515625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 620 }, { "completion_length": 6.15625, "epoch": 0.10881373751533205, "grad_norm": 13.837433374871935, "kl": 0.08251953125, "learning_rate": 8.913614858945154e-07, "loss": 0.033, "reward": 1.5983612537384033, "reward_std": 0.13748112320899963, "rewards/accuracy_reward_stage2": 0.5983611941337585, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 621 }, { "completion_length": 9.46875, "epoch": 0.10898896092517961, "grad_norm": 22.91529731345705, "kl": 0.1708984375, "learning_rate": 8.911862624846679e-07, "loss": 0.0683, "reward": 1.6282224655151367, "reward_std": 0.23113414645195007, "rewards/accuracy_reward_stage2": 0.6282224655151367, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 622 }, { "completion_length": 14.25, "epoch": 0.10916418433502716, "grad_norm": 15.083020792961545, "kl": 0.0264892578125, "learning_rate": 8.910110390748204e-07, "loss": 0.0106, "reward": 1.7609566450119019, "reward_std": 0.18378415703773499, "rewards/accuracy_reward_stage2": 0.7609566450119019, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 623 }, { "completion_length": 16.765625, "epoch": 0.10933940774487472, "grad_norm": 21.66425736767919, "kl": 0.306640625, "learning_rate": 8.908358156649728e-07, "loss": 0.1057, "reward": 1.4428138732910156, "reward_std": 0.12810033559799194, "rewards/accuracy_reward_stage2": 0.5678137540817261, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 624 }, { "completion_length": 11.375, "epoch": 0.10951463115472228, "grad_norm": 22.060448935490477, "kl": 0.0830078125, "learning_rate": 8.906605922551253e-07, "loss": 0.0331, "reward": 1.6386213302612305, "reward_std": 0.10365209728479385, "rewards/accuracy_reward_stage2": 0.6386213302612305, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 625 }, { "completion_length": 9.5625, "epoch": 0.10968985456456982, "grad_norm": 15.622154479911362, "kl": 0.020751953125, "learning_rate": 8.904853688452777e-07, "loss": -0.0648, "reward": 1.841752290725708, "reward_std": 0.1866808980703354, "rewards/accuracy_reward_stage2": 0.8730022311210632, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 626 }, { "completion_length": 11.953125, "epoch": 0.10986507797441739, "grad_norm": 12.336957618950738, "kl": 0.07470703125, "learning_rate": 8.903101454354301e-07, "loss": 0.0299, "reward": 1.5518933534622192, "reward_std": 0.07083739340305328, "rewards/accuracy_reward_stage2": 0.6768933534622192, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 627 }, { "completion_length": 11.875, "epoch": 0.11004030138426493, "grad_norm": 19.012998512084977, "kl": 0.0830078125, "learning_rate": 8.901349220255826e-07, "loss": -0.0109, "reward": 1.2286701202392578, "reward_std": 0.21301977336406708, "rewards/accuracy_reward_stage2": 0.24429510533809662, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 628 }, { "completion_length": 9.28125, "epoch": 0.1102155247941125, "grad_norm": 19.74217319899889, "kl": 0.287109375, "learning_rate": 8.89959698615735e-07, "loss": 0.1149, "reward": 1.5378367900848389, "reward_std": 0.13060753047466278, "rewards/accuracy_reward_stage2": 0.6628367900848389, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 629 }, { "completion_length": 9.296875, "epoch": 0.11039074820396005, "grad_norm": 19.776418120024495, "kl": 0.040283203125, "learning_rate": 8.897844752058875e-07, "loss": -0.057, "reward": 1.629166603088379, "reward_std": 0.21690016984939575, "rewards/accuracy_reward_stage2": 0.6604166626930237, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 630 }, { "completion_length": 7.625, "epoch": 0.1105659716138076, "grad_norm": 17.89716035722575, "kl": 0.12109375, "learning_rate": 8.8960925179604e-07, "loss": 0.009, "reward": 1.456545114517212, "reward_std": 0.1892307996749878, "rewards/accuracy_reward_stage2": 0.47217005491256714, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 631 }, { "completion_length": 7.671875, "epoch": 0.11074119502365516, "grad_norm": 19.94611007960208, "kl": 0.0595703125, "learning_rate": 8.894340283861923e-07, "loss": 0.0238, "reward": 1.5591226816177368, "reward_std": 0.14498020708560944, "rewards/accuracy_reward_stage2": 0.5591225624084473, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 632 }, { "completion_length": 15.0, "epoch": 0.11091641843350272, "grad_norm": 19.2567062060884, "kl": 0.062255859375, "learning_rate": 8.892588049763448e-07, "loss": -0.0193, "reward": 1.8932830095291138, "reward_std": 0.18728239834308624, "rewards/accuracy_reward_stage2": 0.9089080095291138, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 633 }, { "completion_length": 16.828125, "epoch": 0.11109164184335027, "grad_norm": 96.32237761630043, "kl": 0.2412109375, "learning_rate": 8.890835815664973e-07, "loss": 0.0084, "reward": 1.396424651145935, "reward_std": 0.2588643431663513, "rewards/accuracy_reward_stage2": 0.42767465114593506, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 634 }, { "completion_length": 8.90625, "epoch": 0.11126686525319783, "grad_norm": 33.13647172827168, "kl": 0.1201171875, "learning_rate": 8.889083581566496e-07, "loss": 0.0482, "reward": 1.541839361190796, "reward_std": 0.2884628474712372, "rewards/accuracy_reward_stage2": 0.6668393611907959, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 635 }, { "completion_length": 10.921875, "epoch": 0.11144208866304538, "grad_norm": 22.79993073340755, "kl": 0.056640625, "learning_rate": 8.887331347468021e-07, "loss": 0.0227, "reward": 1.5613070726394653, "reward_std": 0.24244731664657593, "rewards/accuracy_reward_stage2": 0.7019320130348206, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 636 }, { "completion_length": 10.40625, "epoch": 0.11161731207289294, "grad_norm": 25.892737746278428, "kl": 0.040771484375, "learning_rate": 8.885579113369545e-07, "loss": 0.0163, "reward": 1.7931983470916748, "reward_std": 0.22556662559509277, "rewards/accuracy_reward_stage2": 0.7931983470916748, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 637 }, { "completion_length": 19.671875, "epoch": 0.1117925354827405, "grad_norm": 3265.4880349670693, "kl": 11.375, "learning_rate": 8.88382687927107e-07, "loss": 4.5539, "reward": 1.350884199142456, "reward_std": 0.21642833948135376, "rewards/accuracy_reward_stage2": 0.47588419914245605, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 638 }, { "completion_length": 9.40625, "epoch": 0.11196775889258805, "grad_norm": 25.598904130512008, "kl": 0.07763671875, "learning_rate": 8.882074645172595e-07, "loss": -0.0043, "reward": 1.550414800643921, "reward_std": 0.26173245906829834, "rewards/accuracy_reward_stage2": 0.5660399198532104, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 639 }, { "completion_length": 13.75, "epoch": 0.1121429823024356, "grad_norm": 435.07023916376096, "kl": 2.25, "learning_rate": 8.880322411074119e-07, "loss": 0.9058, "reward": 1.3459508419036865, "reward_std": 0.1702684909105301, "rewards/accuracy_reward_stage2": 0.4709508419036865, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 640 }, { "completion_length": 9.6875, "epoch": 0.11231820571228317, "grad_norm": 19.688791320693465, "kl": 0.099609375, "learning_rate": 8.878570176975644e-07, "loss": 0.0398, "reward": 1.2360773086547852, "reward_std": 0.13248330354690552, "rewards/accuracy_reward_stage2": 0.36107730865478516, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 641 }, { "completion_length": 7.828125, "epoch": 0.11249342912213071, "grad_norm": 36.62343718508925, "kl": 0.06494140625, "learning_rate": 8.876817942877169e-07, "loss": 0.026, "reward": 1.5003522634506226, "reward_std": 0.1630491018295288, "rewards/accuracy_reward_stage2": 0.5003523230552673, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 642 }, { "completion_length": 11.84375, "epoch": 0.11266865253197828, "grad_norm": 24.784720605465544, "kl": 0.14453125, "learning_rate": 8.875065708778693e-07, "loss": 0.0134, "reward": 1.6077549457550049, "reward_std": 0.23488447070121765, "rewards/accuracy_reward_stage2": 0.7483799457550049, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 643 }, { "completion_length": 14.46875, "epoch": 0.11284387594182582, "grad_norm": 25.261374722671505, "kl": 0.09619140625, "learning_rate": 8.873313474680218e-07, "loss": 0.0138, "reward": 1.5423240661621094, "reward_std": 0.23243725299835205, "rewards/accuracy_reward_stage2": 0.5579490661621094, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 644 }, { "completion_length": 12.09375, "epoch": 0.11301909935167338, "grad_norm": 24.468911275019025, "kl": 0.310546875, "learning_rate": 8.87156124058174e-07, "loss": 0.1233, "reward": 1.5601372718811035, "reward_std": 0.12885455787181854, "rewards/accuracy_reward_stage2": 0.6851372122764587, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 645 }, { "completion_length": 30.671875, "epoch": 0.11319432276152094, "grad_norm": 19.03420864463341, "kl": 0.0155029296875, "learning_rate": 8.869809006483265e-07, "loss": 0.0062, "reward": 1.8722697496414185, "reward_std": 0.08883378654718399, "rewards/accuracy_reward_stage2": 0.8722698092460632, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 646 }, { "completion_length": 10.03125, "epoch": 0.11336954617136849, "grad_norm": 26.39720044309794, "kl": 0.1533203125, "learning_rate": 8.86805677238479e-07, "loss": -0.0185, "reward": 1.4335455894470215, "reward_std": 0.3102685213088989, "rewards/accuracy_reward_stage2": 0.4647955894470215, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 647 }, { "completion_length": 12.25, "epoch": 0.11354476958121605, "grad_norm": 20.080247397857068, "kl": 0.08203125, "learning_rate": 8.866304538286314e-07, "loss": 0.0329, "reward": 1.620429515838623, "reward_std": 0.22463209927082062, "rewards/accuracy_reward_stage2": 0.6204294562339783, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 648 }, { "completion_length": 10.703125, "epoch": 0.11371999299106361, "grad_norm": 25.501686410903066, "kl": 0.1396484375, "learning_rate": 8.864552304187839e-07, "loss": 0.0559, "reward": 1.4538609981536865, "reward_std": 0.2756735682487488, "rewards/accuracy_reward_stage2": 0.5788609385490417, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 649 }, { "completion_length": 7.484375, "epoch": 0.11389521640091116, "grad_norm": 20.12953702589919, "kl": 0.039794921875, "learning_rate": 8.862800070089363e-07, "loss": -0.0234, "reward": 1.4669541120529175, "reward_std": 0.1871233880519867, "rewards/accuracy_reward_stage2": 0.4825791120529175, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 650 }, { "completion_length": 8.859375, "epoch": 0.11407043981075872, "grad_norm": 21.64836519336152, "kl": 0.07666015625, "learning_rate": 8.861047835990888e-07, "loss": 0.0307, "reward": 1.3550217151641846, "reward_std": 0.2267133742570877, "rewards/accuracy_reward_stage2": 0.35502177476882935, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 651 }, { "completion_length": 10.359375, "epoch": 0.11424566322060627, "grad_norm": 18.437674084025137, "kl": 0.03271484375, "learning_rate": 8.859295601892413e-07, "loss": 0.0346, "reward": 1.4443836212158203, "reward_std": 0.22392143309116364, "rewards/accuracy_reward_stage2": 0.5693836808204651, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 652 }, { "completion_length": 8.609375, "epoch": 0.11442088663045383, "grad_norm": 19.893784433796597, "kl": 0.06982421875, "learning_rate": 8.857543367793937e-07, "loss": 0.0004, "reward": 1.4368394613265991, "reward_std": 0.24404609203338623, "rewards/accuracy_reward_stage2": 0.5774644613265991, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 653 }, { "completion_length": 11.859375, "epoch": 0.11459611004030139, "grad_norm": 14.241918651909575, "kl": 0.119140625, "learning_rate": 8.855791133695462e-07, "loss": 0.0477, "reward": 1.4363772869110107, "reward_std": 0.09710687398910522, "rewards/accuracy_reward_stage2": 0.6863773465156555, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 654 }, { "completion_length": 9.78125, "epoch": 0.11477133345014894, "grad_norm": 19.270036996457037, "kl": 0.08251953125, "learning_rate": 8.854038899596987e-07, "loss": 0.0331, "reward": 1.6022714376449585, "reward_std": 0.29964011907577515, "rewards/accuracy_reward_stage2": 0.6022714376449585, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 655 }, { "completion_length": 10.859375, "epoch": 0.1149465568599965, "grad_norm": 26.189268834179053, "kl": 0.052978515625, "learning_rate": 8.85228666549851e-07, "loss": -0.014, "reward": 1.5754039287567139, "reward_std": 0.25072386860847473, "rewards/accuracy_reward_stage2": 0.6066538095474243, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 656 }, { "completion_length": 18.671875, "epoch": 0.11512178026984406, "grad_norm": 22.108350234072944, "kl": 0.015625, "learning_rate": 8.850534431400035e-07, "loss": 0.0062, "reward": 1.5428324937820435, "reward_std": 0.25868257880210876, "rewards/accuracy_reward_stage2": 0.5428324937820435, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 657 }, { "completion_length": 10.1875, "epoch": 0.1152970036796916, "grad_norm": 16.81031071335677, "kl": 0.06640625, "learning_rate": 8.848782197301558e-07, "loss": 0.0265, "reward": 1.6095609664916992, "reward_std": 0.1511930674314499, "rewards/accuracy_reward_stage2": 0.609561026096344, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 658 }, { "completion_length": 18.171875, "epoch": 0.11547222708953916, "grad_norm": 23.544019291161536, "kl": 0.33203125, "learning_rate": 8.847029963203083e-07, "loss": 0.1328, "reward": 1.2967886924743652, "reward_std": 0.24356095492839813, "rewards/accuracy_reward_stage2": 0.42178869247436523, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 659 }, { "completion_length": 9.46875, "epoch": 0.11564745049938671, "grad_norm": 20.610555296149347, "kl": 0.045166015625, "learning_rate": 8.845277729104608e-07, "loss": 0.0181, "reward": 1.5369362831115723, "reward_std": 0.19677358865737915, "rewards/accuracy_reward_stage2": 0.5369362235069275, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 660 }, { "completion_length": 11.578125, "epoch": 0.11582267390923427, "grad_norm": 23.473030243653238, "kl": 0.09375, "learning_rate": 8.843525495006132e-07, "loss": -0.0066, "reward": 1.5962979793548584, "reward_std": 0.24861329793930054, "rewards/accuracy_reward_stage2": 0.6119229793548584, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 661 }, { "completion_length": 11.921875, "epoch": 0.11599789731908183, "grad_norm": 33.93562617380281, "kl": 0.08251953125, "learning_rate": 8.841773260907657e-07, "loss": 0.0329, "reward": 1.5833332538604736, "reward_std": 0.212066650390625, "rewards/accuracy_reward_stage2": 0.5833332538604736, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 662 }, { "completion_length": 9.84375, "epoch": 0.11617312072892938, "grad_norm": 26.082326069186735, "kl": 0.06396484375, "learning_rate": 8.840021026809182e-07, "loss": -0.0278, "reward": 1.449662446975708, "reward_std": 0.22050856053829193, "rewards/accuracy_reward_stage2": 0.48091232776641846, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 663 }, { "completion_length": 11.359375, "epoch": 0.11634834413877694, "grad_norm": 20.736667075932573, "kl": 0.05029296875, "learning_rate": 8.838268792710706e-07, "loss": 0.0201, "reward": 1.7296762466430664, "reward_std": 0.19434456527233124, "rewards/accuracy_reward_stage2": 0.7296761870384216, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 664 }, { "completion_length": 15.875, "epoch": 0.1165235675486245, "grad_norm": 16.50673791343758, "kl": 0.060791015625, "learning_rate": 8.83651655861223e-07, "loss": 0.0243, "reward": 1.299643635749817, "reward_std": 0.21787844598293304, "rewards/accuracy_reward_stage2": 0.2996436357498169, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 665 }, { "completion_length": 18.34375, "epoch": 0.11669879095847205, "grad_norm": 22.42753279935099, "kl": 0.416015625, "learning_rate": 8.834764324513754e-07, "loss": 0.1658, "reward": 1.5624425411224365, "reward_std": 0.16296470165252686, "rewards/accuracy_reward_stage2": 0.6874425411224365, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 666 }, { "completion_length": 12.671875, "epoch": 0.11687401436831961, "grad_norm": 21.038441282610368, "kl": 0.0615234375, "learning_rate": 8.833012090415279e-07, "loss": 0.0246, "reward": 1.6154024600982666, "reward_std": 0.28916823863983154, "rewards/accuracy_reward_stage2": 0.7404024004936218, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 667 }, { "completion_length": 9.609375, "epoch": 0.11704923777816717, "grad_norm": 30.071927015193936, "kl": 0.08447265625, "learning_rate": 8.831259856316804e-07, "loss": 0.0338, "reward": 1.4298069477081299, "reward_std": 0.21128447353839874, "rewards/accuracy_reward_stage2": 0.4298068881034851, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 668 }, { "completion_length": 7.453125, "epoch": 0.11722446118801472, "grad_norm": 21.105957298891614, "kl": 0.06494140625, "learning_rate": 8.829507622218328e-07, "loss": -0.0181, "reward": 1.3958759307861328, "reward_std": 0.17537108063697815, "rewards/accuracy_reward_stage2": 0.5365009903907776, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 669 }, { "completion_length": 10.46875, "epoch": 0.11739968459786228, "grad_norm": 18.960024727042775, "kl": 0.056884765625, "learning_rate": 8.827755388119852e-07, "loss": -0.0656, "reward": 1.4355816841125488, "reward_std": 0.23458895087242126, "rewards/accuracy_reward_stage2": 0.46683168411254883, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 670 }, { "completion_length": 37.203125, "epoch": 0.11757490800770982, "grad_norm": 56.1089179681269, "kl": 0.3671875, "learning_rate": 8.826003154021377e-07, "loss": 0.1468, "reward": 1.473738193511963, "reward_std": 0.2174667865037918, "rewards/accuracy_reward_stage2": 0.5987382531166077, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 671 }, { "completion_length": 7.15625, "epoch": 0.11775013141755739, "grad_norm": 13.451270267464256, "kl": 0.11474609375, "learning_rate": 8.824250919922901e-07, "loss": 0.0459, "reward": 1.5743929147720337, "reward_std": 0.10862401127815247, "rewards/accuracy_reward_stage2": 0.6993929147720337, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 672 }, { "completion_length": 20.109375, "epoch": 0.11792535482740495, "grad_norm": 26.849572474111802, "kl": 0.1298828125, "learning_rate": 8.822498685824426e-07, "loss": 0.0186, "reward": 1.508543610572815, "reward_std": 0.21059757471084595, "rewards/accuracy_reward_stage2": 0.6491686105728149, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 673 }, { "completion_length": 10.328125, "epoch": 0.1181005782372525, "grad_norm": 17.31888261104493, "kl": 0.03564453125, "learning_rate": 8.82074645172595e-07, "loss": 0.0143, "reward": 1.3142361640930176, "reward_std": 0.1992053985595703, "rewards/accuracy_reward_stage2": 0.3142361044883728, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 674 }, { "completion_length": 13.3125, "epoch": 0.11827580164710005, "grad_norm": 21.049934078158824, "kl": 0.140625, "learning_rate": 8.818994217627474e-07, "loss": 0.0562, "reward": 1.5868923664093018, "reward_std": 0.19813916087150574, "rewards/accuracy_reward_stage2": 0.586892306804657, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 675 }, { "completion_length": 15.25, "epoch": 0.11845102505694761, "grad_norm": 67.66555012601259, "kl": 0.052001953125, "learning_rate": 8.817241983528999e-07, "loss": 0.0209, "reward": 1.648368000984192, "reward_std": 0.13681325316429138, "rewards/accuracy_reward_stage2": 0.6483679413795471, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 676 }, { "completion_length": 8.625, "epoch": 0.11862624846679516, "grad_norm": 23.914579904623068, "kl": 0.0274658203125, "learning_rate": 8.815489749430523e-07, "loss": 0.011, "reward": 1.7271525859832764, "reward_std": 0.29088348150253296, "rewards/accuracy_reward_stage2": 0.7271526455879211, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 677 }, { "completion_length": 10.359375, "epoch": 0.11880147187664272, "grad_norm": 23.544438238575456, "kl": 0.06494140625, "learning_rate": 8.813737515332048e-07, "loss": -0.0126, "reward": 1.7988197803497314, "reward_std": 0.2707360088825226, "rewards/accuracy_reward_stage2": 0.8144446611404419, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 678 }, { "completion_length": 13.609375, "epoch": 0.11897669528649027, "grad_norm": 24.38470438267223, "kl": 0.06884765625, "learning_rate": 8.811985281233573e-07, "loss": -0.0755, "reward": 1.5430048704147339, "reward_std": 0.2995755672454834, "rewards/accuracy_reward_stage2": 0.5898798108100891, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 679 }, { "completion_length": 20.3125, "epoch": 0.11915191869633783, "grad_norm": 20.877247111687545, "kl": 0.55078125, "learning_rate": 8.810233047135097e-07, "loss": 0.2208, "reward": 1.4461277723312378, "reward_std": 0.10148172080516815, "rewards/accuracy_reward_stage2": 0.5711277723312378, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 680 }, { "completion_length": 12.59375, "epoch": 0.11932714210618539, "grad_norm": 53.43297706665784, "kl": 0.07080078125, "learning_rate": 8.808480813036622e-07, "loss": 0.0284, "reward": 1.6597884893417358, "reward_std": 0.29190492630004883, "rewards/accuracy_reward_stage2": 0.6597884893417358, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 681 }, { "completion_length": 14.15625, "epoch": 0.11950236551603294, "grad_norm": 11.011832755978508, "kl": 0.047119140625, "learning_rate": 8.806728578938146e-07, "loss": 0.0188, "reward": 1.5102589130401611, "reward_std": 0.10205793380737305, "rewards/accuracy_reward_stage2": 0.5102588534355164, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 682 }, { "completion_length": 9.640625, "epoch": 0.1196775889258805, "grad_norm": 25.308451607783756, "kl": 0.0859375, "learning_rate": 8.80497634483967e-07, "loss": -0.0043, "reward": 1.5487972497940063, "reward_std": 0.15885029733181, "rewards/accuracy_reward_stage2": 0.5800472497940063, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 683 }, { "completion_length": 22.40625, "epoch": 0.11985281233572806, "grad_norm": 25.29527051256025, "kl": 0.031982421875, "learning_rate": 8.803224110741195e-07, "loss": 0.0129, "reward": 1.7597458362579346, "reward_std": 0.18770715594291687, "rewards/accuracy_reward_stage2": 0.7597458362579346, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 684 }, { "completion_length": 29.03125, "epoch": 0.1200280357455756, "grad_norm": 86.19787745150437, "kl": 0.236328125, "learning_rate": 8.801471876642718e-07, "loss": 0.0505, "reward": 1.5671895742416382, "reward_std": 0.1661956012248993, "rewards/accuracy_reward_stage2": 0.7078145742416382, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 685 }, { "completion_length": 6.984375, "epoch": 0.12020325915542317, "grad_norm": 13.60605571568435, "kl": 0.07177734375, "learning_rate": 8.799719642544243e-07, "loss": 0.0288, "reward": 1.4552290439605713, "reward_std": 0.16398081183433533, "rewards/accuracy_reward_stage2": 0.4552290439605713, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 686 }, { "completion_length": 10.390625, "epoch": 0.12037848256527071, "grad_norm": 25.567410027834242, "kl": 0.015380859375, "learning_rate": 8.797967408445768e-07, "loss": 0.0062, "reward": 1.7256697416305542, "reward_std": 0.20889979600906372, "rewards/accuracy_reward_stage2": 0.7256697416305542, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 687 }, { "completion_length": 8.640625, "epoch": 0.12055370597511827, "grad_norm": 16.393250950051225, "kl": 0.038818359375, "learning_rate": 8.796215174347292e-07, "loss": 0.0156, "reward": 1.6450669765472412, "reward_std": 0.09931539744138718, "rewards/accuracy_reward_stage2": 0.6450668573379517, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 688 }, { "completion_length": 19.53125, "epoch": 0.12072892938496584, "grad_norm": 20.266352703794034, "kl": 0.0556640625, "learning_rate": 8.794462940248817e-07, "loss": -0.0219, "reward": 1.4606654644012451, "reward_std": 0.3459789752960205, "rewards/accuracy_reward_stage2": 0.47629040479660034, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 689 }, { "completion_length": 12.0625, "epoch": 0.12090415279481338, "grad_norm": 45.55200569986124, "kl": 0.322265625, "learning_rate": 8.792710706150341e-07, "loss": 0.1292, "reward": 1.5311663150787354, "reward_std": 0.19967830181121826, "rewards/accuracy_reward_stage2": 0.6561661958694458, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 690 }, { "completion_length": 10.890625, "epoch": 0.12107937620466094, "grad_norm": 19.922042543972633, "kl": 0.0703125, "learning_rate": 8.790958472051866e-07, "loss": 0.0281, "reward": 1.6214659214019775, "reward_std": 0.17283859848976135, "rewards/accuracy_reward_stage2": 0.6214658617973328, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 691 }, { "completion_length": 22.9375, "epoch": 0.1212545996145085, "grad_norm": 19.99328887677077, "kl": 0.046142578125, "learning_rate": 8.789206237953391e-07, "loss": 0.0185, "reward": 1.484812617301941, "reward_std": 0.16919946670532227, "rewards/accuracy_reward_stage2": 0.48481255769729614, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 692 }, { "completion_length": 8.171875, "epoch": 0.12142982302435605, "grad_norm": 24.11697341254596, "kl": 0.0247802734375, "learning_rate": 8.787454003854915e-07, "loss": 0.0099, "reward": 1.6628926992416382, "reward_std": 0.2817220687866211, "rewards/accuracy_reward_stage2": 0.6628926992416382, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 693 }, { "completion_length": 14.6875, "epoch": 0.12160504643420361, "grad_norm": 55.143918136953324, "kl": 0.625, "learning_rate": 8.78570176975644e-07, "loss": 0.1772, "reward": 1.2694811820983887, "reward_std": 0.259592741727829, "rewards/accuracy_reward_stage2": 0.41010621190071106, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 694 }, { "completion_length": 14.734375, "epoch": 0.12178026984405116, "grad_norm": 21.688875450489633, "kl": 0.03564453125, "learning_rate": 8.783949535657964e-07, "loss": 0.0143, "reward": 1.6681108474731445, "reward_std": 0.13850000500679016, "rewards/accuracy_reward_stage2": 0.6681109666824341, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 695 }, { "completion_length": 9.453125, "epoch": 0.12195549325389872, "grad_norm": 20.260297289156117, "kl": 0.0986328125, "learning_rate": 8.782197301559487e-07, "loss": -0.0152, "reward": 1.6024032831192017, "reward_std": 0.26473379135131836, "rewards/accuracy_reward_stage2": 0.6336532831192017, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 696 }, { "completion_length": 13.5625, "epoch": 0.12213071666374628, "grad_norm": 52.401001224493655, "kl": 0.5, "learning_rate": 8.780445067461012e-07, "loss": 0.124, "reward": 1.285620927810669, "reward_std": 0.34111350774765015, "rewards/accuracy_reward_stage2": 0.44187092781066895, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 697 }, { "completion_length": 17.609375, "epoch": 0.12230594007359383, "grad_norm": 20.79419101396458, "kl": 0.23828125, "learning_rate": 8.778692833362536e-07, "loss": 0.0295, "reward": 1.3721497058868408, "reward_std": 0.11534123867750168, "rewards/accuracy_reward_stage2": 0.5283997058868408, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 698 }, { "completion_length": 19.265625, "epoch": 0.12248116348344139, "grad_norm": 16.840847107451385, "kl": 0.016357421875, "learning_rate": 8.776940599264061e-07, "loss": -0.0376, "reward": 1.3428521156311035, "reward_std": 0.12679797410964966, "rewards/accuracy_reward_stage2": 0.3584771156311035, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 699 }, { "completion_length": 7.390625, "epoch": 0.12265638689328895, "grad_norm": 10.71083439910046, "kl": 0.01177978515625, "learning_rate": 8.775188365165586e-07, "loss": -0.0395, "reward": 1.375, "reward_std": 0.0883883461356163, "rewards/accuracy_reward_stage2": 0.390625, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 700 }, { "completion_length": 23.359375, "epoch": 0.1228316103031365, "grad_norm": 127.53890039782576, "kl": 0.451171875, "learning_rate": 8.77343613106711e-07, "loss": 0.1522, "reward": 1.495405912399292, "reward_std": 0.22437676787376404, "rewards/accuracy_reward_stage2": 0.6360308527946472, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 701 }, { "completion_length": 11.90625, "epoch": 0.12300683371298406, "grad_norm": 22.438948285025383, "kl": 0.0966796875, "learning_rate": 8.771683896968635e-07, "loss": -0.0445, "reward": 1.5524406433105469, "reward_std": 0.2997596263885498, "rewards/accuracy_reward_stage2": 0.5836907029151917, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 702 }, { "completion_length": 13.21875, "epoch": 0.1231820571228316, "grad_norm": 19.987582006618496, "kl": 0.044921875, "learning_rate": 8.76993166287016e-07, "loss": -0.0704, "reward": 1.6021901369094849, "reward_std": 0.3515224754810333, "rewards/accuracy_reward_stage2": 0.6334401369094849, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 703 }, { "completion_length": 17.109375, "epoch": 0.12335728053267916, "grad_norm": 23.503168971994356, "kl": 0.06396484375, "learning_rate": 8.768179428771684e-07, "loss": 0.0255, "reward": 1.6832342147827148, "reward_std": 0.20066285133361816, "rewards/accuracy_reward_stage2": 0.6832343339920044, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 704 }, { "completion_length": 12.6875, "epoch": 0.12353250394252673, "grad_norm": 19.273611616412307, "kl": 0.0869140625, "learning_rate": 8.766427194673208e-07, "loss": -0.0095, "reward": 1.6391098499298096, "reward_std": 0.2224484533071518, "rewards/accuracy_reward_stage2": 0.6547348499298096, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 705 }, { "completion_length": 7.671875, "epoch": 0.12370772735237427, "grad_norm": 18.929037623227902, "kl": 0.05419921875, "learning_rate": 8.764674960574732e-07, "loss": -0.0225, "reward": 1.328125, "reward_std": 0.1530819833278656, "rewards/accuracy_reward_stage2": 0.46875, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 706 }, { "completion_length": 13.21875, "epoch": 0.12388295076222183, "grad_norm": 13.248950555078183, "kl": 0.0189208984375, "learning_rate": 8.762922726476257e-07, "loss": 0.0076, "reward": 1.7340033054351807, "reward_std": 0.10389992594718933, "rewards/accuracy_reward_stage2": 0.7340033054351807, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 707 }, { "completion_length": 20.359375, "epoch": 0.1240581741720694, "grad_norm": 27.180274349507958, "kl": 0.2197265625, "learning_rate": 8.761170492377782e-07, "loss": 0.0879, "reward": 1.3913912773132324, "reward_std": 0.165449857711792, "rewards/accuracy_reward_stage2": 0.5163911581039429, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 708 }, { "completion_length": 21.78125, "epoch": 0.12423339758191694, "grad_norm": 58.96739621642977, "kl": 0.435546875, "learning_rate": 8.759418258279305e-07, "loss": 0.1303, "reward": 1.2916667461395264, "reward_std": 0.2051776796579361, "rewards/accuracy_reward_stage2": 0.5572916865348816, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 709 }, { "completion_length": 8.546875, "epoch": 0.1244086209917645, "grad_norm": 25.120922125351125, "kl": 0.08447265625, "learning_rate": 8.75766602418083e-07, "loss": -0.0181, "reward": 1.6080281734466553, "reward_std": 0.3198818564414978, "rewards/accuracy_reward_stage2": 0.6392781734466553, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 710 }, { "completion_length": 8.734375, "epoch": 0.12458384440161206, "grad_norm": 28.15242862624003, "kl": 0.1923828125, "learning_rate": 8.755913790082355e-07, "loss": 0.0328, "reward": 1.4423253536224365, "reward_std": 0.17335930466651917, "rewards/accuracy_reward_stage2": 0.45795029401779175, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 711 }, { "completion_length": 10.90625, "epoch": 0.12475906781145961, "grad_norm": 21.76944181859614, "kl": 0.10009765625, "learning_rate": 8.754161555983879e-07, "loss": 0.0401, "reward": 1.505290150642395, "reward_std": 0.2847074270248413, "rewards/accuracy_reward_stage2": 0.5052902102470398, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 712 }, { "completion_length": 9.0625, "epoch": 0.12493429122130717, "grad_norm": 18.094072371451865, "kl": 0.0625, "learning_rate": 8.752409321885404e-07, "loss": -0.0187, "reward": 1.6135753393173218, "reward_std": 0.1741529405117035, "rewards/accuracy_reward_stage2": 0.6292003393173218, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 713 }, { "completion_length": 8.609375, "epoch": 0.12510951463115472, "grad_norm": 15.344482387105208, "kl": 0.1025390625, "learning_rate": 8.750657087786927e-07, "loss": 0.0411, "reward": 1.7685964107513428, "reward_std": 0.06149989739060402, "rewards/accuracy_reward_stage2": 0.768596351146698, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 714 }, { "completion_length": 12.0625, "epoch": 0.1252847380410023, "grad_norm": 62.719821119851055, "kl": 0.0654296875, "learning_rate": 8.748904853688452e-07, "loss": 0.0262, "reward": 1.5387461185455322, "reward_std": 0.22166486084461212, "rewards/accuracy_reward_stage2": 0.5387461185455322, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 715 }, { "completion_length": 8.90625, "epoch": 0.12545996145084984, "grad_norm": 22.598417730239905, "kl": 0.08203125, "learning_rate": 8.747152619589977e-07, "loss": -0.043, "reward": 1.7066401243209839, "reward_std": 0.23297566175460815, "rewards/accuracy_reward_stage2": 0.8628901839256287, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 716 }, { "completion_length": 13.8125, "epoch": 0.12563518486069739, "grad_norm": 19.205609822610867, "kl": 0.1552734375, "learning_rate": 8.745400385491501e-07, "loss": 0.062, "reward": 1.5689747333526611, "reward_std": 0.11873051524162292, "rewards/accuracy_reward_stage2": 0.6939746737480164, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 717 }, { "completion_length": 17.75, "epoch": 0.12581040827054493, "grad_norm": 24.131101589429424, "kl": 0.0277099609375, "learning_rate": 8.743648151393026e-07, "loss": 0.0111, "reward": 1.6367136240005493, "reward_std": 0.21164877712726593, "rewards/accuracy_reward_stage2": 0.6367136240005493, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 718 }, { "completion_length": 12.671875, "epoch": 0.1259856316803925, "grad_norm": 18.986879097817095, "kl": 0.16015625, "learning_rate": 8.741895917294551e-07, "loss": 0.037, "reward": 1.5682744979858398, "reward_std": 0.16256004571914673, "rewards/accuracy_reward_stage2": 0.7088994383811951, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 719 }, { "completion_length": 26.125, "epoch": 0.12616085509024005, "grad_norm": 14.897658742516182, "kl": 0.033203125, "learning_rate": 8.740143683196075e-07, "loss": -0.0309, "reward": 1.5009183883666992, "reward_std": 0.135984867811203, "rewards/accuracy_reward_stage2": 0.516543447971344, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 720 }, { "completion_length": 14.15625, "epoch": 0.1263360785000876, "grad_norm": 1887.643603829332, "kl": 6.03125, "learning_rate": 8.738391449097599e-07, "loss": 2.4267, "reward": 1.3118016719818115, "reward_std": 0.15657545626163483, "rewards/accuracy_reward_stage2": 0.4368016719818115, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 721 }, { "completion_length": 8.875, "epoch": 0.12651130190993518, "grad_norm": 15.99578727889066, "kl": 0.07373046875, "learning_rate": 8.736639214999123e-07, "loss": -0.0463, "reward": 1.6581439971923828, "reward_std": 0.31767192482948303, "rewards/accuracy_reward_stage2": 0.689393937587738, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 722 }, { "completion_length": 10.125, "epoch": 0.12668652531978272, "grad_norm": 21.204954483400815, "kl": 0.11328125, "learning_rate": 8.734886980900648e-07, "loss": -0.0853, "reward": 1.407235860824585, "reward_std": 0.3323286771774292, "rewards/accuracy_reward_stage2": 0.45411089062690735, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 723 }, { "completion_length": 14.296875, "epoch": 0.12686174872963027, "grad_norm": 22.800784257270934, "kl": 0.3984375, "learning_rate": 8.733134746802173e-07, "loss": 0.1378, "reward": 1.3840141296386719, "reward_std": 0.2325722724199295, "rewards/accuracy_reward_stage2": 0.5246391296386719, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 724 }, { "completion_length": 8.890625, "epoch": 0.12703697213947784, "grad_norm": 20.381938458516128, "kl": 0.043701171875, "learning_rate": 8.731382512703696e-07, "loss": 0.0175, "reward": 1.5718777179718018, "reward_std": 0.21215331554412842, "rewards/accuracy_reward_stage2": 0.5718777179718018, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 725 }, { "completion_length": 8.421875, "epoch": 0.1272121955493254, "grad_norm": 21.793579301173956, "kl": 0.080078125, "learning_rate": 8.729630278605221e-07, "loss": 0.0321, "reward": 1.6613794565200806, "reward_std": 0.3252708613872528, "rewards/accuracy_reward_stage2": 0.6613793969154358, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 726 }, { "completion_length": 14.09375, "epoch": 0.12738741895917294, "grad_norm": 23.380397758448332, "kl": 0.0673828125, "learning_rate": 8.727878044506745e-07, "loss": -0.0613, "reward": 1.5214309692382812, "reward_std": 0.2900305986404419, "rewards/accuracy_reward_stage2": 0.5526810884475708, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 727 }, { "completion_length": 8.203125, "epoch": 0.1275626423690205, "grad_norm": 18.344928650612594, "kl": 0.11767578125, "learning_rate": 8.72612581040827e-07, "loss": 0.0471, "reward": 1.5166375637054443, "reward_std": 0.23853465914726257, "rewards/accuracy_reward_stage2": 0.7666375041007996, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 728 }, { "completion_length": 13.75, "epoch": 0.12773786577886806, "grad_norm": 11.786925146699105, "kl": 0.039306640625, "learning_rate": 8.724373576309795e-07, "loss": -0.0644, "reward": 1.3146369457244873, "reward_std": 0.16122567653656006, "rewards/accuracy_reward_stage2": 0.4708869457244873, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 729 }, { "completion_length": 10.59375, "epoch": 0.1279130891887156, "grad_norm": 19.45807361148035, "kl": 0.09423828125, "learning_rate": 8.722621342211319e-07, "loss": 0.0378, "reward": 1.5389803647994995, "reward_std": 0.22324511408805847, "rewards/accuracy_reward_stage2": 0.6639803647994995, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 730 }, { "completion_length": 13.296875, "epoch": 0.12808831259856318, "grad_norm": 10.960043287191809, "kl": 0.33203125, "learning_rate": 8.720869108112844e-07, "loss": 0.133, "reward": 1.5178592205047607, "reward_std": 0.08258861303329468, "rewards/accuracy_reward_stage2": 0.6428592205047607, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 731 }, { "completion_length": 12.515625, "epoch": 0.12826353600841073, "grad_norm": 17.782479377656646, "kl": 0.043212890625, "learning_rate": 8.719116874014369e-07, "loss": -0.0269, "reward": 1.580472707748413, "reward_std": 0.23168525099754333, "rewards/accuracy_reward_stage2": 0.5960977673530579, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 732 }, { "completion_length": 8.3125, "epoch": 0.12843875941825827, "grad_norm": 25.95392845141862, "kl": 0.0732421875, "learning_rate": 8.717364639915893e-07, "loss": -0.0464, "reward": 1.5123786926269531, "reward_std": 0.2772209644317627, "rewards/accuracy_reward_stage2": 0.5436286926269531, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 733 }, { "completion_length": 23.703125, "epoch": 0.12861398282810582, "grad_norm": 38.20590883091491, "kl": 0.609375, "learning_rate": 8.715612405817416e-07, "loss": 0.2438, "reward": 1.48157799243927, "reward_std": 0.26270562410354614, "rewards/accuracy_reward_stage2": 0.73157799243927, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 734 }, { "completion_length": 12.46875, "epoch": 0.1287892062379534, "grad_norm": 24.100699456688726, "kl": 0.0908203125, "learning_rate": 8.71386017171894e-07, "loss": 0.0364, "reward": 1.5929126739501953, "reward_std": 0.2643412947654724, "rewards/accuracy_reward_stage2": 0.5929126739501953, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 735 }, { "completion_length": 9.703125, "epoch": 0.12896442964780094, "grad_norm": 10.832855482198559, "kl": 0.016845703125, "learning_rate": 8.712107937620465e-07, "loss": -0.0879, "reward": 1.484375, "reward_std": 0.13258251547813416, "rewards/accuracy_reward_stage2": 0.53125, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 736 }, { "completion_length": 14.765625, "epoch": 0.1291396530576485, "grad_norm": 17.12707163097698, "kl": 0.2177734375, "learning_rate": 8.71035570352199e-07, "loss": 0.0871, "reward": 1.5152562856674194, "reward_std": 0.11481408774852753, "rewards/accuracy_reward_stage2": 0.7652561664581299, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 737 }, { "completion_length": 10.296875, "epoch": 0.12931487646749606, "grad_norm": 18.785523386499325, "kl": 0.095703125, "learning_rate": 8.708603469423514e-07, "loss": 0.0382, "reward": 1.8014370203018188, "reward_std": 0.17660526931285858, "rewards/accuracy_reward_stage2": 0.8014370203018188, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 738 }, { "completion_length": 13.875, "epoch": 0.1294900998773436, "grad_norm": 24.0749346047583, "kl": 0.09521484375, "learning_rate": 8.706851235325039e-07, "loss": 0.0381, "reward": 1.5514681339263916, "reward_std": 0.2884979546070099, "rewards/accuracy_reward_stage2": 0.5514680743217468, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 739 }, { "completion_length": 13.0, "epoch": 0.12966532328719116, "grad_norm": 23.044753771481854, "kl": 0.09228515625, "learning_rate": 8.705099001226564e-07, "loss": 0.037, "reward": 1.5986329317092896, "reward_std": 0.29432013630867004, "rewards/accuracy_reward_stage2": 0.7236329317092896, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 740 }, { "completion_length": 8.84375, "epoch": 0.12984054669703873, "grad_norm": 28.607090780035833, "kl": 0.018798828125, "learning_rate": 8.703346767128088e-07, "loss": 0.0075, "reward": 1.4213995933532715, "reward_std": 0.2705962657928467, "rewards/accuracy_reward_stage2": 0.42139962315559387, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 741 }, { "completion_length": 8.078125, "epoch": 0.13001577010688628, "grad_norm": 22.206143566559955, "kl": 0.1025390625, "learning_rate": 8.701594533029613e-07, "loss": -0.0031, "reward": 1.5844957828521729, "reward_std": 0.2147214710712433, "rewards/accuracy_reward_stage2": 0.7251207828521729, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 742 }, { "completion_length": 14.46875, "epoch": 0.13019099351673383, "grad_norm": 23.76171747296305, "kl": 0.1259765625, "learning_rate": 8.699842298931137e-07, "loss": 0.0081, "reward": 1.5806400775909424, "reward_std": 0.20044711232185364, "rewards/accuracy_reward_stage2": 0.7212650775909424, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 743 }, { "completion_length": 11.28125, "epoch": 0.1303662169265814, "grad_norm": 26.143393613825847, "kl": 0.03759765625, "learning_rate": 8.698090064832662e-07, "loss": 0.0151, "reward": 1.4961230754852295, "reward_std": 0.25255924463272095, "rewards/accuracy_reward_stage2": 0.4961230754852295, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 744 }, { "completion_length": 8.46875, "epoch": 0.13054144033642895, "grad_norm": 31.395400062191023, "kl": 0.05419921875, "learning_rate": 8.696337830734186e-07, "loss": 0.0217, "reward": 1.7549707889556885, "reward_std": 0.2908035218715668, "rewards/accuracy_reward_stage2": 0.7549707293510437, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 745 }, { "completion_length": 7.0, "epoch": 0.1307166637462765, "grad_norm": 19.43893579940994, "kl": 0.146484375, "learning_rate": 8.69458559663571e-07, "loss": 0.0585, "reward": 1.581559658050537, "reward_std": 0.23520039021968842, "rewards/accuracy_reward_stage2": 0.5815596580505371, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 746 }, { "completion_length": 43.46875, "epoch": 0.13089188715612407, "grad_norm": 17.920030287701124, "kl": 0.1015625, "learning_rate": 8.692833362537234e-07, "loss": 0.0406, "reward": 1.3412933349609375, "reward_std": 0.1644451916217804, "rewards/accuracy_reward_stage2": 0.4662933945655823, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 747 }, { "completion_length": 7.03125, "epoch": 0.13106711056597162, "grad_norm": 20.164146416527387, "kl": 0.02294921875, "learning_rate": 8.691081128438759e-07, "loss": -0.0242, "reward": 1.5950117111206055, "reward_std": 0.22564148902893066, "rewards/accuracy_reward_stage2": 0.6106366515159607, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 748 }, { "completion_length": 13.671875, "epoch": 0.13124233397581916, "grad_norm": 24.397572687906287, "kl": 0.11767578125, "learning_rate": 8.689328894340283e-07, "loss": 0.0096, "reward": 1.5717556476593018, "reward_std": 0.2309304177761078, "rewards/accuracy_reward_stage2": 0.5873807072639465, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 749 }, { "completion_length": 10.28125, "epoch": 0.13141755738566674, "grad_norm": 18.586354742955837, "kl": 0.057861328125, "learning_rate": 8.687576660241808e-07, "loss": -0.0064, "reward": 1.49538254737854, "reward_std": 0.20117239654064178, "rewards/accuracy_reward_stage2": 0.6360074877738953, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 750 }, { "completion_length": 6.875, "epoch": 0.13159278079551429, "grad_norm": 18.772365600988905, "kl": 0.0546875, "learning_rate": 8.685824426143332e-07, "loss": 0.0219, "reward": 1.5222609043121338, "reward_std": 0.1611718237400055, "rewards/accuracy_reward_stage2": 0.647260844707489, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 751 }, { "completion_length": 11.5, "epoch": 0.13176800420536183, "grad_norm": 26.936451249334294, "kl": 0.150390625, "learning_rate": 8.684072192044857e-07, "loss": 0.0601, "reward": 1.3048322200775146, "reward_std": 0.27304765582084656, "rewards/accuracy_reward_stage2": 0.4298322796821594, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 752 }, { "completion_length": 6.921875, "epoch": 0.13194322761520938, "grad_norm": 13.725810132685641, "kl": 0.01495361328125, "learning_rate": 8.682319957946382e-07, "loss": 0.006, "reward": 1.6228388547897339, "reward_std": 0.0823933333158493, "rewards/accuracy_reward_stage2": 0.6228388547897339, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 753 }, { "completion_length": 12.28125, "epoch": 0.13211845102505695, "grad_norm": 17.3231009586425, "kl": 0.095703125, "learning_rate": 8.680567723847905e-07, "loss": 0.0383, "reward": 1.5200889110565186, "reward_std": 0.1671619862318039, "rewards/accuracy_reward_stage2": 0.5200889110565186, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 754 }, { "completion_length": 9.15625, "epoch": 0.1322936744349045, "grad_norm": 21.947823238942373, "kl": 0.123046875, "learning_rate": 8.67881548974943e-07, "loss": 0.0494, "reward": 1.4984718561172485, "reward_std": 0.1990078091621399, "rewards/accuracy_reward_stage2": 0.6234718561172485, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 755 }, { "completion_length": 7.484375, "epoch": 0.13246889784475205, "grad_norm": 20.55658085463294, "kl": 0.068359375, "learning_rate": 8.677063255650955e-07, "loss": 0.0273, "reward": 1.631592035293579, "reward_std": 0.20693038403987885, "rewards/accuracy_reward_stage2": 0.6315920352935791, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 756 }, { "completion_length": 13.109375, "epoch": 0.13264412125459962, "grad_norm": 895.2476838263423, "kl": 3.9375, "learning_rate": 8.675311021552479e-07, "loss": 1.5671, "reward": 1.2633342742919922, "reward_std": 0.12322809547185898, "rewards/accuracy_reward_stage2": 0.3883342742919922, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 757 }, { "completion_length": 9.109375, "epoch": 0.13281934466444717, "grad_norm": 15.094495405516248, "kl": 0.03759765625, "learning_rate": 8.673558787454004e-07, "loss": 0.015, "reward": 1.5943365097045898, "reward_std": 0.07661169767379761, "rewards/accuracy_reward_stage2": 0.5943365693092346, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 758 }, { "completion_length": 11.546875, "epoch": 0.13299456807429472, "grad_norm": 19.67056194967656, "kl": 0.050537109375, "learning_rate": 8.671806553355527e-07, "loss": 0.0202, "reward": 1.7037529945373535, "reward_std": 0.22206325829029083, "rewards/accuracy_reward_stage2": 0.7037530541419983, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 759 }, { "completion_length": 8.421875, "epoch": 0.1331697914841423, "grad_norm": 24.23192740150212, "kl": 0.083984375, "learning_rate": 8.670054319257052e-07, "loss": 0.0047, "reward": 1.543332815170288, "reward_std": 0.26957929134368896, "rewards/accuracy_reward_stage2": 0.5589578151702881, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 760 }, { "completion_length": 11.3125, "epoch": 0.13334501489398984, "grad_norm": 23.52029972754725, "kl": 0.09130859375, "learning_rate": 8.668302085158577e-07, "loss": 0.0367, "reward": 1.6756335496902466, "reward_std": 0.1896108090877533, "rewards/accuracy_reward_stage2": 0.6756335496902466, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 761 }, { "completion_length": 8.640625, "epoch": 0.13352023830383739, "grad_norm": 19.93103272604443, "kl": 0.10498046875, "learning_rate": 8.666549851060101e-07, "loss": -0.0247, "reward": 1.2811100482940674, "reward_std": 0.27886366844177246, "rewards/accuracy_reward_stage2": 0.31236010789871216, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 762 }, { "completion_length": 8.765625, "epoch": 0.13369546171368496, "grad_norm": 28.2057588361612, "kl": 0.20703125, "learning_rate": 8.664797616961626e-07, "loss": 0.083, "reward": 1.4688949584960938, "reward_std": 0.1758657693862915, "rewards/accuracy_reward_stage2": 0.5938950181007385, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 763 }, { "completion_length": 15.296875, "epoch": 0.1338706851235325, "grad_norm": 28.10233930198314, "kl": 0.11328125, "learning_rate": 8.66304538286315e-07, "loss": -0.0117, "reward": 1.5274418592453003, "reward_std": 0.35585153102874756, "rewards/accuracy_reward_stage2": 0.5586917996406555, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 764 }, { "completion_length": 12.21875, "epoch": 0.13404590853338005, "grad_norm": 22.15661040647775, "kl": 0.03173828125, "learning_rate": 8.661293148764674e-07, "loss": 0.0127, "reward": 1.6838589906692505, "reward_std": 0.27672165632247925, "rewards/accuracy_reward_stage2": 0.6838589310646057, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 765 }, { "completion_length": 19.125, "epoch": 0.13422113194322763, "grad_norm": 191.1697869554803, "kl": 0.053955078125, "learning_rate": 8.659540914666199e-07, "loss": 0.0216, "reward": 1.3028383255004883, "reward_std": 0.22169262170791626, "rewards/accuracy_reward_stage2": 0.4278383255004883, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 766 }, { "completion_length": 7.40625, "epoch": 0.13439635535307518, "grad_norm": 22.16596563872041, "kl": 0.09423828125, "learning_rate": 8.657788680567723e-07, "loss": -0.0065, "reward": 1.4294730424880981, "reward_std": 0.2778435945510864, "rewards/accuracy_reward_stage2": 0.5700980424880981, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 767 }, { "completion_length": 12.609375, "epoch": 0.13457157876292272, "grad_norm": 14.85272111547269, "kl": 0.0341796875, "learning_rate": 8.656036446469248e-07, "loss": 0.0137, "reward": 1.395758867263794, "reward_std": 0.15206801891326904, "rewards/accuracy_reward_stage2": 0.5207589268684387, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 768 }, { "completion_length": 7.359375, "epoch": 0.13474680217277027, "grad_norm": 40.27145578675297, "kl": 0.4140625, "learning_rate": 8.654284212370773e-07, "loss": 0.166, "reward": 1.4100593328475952, "reward_std": 0.180698424577713, "rewards/accuracy_reward_stage2": 0.5350593328475952, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 769 }, { "completion_length": 7.96875, "epoch": 0.13492202558261784, "grad_norm": 21.098959385331987, "kl": 0.06640625, "learning_rate": 8.652531978272297e-07, "loss": -0.0516, "reward": 1.5042563676834106, "reward_std": 0.32883530855178833, "rewards/accuracy_reward_stage2": 0.5355063676834106, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 770 }, { "completion_length": 9.59375, "epoch": 0.1350972489924654, "grad_norm": 21.0235097757217, "kl": 0.09033203125, "learning_rate": 8.650779744173822e-07, "loss": -0.0923, "reward": 1.3507249355316162, "reward_std": 0.30107730627059937, "rewards/accuracy_reward_stage2": 0.4132249057292938, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 771 }, { "completion_length": 10.21875, "epoch": 0.13527247240231294, "grad_norm": 19.182190078886475, "kl": 0.042724609375, "learning_rate": 8.649027510075346e-07, "loss": 0.0171, "reward": 1.4446072578430176, "reward_std": 0.20177596807479858, "rewards/accuracy_reward_stage2": 0.569607138633728, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 772 }, { "completion_length": 8.078125, "epoch": 0.1354476958121605, "grad_norm": 21.87737505155135, "kl": 0.0849609375, "learning_rate": 8.64727527597687e-07, "loss": 0.034, "reward": 1.4658043384552002, "reward_std": 0.2499678134918213, "rewards/accuracy_reward_stage2": 0.590804398059845, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 773 }, { "completion_length": 9.765625, "epoch": 0.13562291922200806, "grad_norm": 13.952596182194974, "kl": 0.046630859375, "learning_rate": 8.645523041878394e-07, "loss": 0.0187, "reward": 1.5219866037368774, "reward_std": 0.13566339015960693, "rewards/accuracy_reward_stage2": 0.5219866037368774, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 774 }, { "completion_length": 8.34375, "epoch": 0.1357981426318556, "grad_norm": 20.70125677688085, "kl": 0.078125, "learning_rate": 8.643770807779918e-07, "loss": 0.0312, "reward": 1.6207479238510132, "reward_std": 0.1453634649515152, "rewards/accuracy_reward_stage2": 0.6207479238510132, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 775 }, { "completion_length": 6.53125, "epoch": 0.13597336604170318, "grad_norm": 19.221602229273802, "kl": 0.041259765625, "learning_rate": 8.642018573681443e-07, "loss": 0.0165, "reward": 1.329564094543457, "reward_std": 0.21231430768966675, "rewards/accuracy_reward_stage2": 0.32956403493881226, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 776 }, { "completion_length": 7.546875, "epoch": 0.13614858945155073, "grad_norm": 20.77515018987291, "kl": 0.05419921875, "learning_rate": 8.640266339582968e-07, "loss": 0.0217, "reward": 1.5638327598571777, "reward_std": 0.15072381496429443, "rewards/accuracy_reward_stage2": 0.563832700252533, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 777 }, { "completion_length": 15.21875, "epoch": 0.13632381286139827, "grad_norm": 19.313471380679403, "kl": 0.02978515625, "learning_rate": 8.638514105484492e-07, "loss": 0.0119, "reward": 1.543156385421753, "reward_std": 0.09772832691669464, "rewards/accuracy_reward_stage2": 0.5431563258171082, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 778 }, { "completion_length": 12.40625, "epoch": 0.13649903627124585, "grad_norm": 20.7202571891817, "kl": 0.060791015625, "learning_rate": 8.636761871386017e-07, "loss": 0.0243, "reward": 1.1963204145431519, "reward_std": 0.21110892295837402, "rewards/accuracy_reward_stage2": 0.19632048904895782, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 779 }, { "completion_length": 6.34375, "epoch": 0.1366742596810934, "grad_norm": 10.142569369338114, "kl": 0.058837890625, "learning_rate": 8.635009637287542e-07, "loss": 0.0235, "reward": 1.6852272748947144, "reward_std": 0.051082856953144073, "rewards/accuracy_reward_stage2": 0.6852272748947144, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 780 }, { "completion_length": 8.46875, "epoch": 0.13684948309094094, "grad_norm": 18.331737891811393, "kl": 0.01263427734375, "learning_rate": 8.633257403189066e-07, "loss": 0.005, "reward": 1.6638405323028564, "reward_std": 0.1704673171043396, "rewards/accuracy_reward_stage2": 0.6638404726982117, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 781 }, { "completion_length": 9.125, "epoch": 0.13702470650078852, "grad_norm": 10.998835905647725, "kl": 0.007049560546875, "learning_rate": 8.631505169090591e-07, "loss": 0.0028, "reward": 1.609375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward_stage2": 0.609375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 782 }, { "completion_length": 16.203125, "epoch": 0.13719992991063606, "grad_norm": 121.12707998830088, "kl": 0.68359375, "learning_rate": 8.629752934992115e-07, "loss": 0.2291, "reward": 1.2269890308380127, "reward_std": 0.20804274082183838, "rewards/accuracy_reward_stage2": 0.4926139712333679, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 783 }, { "completion_length": 11.1875, "epoch": 0.1373751533204836, "grad_norm": 66.34782787803577, "kl": 0.671875, "learning_rate": 8.62800070089364e-07, "loss": 0.2691, "reward": 1.4658288955688477, "reward_std": 0.21245905756950378, "rewards/accuracy_reward_stage2": 0.5908288359642029, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 784 }, { "completion_length": 10.75, "epoch": 0.13755037673033116, "grad_norm": 22.697008213792316, "kl": 0.062255859375, "learning_rate": 8.626248466795163e-07, "loss": 0.025, "reward": 1.5069878101348877, "reward_std": 0.15365807712078094, "rewards/accuracy_reward_stage2": 0.6319879293441772, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 785 }, { "completion_length": 10.171875, "epoch": 0.13772560014017873, "grad_norm": 16.450378612004357, "kl": 0.018798828125, "learning_rate": 8.624496232696687e-07, "loss": 0.0075, "reward": 1.642259120941162, "reward_std": 0.14816491305828094, "rewards/accuracy_reward_stage2": 0.6422590017318726, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 786 }, { "completion_length": 7.640625, "epoch": 0.13790082355002628, "grad_norm": 20.521265091381824, "kl": 0.0546875, "learning_rate": 8.622743998598212e-07, "loss": 0.0009, "reward": 1.5071511268615723, "reward_std": 0.2744485139846802, "rewards/accuracy_reward_stage2": 0.5227761268615723, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 787 }, { "completion_length": 9.890625, "epoch": 0.13807604695987383, "grad_norm": 22.28505104220285, "kl": 0.07861328125, "learning_rate": 8.620991764499737e-07, "loss": 0.0314, "reward": 1.5369726419448853, "reward_std": 0.18660400807857513, "rewards/accuracy_reward_stage2": 0.6619726419448853, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 788 }, { "completion_length": 7.5, "epoch": 0.1382512703697214, "grad_norm": 11.461413144710713, "kl": 0.0228271484375, "learning_rate": 8.619239530401261e-07, "loss": 0.0091, "reward": 1.4349414110183716, "reward_std": 0.07261689007282257, "rewards/accuracy_reward_stage2": 0.4349414110183716, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 789 }, { "completion_length": 8.796875, "epoch": 0.13842649377956895, "grad_norm": 21.565315889628515, "kl": 0.09619140625, "learning_rate": 8.617487296302786e-07, "loss": 0.0384, "reward": 1.4510877132415771, "reward_std": 0.24449574947357178, "rewards/accuracy_reward_stage2": 0.5760876536369324, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 790 }, { "completion_length": 8.734375, "epoch": 0.1386017171894165, "grad_norm": 36.1687120434632, "kl": 0.09521484375, "learning_rate": 8.61573506220431e-07, "loss": 0.0381, "reward": 1.5556349754333496, "reward_std": 0.311960905790329, "rewards/accuracy_reward_stage2": 0.5556348562240601, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 791 }, { "completion_length": 17.453125, "epoch": 0.13877694059926407, "grad_norm": 22.713123989105284, "kl": 0.146484375, "learning_rate": 8.613982828105835e-07, "loss": 0.0308, "reward": 1.6747777462005615, "reward_std": 0.2890956401824951, "rewards/accuracy_reward_stage2": 0.6904026865959167, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 792 }, { "completion_length": 7.296875, "epoch": 0.13895216400911162, "grad_norm": 22.463928540543556, "kl": 0.09765625, "learning_rate": 8.61223059400736e-07, "loss": 0.0008, "reward": 1.462833046913147, "reward_std": 0.1759517639875412, "rewards/accuracy_reward_stage2": 0.603458046913147, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 793 }, { "completion_length": 8.703125, "epoch": 0.13912738741895916, "grad_norm": 22.675946642947583, "kl": 0.0284423828125, "learning_rate": 8.610478359908883e-07, "loss": 0.0114, "reward": 1.3541667461395264, "reward_std": 0.21643014252185822, "rewards/accuracy_reward_stage2": 0.3541666865348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 794 }, { "completion_length": 9.90625, "epoch": 0.13930261082880674, "grad_norm": 16.915255498782287, "kl": 0.02685546875, "learning_rate": 8.608726125810408e-07, "loss": 0.0107, "reward": 1.635071039199829, "reward_std": 0.091707743704319, "rewards/accuracy_reward_stage2": 0.6350710391998291, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 795 }, { "completion_length": 12.90625, "epoch": 0.13947783423865429, "grad_norm": 19.268752675673827, "kl": 0.053466796875, "learning_rate": 8.606973891711933e-07, "loss": 0.0213, "reward": 1.5058095455169678, "reward_std": 0.2826850414276123, "rewards/accuracy_reward_stage2": 0.5058095455169678, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 796 }, { "completion_length": 8.734375, "epoch": 0.13965305764850183, "grad_norm": 23.808344356205204, "kl": 0.138671875, "learning_rate": 8.605221657613457e-07, "loss": 0.0554, "reward": 1.6443569660186768, "reward_std": 0.33315661549568176, "rewards/accuracy_reward_stage2": 0.6443569660186768, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 797 }, { "completion_length": 9.171875, "epoch": 0.1398282810583494, "grad_norm": 18.136804001317085, "kl": 0.048583984375, "learning_rate": 8.603469423514981e-07, "loss": -0.0076, "reward": 1.7026225328445435, "reward_std": 0.20409967005252838, "rewards/accuracy_reward_stage2": 0.7182475328445435, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 798 }, { "completion_length": 6.40625, "epoch": 0.14000350446819695, "grad_norm": 20.57066511895683, "kl": 0.05224609375, "learning_rate": 8.601717189416505e-07, "loss": -0.0232, "reward": 1.7694220542907715, "reward_std": 0.30438560247421265, "rewards/accuracy_reward_stage2": 0.7850470542907715, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 799 }, { "completion_length": 8.125, "epoch": 0.1401787278780445, "grad_norm": 22.88045288586515, "kl": 0.15625, "learning_rate": 8.59996495531803e-07, "loss": 0.0313, "reward": 1.3915396928787231, "reward_std": 0.2325017750263214, "rewards/accuracy_reward_stage2": 0.5321646928787231, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 800 }, { "completion_length": 9.96875, "epoch": 0.14035395128789208, "grad_norm": 22.457627248125416, "kl": 0.13671875, "learning_rate": 8.598212721219555e-07, "loss": 0.0547, "reward": 1.691390037536621, "reward_std": 0.30775442719459534, "rewards/accuracy_reward_stage2": 0.6913900375366211, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 801 }, { "completion_length": 8.59375, "epoch": 0.14052917469773962, "grad_norm": 14.506472327882316, "kl": 0.04150390625, "learning_rate": 8.596460487121079e-07, "loss": 0.0165, "reward": 1.573103666305542, "reward_std": 0.07388261705636978, "rewards/accuracy_reward_stage2": 0.573103666305542, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 802 }, { "completion_length": 12.1875, "epoch": 0.14070439810758717, "grad_norm": 13.264301091214213, "kl": 0.06640625, "learning_rate": 8.594708253022604e-07, "loss": 0.0266, "reward": 1.0838366746902466, "reward_std": 0.20186206698417664, "rewards/accuracy_reward_stage2": 0.3338366448879242, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 803 }, { "completion_length": 10.625, "epoch": 0.14087962151743472, "grad_norm": 39.71989699208024, "kl": 0.1767578125, "learning_rate": 8.592956018924127e-07, "loss": -0.0047, "reward": 1.5464773178100586, "reward_std": 0.21688970923423767, "rewards/accuracy_reward_stage2": 0.7027274370193481, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 804 }, { "completion_length": 10.953125, "epoch": 0.1410548449272823, "grad_norm": 16.35935979261754, "kl": 0.0703125, "learning_rate": 8.591203784825652e-07, "loss": 0.0281, "reward": 1.3673032522201538, "reward_std": 0.09058556705713272, "rewards/accuracy_reward_stage2": 0.3673032522201538, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 805 }, { "completion_length": 10.65625, "epoch": 0.14123006833712984, "grad_norm": 30.11380461178658, "kl": 0.062255859375, "learning_rate": 8.589451550727177e-07, "loss": -0.0398, "reward": 1.544505000114441, "reward_std": 0.25117573142051697, "rewards/accuracy_reward_stage2": 0.5757550597190857, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 806 }, { "completion_length": 11.125, "epoch": 0.14140529174697739, "grad_norm": 21.97961959396547, "kl": 0.0673828125, "learning_rate": 8.587699316628701e-07, "loss": -0.0172, "reward": 1.6793184280395508, "reward_std": 0.2505590617656708, "rewards/accuracy_reward_stage2": 0.6949434876441956, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 807 }, { "completion_length": 9.21875, "epoch": 0.14158051515682496, "grad_norm": 43.11287489187185, "kl": 0.02392578125, "learning_rate": 8.585947082530226e-07, "loss": 0.0096, "reward": 1.546875, "reward_std": 0.1530819982290268, "rewards/accuracy_reward_stage2": 0.546875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 808 }, { "completion_length": 16.25, "epoch": 0.1417557385666725, "grad_norm": 24.6818306471637, "kl": 0.62890625, "learning_rate": 8.584194848431751e-07, "loss": 0.2514, "reward": 1.486750602722168, "reward_std": 0.1537483036518097, "rewards/accuracy_reward_stage2": 0.6117505431175232, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 809 }, { "completion_length": 13.078125, "epoch": 0.14193096197652005, "grad_norm": 19.039469497624456, "kl": 0.064453125, "learning_rate": 8.582442614333274e-07, "loss": -0.014, "reward": 1.4495387077331543, "reward_std": 0.3235572576522827, "rewards/accuracy_reward_stage2": 0.4651636779308319, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 810 }, { "completion_length": 7.1875, "epoch": 0.14210618538636763, "grad_norm": 24.169194951924073, "kl": 0.06591796875, "learning_rate": 8.580690380234799e-07, "loss": 0.0263, "reward": 1.7233256101608276, "reward_std": 0.23263150453567505, "rewards/accuracy_reward_stage2": 0.7233256101608276, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 811 }, { "completion_length": 12.234375, "epoch": 0.14228140879621518, "grad_norm": 22.087116534277232, "kl": 0.46875, "learning_rate": 8.578938146136323e-07, "loss": 0.1455, "reward": 1.4000294208526611, "reward_std": 0.2222493290901184, "rewards/accuracy_reward_stage2": 0.5406544208526611, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 812 }, { "completion_length": 8.171875, "epoch": 0.14245663220606272, "grad_norm": 8.13413593004782, "kl": 0.007415771484375, "learning_rate": 8.577185912037847e-07, "loss": 0.003, "reward": 1.7436164617538452, "reward_std": 0.018055392429232597, "rewards/accuracy_reward_stage2": 0.7436164617538452, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 813 }, { "completion_length": 15.84375, "epoch": 0.1426318556159103, "grad_norm": 13.88930059775845, "kl": 0.06005859375, "learning_rate": 8.575433677939372e-07, "loss": 0.024, "reward": 1.5528483390808105, "reward_std": 0.07238230854272842, "rewards/accuracy_reward_stage2": 0.5528483390808105, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 814 }, { "completion_length": 12.484375, "epoch": 0.14280707902575784, "grad_norm": 22.855578480267933, "kl": 0.08056640625, "learning_rate": 8.573681443840896e-07, "loss": -0.0002, "reward": 1.4860951900482178, "reward_std": 0.19443252682685852, "rewards/accuracy_reward_stage2": 0.501720130443573, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 815 }, { "completion_length": 7.1875, "epoch": 0.1429823024356054, "grad_norm": 21.597799572130995, "kl": 0.126953125, "learning_rate": 8.571929209742421e-07, "loss": 0.0507, "reward": 1.630251169204712, "reward_std": 0.15653660893440247, "rewards/accuracy_reward_stage2": 0.6302511096000671, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 816 }, { "completion_length": 10.203125, "epoch": 0.14315752584545297, "grad_norm": 17.619651486942193, "kl": 0.205078125, "learning_rate": 8.570176975643946e-07, "loss": 0.0818, "reward": 1.4163931608200073, "reward_std": 0.2162127047777176, "rewards/accuracy_reward_stage2": 0.6663932204246521, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 817 }, { "completion_length": 16.21875, "epoch": 0.1433327492553005, "grad_norm": 20.40379259103948, "kl": 0.09716796875, "learning_rate": 8.56842474154547e-07, "loss": 0.0387, "reward": 1.5769197940826416, "reward_std": 0.24281121790409088, "rewards/accuracy_reward_stage2": 0.7019197344779968, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 818 }, { "completion_length": 12.71875, "epoch": 0.14350797266514806, "grad_norm": 55.3327278627242, "kl": 0.0634765625, "learning_rate": 8.566672507446995e-07, "loss": -0.0059, "reward": 1.5887277126312256, "reward_std": 0.3874633312225342, "rewards/accuracy_reward_stage2": 0.6043526530265808, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 819 }, { "completion_length": 7.28125, "epoch": 0.1436831960749956, "grad_norm": 24.61589629636778, "kl": 0.045166015625, "learning_rate": 8.564920273348519e-07, "loss": 0.018, "reward": 1.4394075870513916, "reward_std": 0.1608877032995224, "rewards/accuracy_reward_stage2": 0.5644077062606812, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 820 }, { "completion_length": 11.78125, "epoch": 0.14385841948484318, "grad_norm": 13.219207764251584, "kl": 0.0120849609375, "learning_rate": 8.563168039250044e-07, "loss": 0.0049, "reward": 1.451958417892456, "reward_std": 0.12982237339019775, "rewards/accuracy_reward_stage2": 0.4519583582878113, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 821 }, { "completion_length": 11.734375, "epoch": 0.14403364289469073, "grad_norm": 18.588382161945407, "kl": 0.05810546875, "learning_rate": 8.561415805151569e-07, "loss": -0.0119, "reward": 1.6447747945785522, "reward_std": 0.20467260479927063, "rewards/accuracy_reward_stage2": 0.7853997945785522, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 822 }, { "completion_length": 10.0625, "epoch": 0.14420886630453827, "grad_norm": 33.79391222827712, "kl": 0.07421875, "learning_rate": 8.559663571053091e-07, "loss": 0.0007, "reward": 1.5684666633605957, "reward_std": 0.2820379137992859, "rewards/accuracy_reward_stage2": 0.7090917229652405, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 823 }, { "completion_length": 11.203125, "epoch": 0.14438408971438585, "grad_norm": 19.944653404203173, "kl": 0.12451171875, "learning_rate": 8.557911336954616e-07, "loss": 0.0499, "reward": 1.621762990951538, "reward_std": 0.2086431235074997, "rewards/accuracy_reward_stage2": 0.6217628717422485, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 824 }, { "completion_length": 8.984375, "epoch": 0.1445593131242334, "grad_norm": 14.625539721921214, "kl": 0.04248046875, "learning_rate": 8.556159102856141e-07, "loss": 0.017, "reward": 1.8925212621688843, "reward_std": 0.09097467362880707, "rewards/accuracy_reward_stage2": 0.8925212621688843, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 825 }, { "completion_length": 15.859375, "epoch": 0.14473453653408094, "grad_norm": 14.368354249599616, "kl": 0.07958984375, "learning_rate": 8.554406868757665e-07, "loss": 0.0319, "reward": 1.4069151878356934, "reward_std": 0.10731781274080276, "rewards/accuracy_reward_stage2": 0.5319151878356934, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 826 }, { "completion_length": 9.0, "epoch": 0.14490975994392852, "grad_norm": 29.72020467621293, "kl": 0.03076171875, "learning_rate": 8.55265463465919e-07, "loss": 0.0123, "reward": 1.6061508655548096, "reward_std": 0.3720834255218506, "rewards/accuracy_reward_stage2": 0.7311508059501648, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 827 }, { "completion_length": 11.03125, "epoch": 0.14508498335377606, "grad_norm": 23.300291074931497, "kl": 0.134765625, "learning_rate": 8.550902400560714e-07, "loss": 0.0538, "reward": 1.2861018180847168, "reward_std": 0.19155770540237427, "rewards/accuracy_reward_stage2": 0.4111018776893616, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 828 }, { "completion_length": 14.546875, "epoch": 0.1452602067636236, "grad_norm": 19.748170632331988, "kl": 0.08056640625, "learning_rate": 8.549150166462239e-07, "loss": 0.0321, "reward": 1.495689868927002, "reward_std": 0.09303957223892212, "rewards/accuracy_reward_stage2": 0.49568989872932434, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 829 }, { "completion_length": 8.9375, "epoch": 0.1454354301734712, "grad_norm": 23.557736521778914, "kl": 0.16015625, "learning_rate": 8.547397932363764e-07, "loss": 0.0642, "reward": 1.2532269954681396, "reward_std": 0.24097494781017303, "rewards/accuracy_reward_stage2": 0.5032269358634949, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 830 }, { "completion_length": 11.0625, "epoch": 0.14561065358331873, "grad_norm": 17.655466469547825, "kl": 0.12109375, "learning_rate": 8.545645698265288e-07, "loss": 0.0485, "reward": 1.512831449508667, "reward_std": 0.14166758954524994, "rewards/accuracy_reward_stage2": 0.637831449508667, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 831 }, { "completion_length": 14.890625, "epoch": 0.14578587699316628, "grad_norm": 45.874030977900944, "kl": 0.37890625, "learning_rate": 8.543893464166813e-07, "loss": 0.1516, "reward": 1.225730061531067, "reward_std": 0.21166354417800903, "rewards/accuracy_reward_stage2": 0.3507300317287445, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 832 }, { "completion_length": 10.046875, "epoch": 0.14596110040301385, "grad_norm": 25.695155827712917, "kl": 0.142578125, "learning_rate": 8.542141230068338e-07, "loss": 0.0187, "reward": 1.499477744102478, "reward_std": 0.2859499454498291, "rewards/accuracy_reward_stage2": 0.530727744102478, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 833 }, { "completion_length": 15.96875, "epoch": 0.1461363238128614, "grad_norm": 12.749670925067296, "kl": 0.03564453125, "learning_rate": 8.540388995969861e-07, "loss": -0.03, "reward": 1.5882692337036133, "reward_std": 0.1215648502111435, "rewards/accuracy_reward_stage2": 0.6038942933082581, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 834 }, { "completion_length": 7.40625, "epoch": 0.14631154722270895, "grad_norm": 20.419820412171152, "kl": 0.0849609375, "learning_rate": 8.538636761871386e-07, "loss": 0.0341, "reward": 1.7913644313812256, "reward_std": 0.1585237681865692, "rewards/accuracy_reward_stage2": 0.7913644313812256, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 835 }, { "completion_length": 13.9375, "epoch": 0.14648677063255652, "grad_norm": 14.851993005874192, "kl": 0.07421875, "learning_rate": 8.536884527772909e-07, "loss": -0.0138, "reward": 1.6181893348693848, "reward_std": 0.09273967146873474, "rewards/accuracy_reward_stage2": 0.63381427526474, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 836 }, { "completion_length": 8.953125, "epoch": 0.14666199404240407, "grad_norm": 15.156193193114621, "kl": 0.06103515625, "learning_rate": 8.535132293674434e-07, "loss": 0.0244, "reward": 1.3110003471374512, "reward_std": 0.11011946201324463, "rewards/accuracy_reward_stage2": 0.5610003471374512, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 837 }, { "completion_length": 12.5625, "epoch": 0.14683721745225162, "grad_norm": 19.635947621442128, "kl": 0.0908203125, "learning_rate": 8.533380059575959e-07, "loss": 0.0363, "reward": 1.588404655456543, "reward_std": 0.2183130383491516, "rewards/accuracy_reward_stage2": 0.5884045958518982, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 838 }, { "completion_length": 10.90625, "epoch": 0.14701244086209916, "grad_norm": 21.805824502522523, "kl": 0.12890625, "learning_rate": 8.531627825477483e-07, "loss": 0.0514, "reward": 1.6451040506362915, "reward_std": 0.2274044007062912, "rewards/accuracy_reward_stage2": 0.7701040506362915, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 839 }, { "completion_length": 12.875, "epoch": 0.14718766427194674, "grad_norm": 16.040950668543186, "kl": 0.07958984375, "learning_rate": 8.529875591379008e-07, "loss": 0.0318, "reward": 1.7686783075332642, "reward_std": 0.1477348506450653, "rewards/accuracy_reward_stage2": 0.7686783075332642, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 840 }, { "completion_length": 11.5625, "epoch": 0.14736288768179429, "grad_norm": 30.501348999856685, "kl": 0.30859375, "learning_rate": 8.528123357280533e-07, "loss": 0.0915, "reward": 1.3233115673065186, "reward_std": 0.2250458151102066, "rewards/accuracy_reward_stage2": 0.46393659710884094, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 841 }, { "completion_length": 12.78125, "epoch": 0.14753811109164183, "grad_norm": 22.250109240384603, "kl": 0.455078125, "learning_rate": 8.526371123182057e-07, "loss": 0.149, "reward": 1.1825652122497559, "reward_std": 0.19976741075515747, "rewards/accuracy_reward_stage2": 0.44819021224975586, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 842 }, { "completion_length": 9.28125, "epoch": 0.1477133345014894, "grad_norm": 16.5712263276257, "kl": 0.041015625, "learning_rate": 8.524618889083582e-07, "loss": 0.0164, "reward": 1.6536760330200195, "reward_std": 0.12514111399650574, "rewards/accuracy_reward_stage2": 0.6536760926246643, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 843 }, { "completion_length": 27.203125, "epoch": 0.14788855791133695, "grad_norm": 17.3294287434311, "kl": 0.51171875, "learning_rate": 8.522866654985105e-07, "loss": 0.2058, "reward": 1.306678295135498, "reward_std": 0.21027681231498718, "rewards/accuracy_reward_stage2": 0.4316784143447876, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 844 }, { "completion_length": 10.640625, "epoch": 0.1480637813211845, "grad_norm": 19.513135768745453, "kl": 0.57421875, "learning_rate": 8.52111442088663e-07, "loss": 0.229, "reward": 1.5448485612869263, "reward_std": 0.17319580912590027, "rewards/accuracy_reward_stage2": 0.6698485612869263, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 845 }, { "completion_length": 7.203125, "epoch": 0.14823900473103208, "grad_norm": 24.577003518223613, "kl": 0.03076171875, "learning_rate": 8.519362186788155e-07, "loss": 0.0123, "reward": 1.5872409343719482, "reward_std": 0.10471472889184952, "rewards/accuracy_reward_stage2": 0.5872409343719482, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 846 }, { "completion_length": 23.0, "epoch": 0.14841422814087962, "grad_norm": 16.92597397416524, "kl": 0.09228515625, "learning_rate": 8.517609952689679e-07, "loss": -0.0072, "reward": 1.4274213314056396, "reward_std": 0.15768280625343323, "rewards/accuracy_reward_stage2": 0.5680463314056396, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 847 }, { "completion_length": 16.5, "epoch": 0.14858945155072717, "grad_norm": 22.36619562360302, "kl": 0.05419921875, "learning_rate": 8.515857718591204e-07, "loss": 0.0217, "reward": 1.5342915058135986, "reward_std": 0.15957878530025482, "rewards/accuracy_reward_stage2": 0.5342913866043091, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 848 }, { "completion_length": 13.25, "epoch": 0.14876467496057474, "grad_norm": 16.30851395059505, "kl": 0.060302734375, "learning_rate": 8.514105484492728e-07, "loss": 0.0242, "reward": 1.6138888597488403, "reward_std": 0.17767907679080963, "rewards/accuracy_reward_stage2": 0.6138888597488403, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 849 }, { "completion_length": 11.265625, "epoch": 0.1489398983704223, "grad_norm": 19.055320023008036, "kl": 0.083984375, "learning_rate": 8.512353250394252e-07, "loss": 0.0335, "reward": 1.6319842338562012, "reward_std": 0.2157711535692215, "rewards/accuracy_reward_stage2": 0.756984293460846, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 850 }, { "completion_length": 10.03125, "epoch": 0.14911512178026984, "grad_norm": 18.371430716507827, "kl": 0.0888671875, "learning_rate": 8.510601016295777e-07, "loss": 0.0355, "reward": 1.6245781183242798, "reward_std": 0.23073048889636993, "rewards/accuracy_reward_stage2": 0.6245781779289246, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 851 }, { "completion_length": 7.9375, "epoch": 0.1492903451901174, "grad_norm": 20.02482116621384, "kl": 0.01544189453125, "learning_rate": 8.508848782197301e-07, "loss": 0.0062, "reward": 1.7079994678497314, "reward_std": 0.1766592115163803, "rewards/accuracy_reward_stage2": 0.7079994678497314, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 852 }, { "completion_length": 10.234375, "epoch": 0.14946556859996496, "grad_norm": 15.743834333932089, "kl": 0.08251953125, "learning_rate": 8.507096548098825e-07, "loss": 0.0331, "reward": 1.6240935325622559, "reward_std": 0.11661704629659653, "rewards/accuracy_reward_stage2": 0.7490935325622559, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 853 }, { "completion_length": 10.28125, "epoch": 0.1496407920098125, "grad_norm": 27.186682983752124, "kl": 0.287109375, "learning_rate": 8.50534431400035e-07, "loss": 0.1149, "reward": 1.171875, "reward_std": 0.13258251547813416, "rewards/accuracy_reward_stage2": 0.296875, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 854 }, { "completion_length": 11.40625, "epoch": 0.14981601541966005, "grad_norm": 26.354753154950743, "kl": 0.14453125, "learning_rate": 8.503592079901874e-07, "loss": 0.0577, "reward": 1.4655927419662476, "reward_std": 0.2811081111431122, "rewards/accuracy_reward_stage2": 0.46559271216392517, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 855 }, { "completion_length": 8.65625, "epoch": 0.14999123882950763, "grad_norm": 14.760397091858115, "kl": 0.07177734375, "learning_rate": 8.501839845803399e-07, "loss": -0.0042, "reward": 1.421875, "reward_std": 0.2597545385360718, "rewards/accuracy_reward_stage2": 0.4375, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 856 }, { "completion_length": 14.515625, "epoch": 0.15016646223935518, "grad_norm": 29.598384378452092, "kl": 0.07470703125, "learning_rate": 8.500087611704924e-07, "loss": 0.0299, "reward": 1.3475062847137451, "reward_std": 0.3122522532939911, "rewards/accuracy_reward_stage2": 0.47250625491142273, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 857 }, { "completion_length": 7.515625, "epoch": 0.15034168564920272, "grad_norm": 16.23384962972575, "kl": 0.0703125, "learning_rate": 8.498335377606448e-07, "loss": 0.0282, "reward": 1.5904297828674316, "reward_std": 0.19798138737678528, "rewards/accuracy_reward_stage2": 0.7154297828674316, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 858 }, { "completion_length": 7.625, "epoch": 0.1505169090590503, "grad_norm": 24.902021328037982, "kl": 0.05712890625, "learning_rate": 8.496583143507973e-07, "loss": 0.0228, "reward": 1.7132034301757812, "reward_std": 0.2248350977897644, "rewards/accuracy_reward_stage2": 0.7132034301757812, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 859 }, { "completion_length": 12.390625, "epoch": 0.15069213246889784, "grad_norm": 25.012761537860488, "kl": 0.057861328125, "learning_rate": 8.494830909409497e-07, "loss": -0.0116, "reward": 1.5423566102981567, "reward_std": 0.2658918499946594, "rewards/accuracy_reward_stage2": 0.557981550693512, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 860 }, { "completion_length": 8.53125, "epoch": 0.1508673558787454, "grad_norm": 18.010656267314577, "kl": 0.033203125, "learning_rate": 8.493078675311021e-07, "loss": 0.0133, "reward": 1.5944479703903198, "reward_std": 0.09074701368808746, "rewards/accuracy_reward_stage2": 0.5944479703903198, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 861 }, { "completion_length": 11.125, "epoch": 0.15104257928859297, "grad_norm": 19.175468347035594, "kl": 0.01300048828125, "learning_rate": 8.491326441212546e-07, "loss": 0.0052, "reward": 1.7279086112976074, "reward_std": 0.11366454511880875, "rewards/accuracy_reward_stage2": 0.7279086112976074, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 862 }, { "completion_length": 11.59375, "epoch": 0.1512178026984405, "grad_norm": 17.84506185110907, "kl": 0.177734375, "learning_rate": 8.489574207114069e-07, "loss": 0.071, "reward": 1.5485143661499023, "reward_std": 0.19748035073280334, "rewards/accuracy_reward_stage2": 0.5485143661499023, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 863 }, { "completion_length": 12.34375, "epoch": 0.15139302610828806, "grad_norm": 20.25179638432365, "kl": 0.048828125, "learning_rate": 8.487821973015594e-07, "loss": 0.0195, "reward": 1.4129174947738647, "reward_std": 0.3144758641719818, "rewards/accuracy_reward_stage2": 0.41291752457618713, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 864 }, { "completion_length": 17.390625, "epoch": 0.15156824951813563, "grad_norm": 23.015576292466694, "kl": 0.0673828125, "learning_rate": 8.486069738917118e-07, "loss": 0.027, "reward": 1.5686707496643066, "reward_std": 0.1687781810760498, "rewards/accuracy_reward_stage2": 0.5686706304550171, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 865 }, { "completion_length": 10.640625, "epoch": 0.15174347292798318, "grad_norm": 25.940419340471752, "kl": 0.06298828125, "learning_rate": 8.484317504818643e-07, "loss": -0.0758, "reward": 1.4635417461395264, "reward_std": 0.26842159032821655, "rewards/accuracy_reward_stage2": 0.5104166865348816, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 866 }, { "completion_length": 7.53125, "epoch": 0.15191869633783073, "grad_norm": 21.38780500879031, "kl": 0.06591796875, "learning_rate": 8.482565270720168e-07, "loss": -0.0052, "reward": 1.7270491123199463, "reward_std": 0.27854660153388977, "rewards/accuracy_reward_stage2": 0.7426741719245911, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 867 }, { "completion_length": 12.09375, "epoch": 0.1520939197476783, "grad_norm": 16.75516936788131, "kl": 0.0223388671875, "learning_rate": 8.480813036621692e-07, "loss": -0.0345, "reward": 1.648539423942566, "reward_std": 0.1907489001750946, "rewards/accuracy_reward_stage2": 0.6641644239425659, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 868 }, { "completion_length": 9.484375, "epoch": 0.15226914315752585, "grad_norm": 16.02390522378857, "kl": 0.0255126953125, "learning_rate": 8.479060802523217e-07, "loss": 0.0102, "reward": 1.742557406425476, "reward_std": 0.09017640352249146, "rewards/accuracy_reward_stage2": 0.7425574064254761, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 869 }, { "completion_length": 12.3125, "epoch": 0.1524443665673734, "grad_norm": 19525.62924019597, "kl": 81.0, "learning_rate": 8.477308568424742e-07, "loss": 32.4057, "reward": 1.4085581302642822, "reward_std": 0.26161444187164307, "rewards/accuracy_reward_stage2": 0.5491830706596375, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 870 }, { "completion_length": 8.34375, "epoch": 0.15261958997722094, "grad_norm": 14.501936237694595, "kl": 0.091796875, "learning_rate": 8.475556334326266e-07, "loss": 0.0014, "reward": 1.762599229812622, "reward_std": 0.11367248743772507, "rewards/accuracy_reward_stage2": 0.7782242298126221, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 871 }, { "completion_length": 12.5625, "epoch": 0.15279481338706852, "grad_norm": 19.60168459278808, "kl": 0.01531982421875, "learning_rate": 8.473804100227791e-07, "loss": 0.0061, "reward": 1.6145833730697632, "reward_std": 0.1613328456878662, "rewards/accuracy_reward_stage2": 0.6145833134651184, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 872 }, { "completion_length": 20.203125, "epoch": 0.15297003679691606, "grad_norm": 472.34090688262086, "kl": 2.40625, "learning_rate": 8.472051866129316e-07, "loss": 0.9589, "reward": 1.3896098136901855, "reward_std": 0.12324882298707962, "rewards/accuracy_reward_stage2": 0.514609694480896, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 873 }, { "completion_length": 12.0, "epoch": 0.1531452602067636, "grad_norm": 22.644802413426646, "kl": 0.05517578125, "learning_rate": 8.470299632030838e-07, "loss": 0.022, "reward": 1.5755821466445923, "reward_std": 0.16714820265769958, "rewards/accuracy_reward_stage2": 0.5755821466445923, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 874 }, { "completion_length": 8.65625, "epoch": 0.1533204836166112, "grad_norm": 20.082514560590244, "kl": 0.044677734375, "learning_rate": 8.468547397932363e-07, "loss": 0.0179, "reward": 1.7464009523391724, "reward_std": 0.28311580419540405, "rewards/accuracy_reward_stage2": 0.7464009523391724, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 875 }, { "completion_length": 11.21875, "epoch": 0.15349570702645873, "grad_norm": 14.584972917601204, "kl": 0.08544921875, "learning_rate": 8.466795163833887e-07, "loss": 0.0342, "reward": 1.7056055068969727, "reward_std": 0.1420706957578659, "rewards/accuracy_reward_stage2": 0.7056055068969727, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 876 }, { "completion_length": 10.046875, "epoch": 0.15367093043630628, "grad_norm": 15.131222808042867, "kl": 0.10107421875, "learning_rate": 8.465042929735412e-07, "loss": 0.0403, "reward": 1.6770800352096558, "reward_std": 0.11454734951257706, "rewards/accuracy_reward_stage2": 0.677079975605011, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 877 }, { "completion_length": 13.546875, "epoch": 0.15384615384615385, "grad_norm": 33.88045076112149, "kl": 0.349609375, "learning_rate": 8.463290695636937e-07, "loss": 0.1401, "reward": 1.7129206657409668, "reward_std": 0.19110271334648132, "rewards/accuracy_reward_stage2": 0.837920606136322, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 878 }, { "completion_length": 13.390625, "epoch": 0.1540213772560014, "grad_norm": 18.9186422728538, "kl": 0.1640625, "learning_rate": 8.461538461538461e-07, "loss": 0.0284, "reward": 1.4437906742095947, "reward_std": 0.2090407907962799, "rewards/accuracy_reward_stage2": 0.5844157338142395, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 879 }, { "completion_length": 11.5625, "epoch": 0.15419660066584895, "grad_norm": 22.16291882417554, "kl": 0.0771484375, "learning_rate": 8.459786227439986e-07, "loss": 0.018, "reward": 1.5695393085479736, "reward_std": 0.2572243809700012, "rewards/accuracy_reward_stage2": 0.6945393085479736, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 880 }, { "completion_length": 13.09375, "epoch": 0.15437182407569652, "grad_norm": 23.00953445571644, "kl": 0.20703125, "learning_rate": 8.45803399334151e-07, "loss": 0.0833, "reward": 1.3842726945877075, "reward_std": 0.24705079197883606, "rewards/accuracy_reward_stage2": 0.5092726945877075, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 881 }, { "completion_length": 13.328125, "epoch": 0.15454704748554407, "grad_norm": 21.121694948505297, "kl": 0.0673828125, "learning_rate": 8.456281759243035e-07, "loss": 0.027, "reward": 1.80861234664917, "reward_std": 0.12876972556114197, "rewards/accuracy_reward_stage2": 0.8086122870445251, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 882 }, { "completion_length": 12.84375, "epoch": 0.15472227089539162, "grad_norm": 27.381915192859076, "kl": 0.1103515625, "learning_rate": 8.45452952514456e-07, "loss": 0.0055, "reward": 1.6596755981445312, "reward_std": 0.31475722789764404, "rewards/accuracy_reward_stage2": 0.6753007173538208, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 883 }, { "completion_length": 10.390625, "epoch": 0.1548974943052392, "grad_norm": 22.464663083727196, "kl": 0.050048828125, "learning_rate": 8.452777291046083e-07, "loss": 0.02, "reward": 1.599549651145935, "reward_std": 0.23784250020980835, "rewards/accuracy_reward_stage2": 0.5995496511459351, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 884 }, { "completion_length": 8.8125, "epoch": 0.15507271771508674, "grad_norm": 21.711014490365343, "kl": 0.10400390625, "learning_rate": 8.451025056947608e-07, "loss": 0.016, "reward": 1.6585588455200195, "reward_std": 0.2637866735458374, "rewards/accuracy_reward_stage2": 0.6741837859153748, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 885 }, { "completion_length": 10.015625, "epoch": 0.15524794112493429, "grad_norm": 14.963862309249967, "kl": 0.1533203125, "learning_rate": 8.449272822849133e-07, "loss": 0.0252, "reward": 1.5280214548110962, "reward_std": 0.15586411952972412, "rewards/accuracy_reward_stage2": 0.5436464548110962, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 886 }, { "completion_length": 10.859375, "epoch": 0.15542316453478186, "grad_norm": 21.292734357299597, "kl": 0.033203125, "learning_rate": 8.447520588750656e-07, "loss": 0.0133, "reward": 1.5913714170455933, "reward_std": 0.2325046956539154, "rewards/accuracy_reward_stage2": 0.5913714170455933, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 887 }, { "completion_length": 11.828125, "epoch": 0.1555983879446294, "grad_norm": 16.972353026269435, "kl": 0.1337890625, "learning_rate": 8.445768354652181e-07, "loss": 0.0171, "reward": 1.586073637008667, "reward_std": 0.15965032577514648, "rewards/accuracy_reward_stage2": 0.726698637008667, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 888 }, { "completion_length": 9.5, "epoch": 0.15577361135447695, "grad_norm": 18.709718658093724, "kl": 0.0908203125, "learning_rate": 8.444016120553705e-07, "loss": 0.0365, "reward": 1.4933067560195923, "reward_std": 0.1306682527065277, "rewards/accuracy_reward_stage2": 0.6183068156242371, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 889 }, { "completion_length": 11.5625, "epoch": 0.1559488347643245, "grad_norm": 26.23996003618781, "kl": 0.384765625, "learning_rate": 8.44226388645523e-07, "loss": 0.1538, "reward": 1.390785574913025, "reward_std": 0.24497109651565552, "rewards/accuracy_reward_stage2": 0.5157855749130249, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 890 }, { "completion_length": 8.640625, "epoch": 0.15612405817417208, "grad_norm": 20.44810828757477, "kl": 0.045166015625, "learning_rate": 8.440511652356755e-07, "loss": 0.0181, "reward": 1.4914238452911377, "reward_std": 0.11264529824256897, "rewards/accuracy_reward_stage2": 0.4914238750934601, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 891 }, { "completion_length": 19.890625, "epoch": 0.15629928158401962, "grad_norm": 20.186573134056676, "kl": 0.466796875, "learning_rate": 8.438759418258279e-07, "loss": 0.1425, "reward": 1.1410515308380127, "reward_std": 0.1473521888256073, "rewards/accuracy_reward_stage2": 0.28167659044265747, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 892 }, { "completion_length": 7.625, "epoch": 0.15647450499386717, "grad_norm": 20.23432496884711, "kl": 0.0927734375, "learning_rate": 8.437007184159803e-07, "loss": 0.0372, "reward": 1.5041792392730713, "reward_std": 0.07779411971569061, "rewards/accuracy_reward_stage2": 0.5198042392730713, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 893 }, { "completion_length": 13.796875, "epoch": 0.15664972840371474, "grad_norm": 15.623264418195424, "kl": 0.01202392578125, "learning_rate": 8.435254950061328e-07, "loss": 0.0048, "reward": 1.471541404724121, "reward_std": 0.12218821048736572, "rewards/accuracy_reward_stage2": 0.4715413451194763, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 894 }, { "completion_length": 10.1875, "epoch": 0.1568249518135623, "grad_norm": 19.707690843826153, "kl": 0.056884765625, "learning_rate": 8.433502715962852e-07, "loss": -0.0214, "reward": 1.5333350896835327, "reward_std": 0.1759674847126007, "rewards/accuracy_reward_stage2": 0.5489600896835327, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 895 }, { "completion_length": 11.53125, "epoch": 0.15700017522340984, "grad_norm": 28.43513771094324, "kl": 0.052978515625, "learning_rate": 8.431750481864377e-07, "loss": 0.0212, "reward": 1.3929375410079956, "reward_std": 0.3441798985004425, "rewards/accuracy_reward_stage2": 0.5179375410079956, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 896 }, { "completion_length": 9.0, "epoch": 0.1571753986332574, "grad_norm": 34.6067214018072, "kl": 0.18359375, "learning_rate": 8.429998247765901e-07, "loss": 0.0294, "reward": 1.5371513366699219, "reward_std": 0.14979934692382812, "rewards/accuracy_reward_stage2": 0.5527763366699219, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 897 }, { "completion_length": 9.921875, "epoch": 0.15735062204310496, "grad_norm": 29.157215435126794, "kl": 0.1044921875, "learning_rate": 8.428246013667426e-07, "loss": 0.0418, "reward": 1.6572283506393433, "reward_std": 0.16798993945121765, "rewards/accuracy_reward_stage2": 0.6572283506393433, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 898 }, { "completion_length": 11.953125, "epoch": 0.1575258454529525, "grad_norm": 18.84363409067946, "kl": 0.1201171875, "learning_rate": 8.426493779568951e-07, "loss": 0.0481, "reward": 1.5634629726409912, "reward_std": 0.2565038800239563, "rewards/accuracy_reward_stage2": 0.5634629726409912, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 899 }, { "completion_length": 7.65625, "epoch": 0.15770106886280008, "grad_norm": 17.599262016486215, "kl": 0.0595703125, "learning_rate": 8.424741545470474e-07, "loss": 0.0238, "reward": 1.7447772026062012, "reward_std": 0.2339225709438324, "rewards/accuracy_reward_stage2": 0.7447772026062012, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 900 }, { "completion_length": 9.9375, "epoch": 0.15787629227264763, "grad_norm": 25.476523126982208, "kl": 0.0179443359375, "learning_rate": 8.422989311371999e-07, "loss": 0.0072, "reward": 1.4794890880584717, "reward_std": 0.23430663347244263, "rewards/accuracy_reward_stage2": 0.47948914766311646, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 901 }, { "completion_length": 12.46875, "epoch": 0.15805151568249518, "grad_norm": 24.442260381874355, "kl": 0.078125, "learning_rate": 8.421237077273524e-07, "loss": 0.0313, "reward": 1.6743123531341553, "reward_std": 0.19850794970989227, "rewards/accuracy_reward_stage2": 0.6743123531341553, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 902 }, { "completion_length": 9.015625, "epoch": 0.15822673909234275, "grad_norm": 21.14240758724476, "kl": 0.11328125, "learning_rate": 8.419484843175047e-07, "loss": 0.0056, "reward": 1.4228363037109375, "reward_std": 0.25364816188812256, "rewards/accuracy_reward_stage2": 0.5634613633155823, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 903 }, { "completion_length": 15.0, "epoch": 0.1584019625021903, "grad_norm": 20.79188296865029, "kl": 0.17578125, "learning_rate": 8.417732609076572e-07, "loss": -0.0182, "reward": 1.4797170162200928, "reward_std": 0.15435296297073364, "rewards/accuracy_reward_stage2": 0.5109670162200928, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 904 }, { "completion_length": 8.78125, "epoch": 0.15857718591203784, "grad_norm": 14.706728396656908, "kl": 0.08349609375, "learning_rate": 8.415980374978096e-07, "loss": 0.0334, "reward": 1.2869999408721924, "reward_std": 0.08212931454181671, "rewards/accuracy_reward_stage2": 0.5370000004768372, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 905 }, { "completion_length": 21.984375, "epoch": 0.1587524093218854, "grad_norm": 20.45850299415546, "kl": 0.1708984375, "learning_rate": 8.414228140879621e-07, "loss": 0.0685, "reward": 1.2784233093261719, "reward_std": 0.15729627013206482, "rewards/accuracy_reward_stage2": 0.40342339873313904, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 906 }, { "completion_length": 10.625, "epoch": 0.15892763273173297, "grad_norm": 20.26740604800182, "kl": 0.039794921875, "learning_rate": 8.412475906781146e-07, "loss": 0.0159, "reward": 1.5881588459014893, "reward_std": 0.14251913130283356, "rewards/accuracy_reward_stage2": 0.5881587862968445, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 907 }, { "completion_length": 9.09375, "epoch": 0.1591028561415805, "grad_norm": 19.96265675198023, "kl": 0.037353515625, "learning_rate": 8.41072367268267e-07, "loss": 0.0149, "reward": 1.71971595287323, "reward_std": 0.17054104804992676, "rewards/accuracy_reward_stage2": 0.71971595287323, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 908 }, { "completion_length": 9.953125, "epoch": 0.15927807955142806, "grad_norm": 20.517225673527474, "kl": 0.06787109375, "learning_rate": 8.408971438584195e-07, "loss": 0.027, "reward": 1.6317434310913086, "reward_std": 0.21083226799964905, "rewards/accuracy_reward_stage2": 0.6317434906959534, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 909 }, { "completion_length": 9.765625, "epoch": 0.15945330296127563, "grad_norm": 20.84710532062063, "kl": 0.03076171875, "learning_rate": 8.40721920448572e-07, "loss": 0.0123, "reward": 1.46875, "reward_std": 0.2619796395301819, "rewards/accuracy_reward_stage2": 0.46875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 910 }, { "completion_length": 11.0625, "epoch": 0.15962852637112318, "grad_norm": 13.841406690310421, "kl": 0.0196533203125, "learning_rate": 8.405466970387244e-07, "loss": -0.0363, "reward": 1.761332392692566, "reward_std": 0.11642816662788391, "rewards/accuracy_reward_stage2": 0.7769573330879211, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 911 }, { "completion_length": 8.28125, "epoch": 0.15980374978097073, "grad_norm": 17.91290633966341, "kl": 0.14453125, "learning_rate": 8.403714736288767e-07, "loss": 0.0579, "reward": 1.7218239307403564, "reward_std": 0.10979120433330536, "rewards/accuracy_reward_stage2": 0.7218239307403564, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 912 }, { "completion_length": 16.421875, "epoch": 0.1599789731908183, "grad_norm": 7060.804174203911, "kl": 31.375, "learning_rate": 8.401962502190291e-07, "loss": 12.5837, "reward": 1.574540376663208, "reward_std": 0.337665855884552, "rewards/accuracy_reward_stage2": 0.7151654362678528, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 913 }, { "completion_length": 12.03125, "epoch": 0.16015419660066585, "grad_norm": 22.485843157999255, "kl": 0.10498046875, "learning_rate": 8.400210268091816e-07, "loss": 0.0109, "reward": 1.547934651374817, "reward_std": 0.2350578010082245, "rewards/accuracy_reward_stage2": 0.6885595917701721, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 914 }, { "completion_length": 16.75, "epoch": 0.1603294200105134, "grad_norm": 22.681329137939034, "kl": 0.0751953125, "learning_rate": 8.398458033993341e-07, "loss": -0.1154, "reward": 1.7184510231018066, "reward_std": 0.23591922223567963, "rewards/accuracy_reward_stage2": 0.7809509634971619, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 915 }, { "completion_length": 6.515625, "epoch": 0.16050464342036097, "grad_norm": 22.439686965003617, "kl": 0.050048828125, "learning_rate": 8.396705799894865e-07, "loss": 0.0201, "reward": 1.5222173929214478, "reward_std": 0.18794281780719757, "rewards/accuracy_reward_stage2": 0.5222173929214478, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 916 }, { "completion_length": 9.9375, "epoch": 0.16067986683020852, "grad_norm": 20.917834139045315, "kl": 0.09521484375, "learning_rate": 8.39495356579639e-07, "loss": 0.0381, "reward": 1.3864175081253052, "reward_std": 0.1506006121635437, "rewards/accuracy_reward_stage2": 0.5114175081253052, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 917 }, { "completion_length": 19.0, "epoch": 0.16085509024005606, "grad_norm": 22.230505217804176, "kl": 0.08984375, "learning_rate": 8.393201331697915e-07, "loss": 0.0359, "reward": 1.5073747634887695, "reward_std": 0.1665232926607132, "rewards/accuracy_reward_stage2": 0.50737464427948, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 918 }, { "completion_length": 9.921875, "epoch": 0.16103031364990364, "grad_norm": 21.06591612131756, "kl": 0.11962890625, "learning_rate": 8.391449097599439e-07, "loss": 0.0479, "reward": 1.4632796049118042, "reward_std": 0.2604491710662842, "rewards/accuracy_reward_stage2": 0.5882796049118042, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 919 }, { "completion_length": 7.375, "epoch": 0.16120553705975119, "grad_norm": 23.745634816698153, "kl": 0.1220703125, "learning_rate": 8.389696863500964e-07, "loss": 0.0046, "reward": 1.4650936126708984, "reward_std": 0.25936761498451233, "rewards/accuracy_reward_stage2": 0.4807187020778656, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 920 }, { "completion_length": 12.0, "epoch": 0.16138076046959873, "grad_norm": 19.61377943867712, "kl": 0.1015625, "learning_rate": 8.387944629402488e-07, "loss": 0.0407, "reward": 1.5062568187713623, "reward_std": 0.24306440353393555, "rewards/accuracy_reward_stage2": 0.6312568187713623, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 921 }, { "completion_length": 12.8125, "epoch": 0.1615559838794463, "grad_norm": 23.541936474447102, "kl": 0.2060546875, "learning_rate": 8.386192395304013e-07, "loss": 0.0823, "reward": 1.4451262950897217, "reward_std": 0.20797914266586304, "rewards/accuracy_reward_stage2": 0.6951261758804321, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 922 }, { "completion_length": 10.5, "epoch": 0.16173120728929385, "grad_norm": 19.920258716182968, "kl": 0.330078125, "learning_rate": 8.384440161205537e-07, "loss": 0.145, "reward": 1.4631624221801758, "reward_std": 0.17326810956001282, "rewards/accuracy_reward_stage2": 0.5881624221801758, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 923 }, { "completion_length": 10.3125, "epoch": 0.1619064306991414, "grad_norm": 19.987624769580446, "kl": 0.07958984375, "learning_rate": 8.382687927107061e-07, "loss": -0.0358, "reward": 1.5927538871765137, "reward_std": 0.20818987488746643, "rewards/accuracy_reward_stage2": 0.7490040063858032, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 924 }, { "completion_length": 5.3125, "epoch": 0.16208165410898895, "grad_norm": 16.61358413552998, "kl": 0.01495361328125, "learning_rate": 8.380935693008585e-07, "loss": 0.006, "reward": 1.877314805984497, "reward_std": 0.14518392086029053, "rewards/accuracy_reward_stage2": 0.8773148059844971, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 925 }, { "completion_length": 12.421875, "epoch": 0.16225687751883652, "grad_norm": 20.542640522580562, "kl": 0.138671875, "learning_rate": 8.37918345891011e-07, "loss": 0.0555, "reward": 1.7388439178466797, "reward_std": 0.1671719253063202, "rewards/accuracy_reward_stage2": 0.7388438582420349, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 926 }, { "completion_length": 12.40625, "epoch": 0.16243210092868407, "grad_norm": 15.331506669605071, "kl": 0.12255859375, "learning_rate": 8.377431224811634e-07, "loss": 0.0491, "reward": 1.3322160243988037, "reward_std": 0.08134040981531143, "rewards/accuracy_reward_stage2": 0.4572159945964813, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 927 }, { "completion_length": 11.921875, "epoch": 0.16260732433853162, "grad_norm": 19.2444126838387, "kl": 0.028564453125, "learning_rate": 8.375678990713159e-07, "loss": -0.0328, "reward": 1.5271073579788208, "reward_std": 0.30369114875793457, "rewards/accuracy_reward_stage2": 0.6677324175834656, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 928 }, { "completion_length": 8.34375, "epoch": 0.1627825477483792, "grad_norm": 13.892006524907126, "kl": 0.0205078125, "learning_rate": 8.373926756614683e-07, "loss": 0.0082, "reward": 1.6059027910232544, "reward_std": 0.08084940165281296, "rewards/accuracy_reward_stage2": 0.6059027910232544, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 929 }, { "completion_length": 16.46875, "epoch": 0.16295777115822674, "grad_norm": 25.398573109817477, "kl": 0.197265625, "learning_rate": 8.372174522516208e-07, "loss": 0.0396, "reward": 1.4554202556610107, "reward_std": 0.1284077912569046, "rewards/accuracy_reward_stage2": 0.47104525566101074, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 930 }, { "completion_length": 12.984375, "epoch": 0.16313299456807429, "grad_norm": 21.605017942356415, "kl": 0.0693359375, "learning_rate": 8.370422288417733e-07, "loss": -0.0037, "reward": 1.4630324840545654, "reward_std": 0.38340917229652405, "rewards/accuracy_reward_stage2": 0.47865748405456543, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 931 }, { "completion_length": 9.09375, "epoch": 0.16330821797792186, "grad_norm": 22.13872771036885, "kl": 0.046142578125, "learning_rate": 8.368670054319256e-07, "loss": -0.0149, "reward": 1.28238844871521, "reward_std": 0.16411705315113068, "rewards/accuracy_reward_stage2": 0.29801347851753235, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 932 }, { "completion_length": 16.5, "epoch": 0.1634834413877694, "grad_norm": 18.146799048508555, "kl": 0.2412109375, "learning_rate": 8.366917820220781e-07, "loss": 0.063, "reward": 1.306018590927124, "reward_std": 0.2175029218196869, "rewards/accuracy_reward_stage2": 0.4466434717178345, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 933 }, { "completion_length": 10.03125, "epoch": 0.16365866479761695, "grad_norm": 15.35898759648924, "kl": 0.0517578125, "learning_rate": 8.365165586122306e-07, "loss": 0.0207, "reward": 1.464646339416504, "reward_std": 0.1268872767686844, "rewards/accuracy_reward_stage2": 0.46464625000953674, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 934 }, { "completion_length": 7.96875, "epoch": 0.16383388820746453, "grad_norm": 13.403024814567052, "kl": 0.04833984375, "learning_rate": 8.36341335202383e-07, "loss": 0.0193, "reward": 1.6848759651184082, "reward_std": 0.10018566995859146, "rewards/accuracy_reward_stage2": 0.6848759651184082, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 935 }, { "completion_length": 9.03125, "epoch": 0.16400911161731208, "grad_norm": 17.930941069831324, "kl": 0.08251953125, "learning_rate": 8.361661117925355e-07, "loss": -0.0112, "reward": 1.4982693195343018, "reward_std": 0.17483346164226532, "rewards/accuracy_reward_stage2": 0.513894259929657, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 936 }, { "completion_length": 10.5625, "epoch": 0.16418433502715962, "grad_norm": 22.799043223780735, "kl": 0.07421875, "learning_rate": 8.359908883826879e-07, "loss": -0.0142, "reward": 1.4645646810531616, "reward_std": 0.3297140598297119, "rewards/accuracy_reward_stage2": 0.48018965125083923, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 937 }, { "completion_length": 10.8125, "epoch": 0.1643595584370072, "grad_norm": 22.565240947724515, "kl": 0.068359375, "learning_rate": 8.358156649728403e-07, "loss": 0.0182, "reward": 1.7171674966812134, "reward_std": 0.07919125258922577, "rewards/accuracy_reward_stage2": 0.7327924966812134, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 938 }, { "completion_length": 10.625, "epoch": 0.16453478184685474, "grad_norm": 17.200532557520877, "kl": 0.04443359375, "learning_rate": 8.356404415629928e-07, "loss": -0.0128, "reward": 1.4361746311187744, "reward_std": 0.12804433703422546, "rewards/accuracy_reward_stage2": 0.5767996311187744, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 939 }, { "completion_length": 12.609375, "epoch": 0.1647100052567023, "grad_norm": 19.822105679449265, "kl": 0.30859375, "learning_rate": 8.354652181531452e-07, "loss": 0.079, "reward": 1.7268434762954712, "reward_std": 0.13828769326210022, "rewards/accuracy_reward_stage2": 0.8674684166908264, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 940 }, { "completion_length": 19.421875, "epoch": 0.16488522866654984, "grad_norm": 21.698097014939922, "kl": 0.07568359375, "learning_rate": 8.352899947432977e-07, "loss": 0.0081, "reward": 1.5953387022018433, "reward_std": 0.20249740779399872, "rewards/accuracy_reward_stage2": 0.6109637022018433, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 941 }, { "completion_length": 12.046875, "epoch": 0.1650604520763974, "grad_norm": 19.471630114890893, "kl": 0.177734375, "learning_rate": 8.3511477133345e-07, "loss": 0.071, "reward": 1.0085077285766602, "reward_std": 0.19060616195201874, "rewards/accuracy_reward_stage2": 0.2585076689720154, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 942 }, { "completion_length": 14.0, "epoch": 0.16523567548624496, "grad_norm": 18.459723557568463, "kl": 0.10595703125, "learning_rate": 8.349395479236025e-07, "loss": 0.0424, "reward": 1.3339991569519043, "reward_std": 0.1780506670475006, "rewards/accuracy_reward_stage2": 0.4589990973472595, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 943 }, { "completion_length": 8.59375, "epoch": 0.1654108988960925, "grad_norm": 21.711342619576424, "kl": 0.162109375, "learning_rate": 8.34764324513755e-07, "loss": -0.021, "reward": 1.465111494064331, "reward_std": 0.2542092800140381, "rewards/accuracy_reward_stage2": 0.621361494064331, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 944 }, { "completion_length": 15.78125, "epoch": 0.16558612230594008, "grad_norm": 12.957827896906505, "kl": 0.0224609375, "learning_rate": 8.345891011039074e-07, "loss": 0.009, "reward": 1.1822917461395264, "reward_std": 0.08154669404029846, "rewards/accuracy_reward_stage2": 0.3072916865348816, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 945 }, { "completion_length": 15.90625, "epoch": 0.16576134571578763, "grad_norm": 14.16153592665762, "kl": 0.056396484375, "learning_rate": 8.344138776940599e-07, "loss": 0.0226, "reward": 1.3079233169555664, "reward_std": 0.1660769283771515, "rewards/accuracy_reward_stage2": 0.5579233169555664, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 946 }, { "completion_length": 19.765625, "epoch": 0.16593656912563517, "grad_norm": 26.178065837337677, "kl": 0.10595703125, "learning_rate": 8.342386542842124e-07, "loss": 0.0107, "reward": 1.4617750644683838, "reward_std": 0.29273518919944763, "rewards/accuracy_reward_stage2": 0.47740012407302856, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 947 }, { "completion_length": 10.71875, "epoch": 0.16611179253548275, "grad_norm": 23.879074853142832, "kl": 0.150390625, "learning_rate": 8.340634308743648e-07, "loss": 0.0603, "reward": 1.5003604888916016, "reward_std": 0.1546555906534195, "rewards/accuracy_reward_stage2": 0.6253605484962463, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 948 }, { "completion_length": 11.625, "epoch": 0.1662870159453303, "grad_norm": 28.97017047697589, "kl": 0.08935546875, "learning_rate": 8.338882074645173e-07, "loss": 0.0358, "reward": 1.651204228401184, "reward_std": 0.17542898654937744, "rewards/accuracy_reward_stage2": 0.6512041687965393, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 949 }, { "completion_length": 12.015625, "epoch": 0.16646223935517784, "grad_norm": 24.650912731008486, "kl": 0.078125, "learning_rate": 8.337129840546698e-07, "loss": 0.0312, "reward": 1.4317245483398438, "reward_std": 0.16404840350151062, "rewards/accuracy_reward_stage2": 0.5567246079444885, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 950 }, { "completion_length": 8.265625, "epoch": 0.16663746276502542, "grad_norm": 25.199836114032845, "kl": 0.019775390625, "learning_rate": 8.335377606448221e-07, "loss": 0.0079, "reward": 1.5420386791229248, "reward_std": 0.2433536946773529, "rewards/accuracy_reward_stage2": 0.5420386791229248, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 951 }, { "completion_length": 10.9375, "epoch": 0.16681268617487297, "grad_norm": 168.31758714014757, "kl": 0.61328125, "learning_rate": 8.333625372349745e-07, "loss": 0.2342, "reward": 1.4494047164916992, "reward_std": 0.2671560049057007, "rewards/accuracy_reward_stage2": 0.590029776096344, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 952 }, { "completion_length": 11.53125, "epoch": 0.1669879095847205, "grad_norm": 21.797026985006323, "kl": 0.10595703125, "learning_rate": 8.331873138251269e-07, "loss": 0.0095, "reward": 1.448401689529419, "reward_std": 0.21111756563186646, "rewards/accuracy_reward_stage2": 0.46402665972709656, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 953 }, { "completion_length": 19.484375, "epoch": 0.1671631329945681, "grad_norm": 221.20774745773093, "kl": 0.8515625, "learning_rate": 8.330120904152794e-07, "loss": 0.3394, "reward": 1.4512382745742798, "reward_std": 0.15571743249893188, "rewards/accuracy_reward_stage2": 0.5762382745742798, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 954 }, { "completion_length": 8.921875, "epoch": 0.16733835640441563, "grad_norm": 17.292543256694714, "kl": 0.1318359375, "learning_rate": 8.328368670054319e-07, "loss": -0.0206, "reward": 1.715613603591919, "reward_std": 0.19096148014068604, "rewards/accuracy_reward_stage2": 0.746863603591919, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 955 }, { "completion_length": 13.265625, "epoch": 0.16751357981426318, "grad_norm": 20.08735695941081, "kl": 0.11669921875, "learning_rate": 8.326616435955843e-07, "loss": 0.0468, "reward": 1.4697279930114746, "reward_std": 0.1781584620475769, "rewards/accuracy_reward_stage2": 0.5947280526161194, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 956 }, { "completion_length": 28.828125, "epoch": 0.16768880322411073, "grad_norm": 302.9314365613275, "kl": 1.640625, "learning_rate": 8.324864201857368e-07, "loss": 0.6545, "reward": 1.36354398727417, "reward_std": 0.06286264955997467, "rewards/accuracy_reward_stage2": 0.6135439872741699, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 957 }, { "completion_length": 7.9375, "epoch": 0.1678640266339583, "grad_norm": 23.483468245101076, "kl": 0.10595703125, "learning_rate": 8.323111967758892e-07, "loss": 0.0424, "reward": 1.3482142686843872, "reward_std": 0.27842962741851807, "rewards/accuracy_reward_stage2": 0.4732142686843872, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 958 }, { "completion_length": 9.21875, "epoch": 0.16803925004380585, "grad_norm": 23.99415537775469, "kl": 0.12158203125, "learning_rate": 8.321359733660417e-07, "loss": 0.0152, "reward": 1.4995429515838623, "reward_std": 0.2657421827316284, "rewards/accuracy_reward_stage2": 0.5307928919792175, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 959 }, { "completion_length": 11.390625, "epoch": 0.1682144734536534, "grad_norm": 36.190832877832605, "kl": 0.0234375, "learning_rate": 8.319607499561942e-07, "loss": 0.0094, "reward": 1.620686650276184, "reward_std": 0.10265517234802246, "rewards/accuracy_reward_stage2": 0.6206865906715393, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 960 }, { "completion_length": 8.09375, "epoch": 0.16838969686350097, "grad_norm": 18.849443706981564, "kl": 0.0849609375, "learning_rate": 8.317855265463466e-07, "loss": -0.0143, "reward": 1.3765008449554443, "reward_std": 0.20061229169368744, "rewards/accuracy_reward_stage2": 0.4077509045600891, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 961 }, { "completion_length": 16.546875, "epoch": 0.16856492027334852, "grad_norm": 27.7507287897152, "kl": 0.7578125, "learning_rate": 8.31610303136499e-07, "loss": 0.3034, "reward": 1.4773821830749512, "reward_std": 0.04698540270328522, "rewards/accuracy_reward_stage2": 0.727382242679596, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 962 }, { "completion_length": 10.0625, "epoch": 0.16874014368319606, "grad_norm": 15.944671828158617, "kl": 0.0322265625, "learning_rate": 8.314350797266514e-07, "loss": -0.0643, "reward": 1.3591651916503906, "reward_std": 0.22507423162460327, "rewards/accuracy_reward_stage2": 0.390415221452713, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 963 }, { "completion_length": 13.140625, "epoch": 0.16891536709304364, "grad_norm": 21.640372689023383, "kl": 0.1826171875, "learning_rate": 8.312598563168038e-07, "loss": 0.0184, "reward": 1.2974778413772583, "reward_std": 0.22381377220153809, "rewards/accuracy_reward_stage2": 0.4537278115749359, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 964 }, { "completion_length": 22.359375, "epoch": 0.16909059050289119, "grad_norm": 21.901735100068972, "kl": 0.20703125, "learning_rate": 8.310846329069563e-07, "loss": 0.0957, "reward": 1.496659278869629, "reward_std": 0.1753218173980713, "rewards/accuracy_reward_stage2": 0.6216592788696289, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 965 }, { "completion_length": 10.328125, "epoch": 0.16926581391273873, "grad_norm": 20.0283372209553, "kl": 0.08740234375, "learning_rate": 8.309094094971087e-07, "loss": -0.0093, "reward": 1.6073633432388306, "reward_std": 0.212762713432312, "rewards/accuracy_reward_stage2": 0.6229883432388306, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 966 }, { "completion_length": 7.3125, "epoch": 0.1694410373225863, "grad_norm": 20.241618585052276, "kl": 0.1015625, "learning_rate": 8.307341860872612e-07, "loss": -0.0036, "reward": 1.5482078790664673, "reward_std": 0.2085924744606018, "rewards/accuracy_reward_stage2": 0.5638328790664673, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 967 }, { "completion_length": 9.546875, "epoch": 0.16961626073243385, "grad_norm": 17.233476394758718, "kl": 0.04638671875, "learning_rate": 8.305589626774137e-07, "loss": -0.0423, "reward": 1.6302083730697632, "reward_std": 0.1822493076324463, "rewards/accuracy_reward_stage2": 0.6614583730697632, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 968 }, { "completion_length": 27.1875, "epoch": 0.1697914841422814, "grad_norm": 19.06856915302853, "kl": 0.039794921875, "learning_rate": 8.303837392675661e-07, "loss": -0.1353, "reward": 1.31388258934021, "reward_std": 0.23465386033058167, "rewards/accuracy_reward_stage2": 0.3763824701309204, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 969 }, { "completion_length": 13.625, "epoch": 0.16996670755212898, "grad_norm": 21.596284107801807, "kl": 0.3828125, "learning_rate": 8.302085158577186e-07, "loss": 0.1202, "reward": 1.1787537336349487, "reward_std": 0.16805896162986755, "rewards/accuracy_reward_stage2": 0.31937870383262634, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 970 }, { "completion_length": 7.4375, "epoch": 0.17014193096197652, "grad_norm": 16.395785575312768, "kl": 0.038818359375, "learning_rate": 8.300332924478711e-07, "loss": -0.0287, "reward": 1.5564574003219604, "reward_std": 0.1440478414297104, "rewards/accuracy_reward_stage2": 0.5720824003219604, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 971 }, { "completion_length": 8.671875, "epoch": 0.17031715437182407, "grad_norm": 14.197834374550085, "kl": 0.03125, "learning_rate": 8.298580690380234e-07, "loss": 0.0126, "reward": 1.618015170097351, "reward_std": 0.0638478696346283, "rewards/accuracy_reward_stage2": 0.6180151700973511, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 972 }, { "completion_length": 9.21875, "epoch": 0.17049237778167164, "grad_norm": 18.13707578787983, "kl": 0.07080078125, "learning_rate": 8.296828456281759e-07, "loss": 0.0284, "reward": 1.5550284385681152, "reward_std": 0.16677148640155792, "rewards/accuracy_reward_stage2": 0.5550283789634705, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 973 }, { "completion_length": 9.671875, "epoch": 0.1706676011915192, "grad_norm": 22.426704935069942, "kl": 0.028564453125, "learning_rate": 8.295076222183283e-07, "loss": -0.0747, "reward": 1.7982523441314697, "reward_std": 0.2379818707704544, "rewards/accuracy_reward_stage2": 0.8295024633407593, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 974 }, { "completion_length": 11.40625, "epoch": 0.17084282460136674, "grad_norm": 23.30227030147312, "kl": 0.0693359375, "learning_rate": 8.293323988084808e-07, "loss": 0.0277, "reward": 1.4609272480010986, "reward_std": 0.29046812653541565, "rewards/accuracy_reward_stage2": 0.46092718839645386, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 975 }, { "completion_length": 9.875, "epoch": 0.17101804801121429, "grad_norm": 11.607327794748048, "kl": 0.09375, "learning_rate": 8.291571753986332e-07, "loss": 0.0375, "reward": 1.6811981201171875, "reward_std": 0.05087604746222496, "rewards/accuracy_reward_stage2": 0.8061981201171875, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 976 }, { "completion_length": 20.390625, "epoch": 0.17119327142106186, "grad_norm": 19.407638180474976, "kl": 0.06396484375, "learning_rate": 8.289819519887856e-07, "loss": 0.0256, "reward": 1.4600509405136108, "reward_std": 0.21399806439876556, "rewards/accuracy_reward_stage2": 0.46005094051361084, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 977 }, { "completion_length": 8.484375, "epoch": 0.1713684948309094, "grad_norm": 23.060799171899124, "kl": 0.0703125, "learning_rate": 8.288067285789381e-07, "loss": 0.028, "reward": 1.6289044618606567, "reward_std": 0.27257654070854187, "rewards/accuracy_reward_stage2": 0.628904402256012, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 978 }, { "completion_length": 8.84375, "epoch": 0.17154371824075695, "grad_norm": 19.641303748012547, "kl": 0.046142578125, "learning_rate": 8.286315051690906e-07, "loss": 0.0185, "reward": 1.4471064805984497, "reward_std": 0.18143045902252197, "rewards/accuracy_reward_stage2": 0.4471064805984497, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 979 }, { "completion_length": 18.9375, "epoch": 0.17171894165060453, "grad_norm": 19.427931995450898, "kl": 0.06298828125, "learning_rate": 8.28456281759243e-07, "loss": -0.0189, "reward": 1.4050720930099487, "reward_std": 0.19070225954055786, "rewards/accuracy_reward_stage2": 0.42069703340530396, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 980 }, { "completion_length": 9.203125, "epoch": 0.17189416506045208, "grad_norm": 15.666858070238954, "kl": 0.099609375, "learning_rate": 8.282810583493955e-07, "loss": -0.0777, "reward": 1.551900863647461, "reward_std": 0.21262939274311066, "rewards/accuracy_reward_stage2": 0.5987757444381714, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 981 }, { "completion_length": 15.375, "epoch": 0.17206938847029962, "grad_norm": 15.354945603315256, "kl": 0.08251953125, "learning_rate": 8.281058349395478e-07, "loss": 0.0331, "reward": 1.2965600490570068, "reward_std": 0.09104090929031372, "rewards/accuracy_reward_stage2": 0.42156004905700684, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 982 }, { "completion_length": 12.09375, "epoch": 0.1722446118801472, "grad_norm": 22.37440013286015, "kl": 0.038330078125, "learning_rate": 8.279306115297003e-07, "loss": 0.0153, "reward": 1.5221551656723022, "reward_std": 0.2770881652832031, "rewards/accuracy_reward_stage2": 0.5221551656723022, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 983 }, { "completion_length": 7.234375, "epoch": 0.17241983528999474, "grad_norm": 16.939924437153255, "kl": 0.00830078125, "learning_rate": 8.277553881198528e-07, "loss": 0.0033, "reward": 1.7744736671447754, "reward_std": 0.17920680344104767, "rewards/accuracy_reward_stage2": 0.7744735479354858, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 984 }, { "completion_length": 22.359375, "epoch": 0.1725950586998423, "grad_norm": 3498.02854669452, "kl": 9.5, "learning_rate": 8.275801647100052e-07, "loss": 3.7769, "reward": 1.7484500408172607, "reward_std": 0.09648245573043823, "rewards/accuracy_reward_stage2": 0.8734498620033264, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 985 }, { "completion_length": 9.734375, "epoch": 0.17277028210968987, "grad_norm": 17.801115214882383, "kl": 0.0634765625, "learning_rate": 8.274049413001577e-07, "loss": 0.0253, "reward": 1.5197932720184326, "reward_std": 0.16293829679489136, "rewards/accuracy_reward_stage2": 0.5197933912277222, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 986 }, { "completion_length": 10.5, "epoch": 0.1729455055195374, "grad_norm": 17.742698414488256, "kl": 0.07470703125, "learning_rate": 8.272297178903102e-07, "loss": 0.0299, "reward": 1.6276013851165771, "reward_std": 0.19804228842258453, "rewards/accuracy_reward_stage2": 0.6276013851165771, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 987 }, { "completion_length": 10.75, "epoch": 0.17312072892938496, "grad_norm": 20.169397919693026, "kl": 0.054443359375, "learning_rate": 8.270544944804626e-07, "loss": 0.0218, "reward": 1.4787770509719849, "reward_std": 0.2628193199634552, "rewards/accuracy_reward_stage2": 0.4787770211696625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 988 }, { "completion_length": 15.921875, "epoch": 0.17329595233923253, "grad_norm": 19.564975038357, "kl": 0.11767578125, "learning_rate": 8.26879271070615e-07, "loss": 0.0472, "reward": 1.4452990293502808, "reward_std": 0.1860145926475525, "rewards/accuracy_reward_stage2": 0.6952989101409912, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 989 }, { "completion_length": 17.71875, "epoch": 0.17347117574908008, "grad_norm": 24.568791594952728, "kl": 0.034423828125, "learning_rate": 8.267040476607674e-07, "loss": 0.0137, "reward": 1.5325812101364136, "reward_std": 0.26802152395248413, "rewards/accuracy_reward_stage2": 0.5325811505317688, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 990 }, { "completion_length": 11.234375, "epoch": 0.17364639915892763, "grad_norm": 31.406451632987274, "kl": 0.046142578125, "learning_rate": 8.265288242509199e-07, "loss": 0.0185, "reward": 1.516391634941101, "reward_std": 0.1867346167564392, "rewards/accuracy_reward_stage2": 0.5163915753364563, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 991 }, { "completion_length": 14.453125, "epoch": 0.17382162256877517, "grad_norm": 24.623943884661465, "kl": 0.1875, "learning_rate": 8.263536008410723e-07, "loss": 0.0753, "reward": 1.708542823791504, "reward_std": 0.16841836273670197, "rewards/accuracy_reward_stage2": 0.8335429430007935, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 992 }, { "completion_length": 8.453125, "epoch": 0.17399684597862275, "grad_norm": 566.4321454989408, "kl": 3.0625, "learning_rate": 8.261783774312247e-07, "loss": 1.1852, "reward": 1.7703094482421875, "reward_std": 0.19710037112236023, "rewards/accuracy_reward_stage2": 0.785934329032898, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 993 }, { "completion_length": 9.65625, "epoch": 0.1741720693884703, "grad_norm": 21.60589644918224, "kl": 0.166015625, "learning_rate": 8.260031540213772e-07, "loss": 0.0663, "reward": 1.2493162155151367, "reward_std": 0.2732129991054535, "rewards/accuracy_reward_stage2": 0.37431615591049194, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 994 }, { "completion_length": 9.140625, "epoch": 0.17434729279831784, "grad_norm": 567.350235789533, "kl": 3.3125, "learning_rate": 8.258279306115297e-07, "loss": 1.2209, "reward": 1.3685557842254639, "reward_std": 0.2889344096183777, "rewards/accuracy_reward_stage2": 0.41543081402778625, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 995 }, { "completion_length": 13.421875, "epoch": 0.17452251620816542, "grad_norm": 132.5916422675832, "kl": 1.015625, "learning_rate": 8.256527072016821e-07, "loss": 0.3629, "reward": 1.3894778490066528, "reward_std": 0.082199826836586, "rewards/accuracy_reward_stage2": 0.6551028490066528, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 996 }, { "completion_length": 7.515625, "epoch": 0.17469773961801296, "grad_norm": 14.370187452272509, "kl": 0.1826171875, "learning_rate": 8.254774837918346e-07, "loss": 0.0285, "reward": 1.5352981090545654, "reward_std": 0.11823684722185135, "rewards/accuracy_reward_stage2": 0.5509230494499207, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 997 }, { "completion_length": 8.09375, "epoch": 0.1748729630278605, "grad_norm": 24.826995542235746, "kl": 0.04345703125, "learning_rate": 8.25302260381987e-07, "loss": 0.0174, "reward": 1.6802245378494263, "reward_std": 0.20285210013389587, "rewards/accuracy_reward_stage2": 0.6802244782447815, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 998 }, { "completion_length": 12.46875, "epoch": 0.1750481864377081, "grad_norm": 35.334960123002354, "kl": 0.0517578125, "learning_rate": 8.251270369721395e-07, "loss": 0.0209, "reward": 1.8040918111801147, "reward_std": 0.2325887829065323, "rewards/accuracy_reward_stage2": 0.8040918111801147, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 999 }, { "completion_length": 9.328125, "epoch": 0.17522340984755563, "grad_norm": 24.793032711533158, "kl": 0.10009765625, "learning_rate": 8.24951813562292e-07, "loss": 0.0399, "reward": 1.706731915473938, "reward_std": 0.34516337513923645, "rewards/accuracy_reward_stage2": 0.7067318558692932, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1000 }, { "completion_length": 12.09375, "epoch": 0.17539863325740318, "grad_norm": 21.874107086733733, "kl": 0.05615234375, "learning_rate": 8.247765901524442e-07, "loss": -0.0217, "reward": 1.6877598762512207, "reward_std": 0.2215467244386673, "rewards/accuracy_reward_stage2": 0.7033848762512207, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1001 }, { "completion_length": 11.265625, "epoch": 0.17557385666725076, "grad_norm": 18.000764914593567, "kl": 0.30078125, "learning_rate": 8.246013667425967e-07, "loss": 0.0758, "reward": 1.5376827716827393, "reward_std": 0.24442484974861145, "rewards/accuracy_reward_stage2": 0.6783077120780945, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1002 }, { "completion_length": 7.265625, "epoch": 0.1757490800770983, "grad_norm": 18.565844961588045, "kl": 0.05517578125, "learning_rate": 8.244261433327491e-07, "loss": 0.0221, "reward": 1.7236640453338623, "reward_std": 0.2000371664762497, "rewards/accuracy_reward_stage2": 0.7236641049385071, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1003 }, { "completion_length": 13.6875, "epoch": 0.17592430348694585, "grad_norm": 20.088877733252122, "kl": 0.396484375, "learning_rate": 8.242509199229016e-07, "loss": 0.1152, "reward": 1.3435370922088623, "reward_std": 0.14601978659629822, "rewards/accuracy_reward_stage2": 0.48416221141815186, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1004 }, { "completion_length": 13.53125, "epoch": 0.17609952689679342, "grad_norm": 16.471332796592286, "kl": 0.061767578125, "learning_rate": 8.240756965130541e-07, "loss": 0.0247, "reward": 1.5413925647735596, "reward_std": 0.1500665247440338, "rewards/accuracy_reward_stage2": 0.5413926243782043, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1005 }, { "completion_length": 8.765625, "epoch": 0.17627475030664097, "grad_norm": 27.168697897342867, "kl": 0.1875, "learning_rate": 8.239004731032065e-07, "loss": -0.0037, "reward": 1.568939447402954, "reward_std": 0.2704058885574341, "rewards/accuracy_reward_stage2": 0.7251893877983093, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1006 }, { "completion_length": 10.8125, "epoch": 0.17644997371648852, "grad_norm": 22.345118749931096, "kl": 0.2060546875, "learning_rate": 8.23725249693359e-07, "loss": 0.0025, "reward": 1.7439064979553223, "reward_std": 0.2780749499797821, "rewards/accuracy_reward_stage2": 0.7751563787460327, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1007 }, { "completion_length": 19.296875, "epoch": 0.1766251971263361, "grad_norm": 29.612030789420974, "kl": 0.07177734375, "learning_rate": 8.235500262835115e-07, "loss": 0.0288, "reward": 1.6531291007995605, "reward_std": 0.1276826113462448, "rewards/accuracy_reward_stage2": 0.6531291007995605, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1008 }, { "completion_length": 7.890625, "epoch": 0.17680042053618364, "grad_norm": 20.102240938891864, "kl": 0.044921875, "learning_rate": 8.233748028736639e-07, "loss": -0.0262, "reward": 1.6397186517715454, "reward_std": 0.2941434979438782, "rewards/accuracy_reward_stage2": 0.6553436517715454, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1009 }, { "completion_length": 12.5, "epoch": 0.17697564394603119, "grad_norm": 27.096622355934347, "kl": 0.035888671875, "learning_rate": 8.231995794638164e-07, "loss": -0.0298, "reward": 1.3705095052719116, "reward_std": 0.1347227394580841, "rewards/accuracy_reward_stage2": 0.38613444566726685, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1010 }, { "completion_length": 11.421875, "epoch": 0.17715086735587873, "grad_norm": 19.0051264155405, "kl": 0.046142578125, "learning_rate": 8.230243560539689e-07, "loss": 0.0185, "reward": 1.4608999490737915, "reward_std": 0.1947353333234787, "rewards/accuracy_reward_stage2": 0.5858998894691467, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1011 }, { "completion_length": 10.65625, "epoch": 0.1773260907657263, "grad_norm": 13.807597260017692, "kl": 0.072265625, "learning_rate": 8.228491326441212e-07, "loss": -0.1012, "reward": 1.7135417461395264, "reward_std": 0.17712606489658356, "rewards/accuracy_reward_stage2": 0.7604166269302368, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1012 }, { "completion_length": 14.15625, "epoch": 0.17750131417557385, "grad_norm": 16.648019138029532, "kl": 0.1962890625, "learning_rate": 8.226739092342737e-07, "loss": 0.0784, "reward": 1.5950548648834229, "reward_std": 0.12263785302639008, "rewards/accuracy_reward_stage2": 0.7200549840927124, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1013 }, { "completion_length": 10.0, "epoch": 0.1776765375854214, "grad_norm": 22.397243983004504, "kl": 0.1416015625, "learning_rate": 8.22498685824426e-07, "loss": -0.0199, "reward": 1.4478588104248047, "reward_std": 0.23456881940364838, "rewards/accuracy_reward_stage2": 0.4791087210178375, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1014 }, { "completion_length": 11.5, "epoch": 0.17785176099526898, "grad_norm": 17.19528296606665, "kl": 0.08544921875, "learning_rate": 8.223234624145785e-07, "loss": -0.0101, "reward": 1.459334135055542, "reward_std": 0.2601989209651947, "rewards/accuracy_reward_stage2": 0.474959135055542, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1015 }, { "completion_length": 11.171875, "epoch": 0.17802698440511652, "grad_norm": 22.32363164934085, "kl": 0.052001953125, "learning_rate": 8.22148239004731e-07, "loss": -0.0141, "reward": 1.3699970245361328, "reward_std": 0.1954904943704605, "rewards/accuracy_reward_stage2": 0.5106220245361328, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1016 }, { "completion_length": 9.171875, "epoch": 0.17820220781496407, "grad_norm": 21.83425997837965, "kl": 0.06201171875, "learning_rate": 8.219730155948834e-07, "loss": -0.0194, "reward": 1.7290661334991455, "reward_std": 0.18619462847709656, "rewards/accuracy_reward_stage2": 0.7446911931037903, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1017 }, { "completion_length": 8.015625, "epoch": 0.17837743122481164, "grad_norm": 14.028706814796353, "kl": 0.040771484375, "learning_rate": 8.217977921850359e-07, "loss": 0.0163, "reward": 1.59375, "reward_std": 0.1462521106004715, "rewards/accuracy_reward_stage2": 0.59375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1018 }, { "completion_length": 12.828125, "epoch": 0.1785526546346592, "grad_norm": 16.785372668563895, "kl": 0.042236328125, "learning_rate": 8.216225687751883e-07, "loss": 0.0169, "reward": 1.6821075677871704, "reward_std": 0.17112760245800018, "rewards/accuracy_reward_stage2": 0.6821075677871704, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1019 }, { "completion_length": 17.859375, "epoch": 0.17872787804450674, "grad_norm": 17.11698704652708, "kl": 0.015869140625, "learning_rate": 8.214473453653408e-07, "loss": -0.0378, "reward": 1.546691656112671, "reward_std": 0.06553763151168823, "rewards/accuracy_reward_stage2": 0.5623167157173157, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1020 }, { "completion_length": 8.203125, "epoch": 0.1789031014543543, "grad_norm": 23.871929503809273, "kl": 0.095703125, "learning_rate": 8.212721219554933e-07, "loss": -0.063, "reward": 1.7383593320846558, "reward_std": 0.20009878277778625, "rewards/accuracy_reward_stage2": 0.7852343320846558, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1021 }, { "completion_length": 9.84375, "epoch": 0.17907832486420186, "grad_norm": 30.169501250704013, "kl": 0.10888671875, "learning_rate": 8.210968985456456e-07, "loss": 0.0435, "reward": 1.5823220014572144, "reward_std": 0.22581787407398224, "rewards/accuracy_reward_stage2": 0.7073220014572144, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1022 }, { "completion_length": 11.484375, "epoch": 0.1792535482740494, "grad_norm": 20.79046787185343, "kl": 0.10888671875, "learning_rate": 8.209216751357981e-07, "loss": -0.0007, "reward": 1.5117167234420776, "reward_std": 0.21293523907661438, "rewards/accuracy_reward_stage2": 0.5273416638374329, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1023 }, { "completion_length": 12.6875, "epoch": 0.17942877168389698, "grad_norm": 21.841933395446556, "kl": 0.058837890625, "learning_rate": 8.207464517259506e-07, "loss": 0.002, "reward": 1.5082931518554688, "reward_std": 0.2580886483192444, "rewards/accuracy_reward_stage2": 0.5239181518554688, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1024 }, { "completion_length": 10.703125, "epoch": 0.17960399509374453, "grad_norm": 27.915163388669363, "kl": 0.13671875, "learning_rate": 8.20571228316103e-07, "loss": 0.0115, "reward": 1.5165634155273438, "reward_std": 0.245716854929924, "rewards/accuracy_reward_stage2": 0.5478134155273438, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1025 }, { "completion_length": 6.109375, "epoch": 0.17977921850359208, "grad_norm": 12.013743742666504, "kl": 0.010498046875, "learning_rate": 8.203960049062555e-07, "loss": 0.0042, "reward": 1.8125, "reward_std": 0.06681530922651291, "rewards/accuracy_reward_stage2": 0.8125, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1026 }, { "completion_length": 11.15625, "epoch": 0.17995444191343962, "grad_norm": 160.2641583697851, "kl": 0.59765625, "learning_rate": 8.202207814964078e-07, "loss": 0.2043, "reward": 1.437111496925354, "reward_std": 0.32900726795196533, "rewards/accuracy_reward_stage2": 0.702736496925354, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1027 }, { "completion_length": 10.65625, "epoch": 0.1801296653232872, "grad_norm": 25.617864049464025, "kl": 0.03515625, "learning_rate": 8.200455580865603e-07, "loss": 0.0141, "reward": 1.6387648582458496, "reward_std": 0.2850415110588074, "rewards/accuracy_reward_stage2": 0.6387649178504944, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1028 }, { "completion_length": 8.015625, "epoch": 0.18030488873313474, "grad_norm": 22.632944374449437, "kl": 0.04150390625, "learning_rate": 8.198703346767128e-07, "loss": 0.0166, "reward": 1.4933924674987793, "reward_std": 0.17765173316001892, "rewards/accuracy_reward_stage2": 0.7433923482894897, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1029 }, { "completion_length": 7.609375, "epoch": 0.1804801121429823, "grad_norm": 11.832671025211214, "kl": 0.00701904296875, "learning_rate": 8.196951112668652e-07, "loss": 0.0028, "reward": 1.65625, "reward_std": 0.0578637570142746, "rewards/accuracy_reward_stage2": 0.65625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1030 }, { "completion_length": 11.25, "epoch": 0.18065533555282987, "grad_norm": 21.565992219102643, "kl": 0.07861328125, "learning_rate": 8.195198878570176e-07, "loss": 0.0315, "reward": 1.5944758653640747, "reward_std": 0.23725871741771698, "rewards/accuracy_reward_stage2": 0.5944758653640747, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1031 }, { "completion_length": 9.25, "epoch": 0.1808305589626774, "grad_norm": 19.56751291277132, "kl": 0.058837890625, "learning_rate": 8.193446644471701e-07, "loss": 0.0235, "reward": 1.4681397676467896, "reward_std": 0.2315388321876526, "rewards/accuracy_reward_stage2": 0.5931397676467896, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1032 }, { "completion_length": 11.8125, "epoch": 0.18100578237252496, "grad_norm": 21.39438530811492, "kl": 0.052734375, "learning_rate": 8.191694410373225e-07, "loss": 0.0211, "reward": 1.8061552047729492, "reward_std": 0.13493405282497406, "rewards/accuracy_reward_stage2": 0.8061552047729492, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1033 }, { "completion_length": 9.546875, "epoch": 0.18118100578237253, "grad_norm": 21.69549112720359, "kl": 0.060302734375, "learning_rate": 8.18994217627475e-07, "loss": -0.0642, "reward": 1.5303882360458374, "reward_std": 0.3947955071926117, "rewards/accuracy_reward_stage2": 0.5616382360458374, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1034 }, { "completion_length": 13.515625, "epoch": 0.18135622919222008, "grad_norm": 16.958675766024093, "kl": 0.1796875, "learning_rate": 8.188189942176274e-07, "loss": 0.0719, "reward": 1.3252782821655273, "reward_std": 0.2296641618013382, "rewards/accuracy_reward_stage2": 0.4502781629562378, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1035 }, { "completion_length": 12.28125, "epoch": 0.18153145260206763, "grad_norm": 14.465418387381794, "kl": 0.17578125, "learning_rate": 8.186437708077799e-07, "loss": 0.0261, "reward": 1.515625, "reward_std": 0.1530819982290268, "rewards/accuracy_reward_stage2": 0.65625, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1036 }, { "completion_length": 9.265625, "epoch": 0.1817066760119152, "grad_norm": 20.9024336904025, "kl": 0.14453125, "learning_rate": 8.184685473979324e-07, "loss": -0.0128, "reward": 1.6768765449523926, "reward_std": 0.1317557990550995, "rewards/accuracy_reward_stage2": 0.7081265449523926, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1037 }, { "completion_length": 11.71875, "epoch": 0.18188189942176275, "grad_norm": 25.559950008766783, "kl": 0.0810546875, "learning_rate": 8.182933239880848e-07, "loss": -0.0117, "reward": 1.5764124393463135, "reward_std": 0.23896048963069916, "rewards/accuracy_reward_stage2": 0.5920374393463135, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1038 }, { "completion_length": 34.03125, "epoch": 0.1820571228316103, "grad_norm": 25.275923160610986, "kl": 0.09765625, "learning_rate": 8.181181005782373e-07, "loss": 0.0392, "reward": 1.6226279735565186, "reward_std": 0.087415412068367, "rewards/accuracy_reward_stage2": 0.6226279735565186, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1039 }, { "completion_length": 7.84375, "epoch": 0.18223234624145787, "grad_norm": 23.407709353813917, "kl": 0.061767578125, "learning_rate": 8.179428771683897e-07, "loss": -0.0194, "reward": 1.4883452653884888, "reward_std": 0.21674522757530212, "rewards/accuracy_reward_stage2": 0.5039702653884888, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1040 }, { "completion_length": 12.578125, "epoch": 0.18240756965130542, "grad_norm": 18.522952285363942, "kl": 0.07568359375, "learning_rate": 8.17767653758542e-07, "loss": -0.0076, "reward": 1.6132853031158447, "reward_std": 0.1885463446378708, "rewards/accuracy_reward_stage2": 0.6289101839065552, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1041 }, { "completion_length": 10.578125, "epoch": 0.18258279306115296, "grad_norm": 33.825003381200695, "kl": 0.1611328125, "learning_rate": 8.175924303486945e-07, "loss": -0.0031, "reward": 1.6792283058166504, "reward_std": 0.24878743290901184, "rewards/accuracy_reward_stage2": 0.7104784250259399, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1042 }, { "completion_length": 15.03125, "epoch": 0.18275801647100054, "grad_norm": 20.278543151963856, "kl": 0.06298828125, "learning_rate": 8.174172069388469e-07, "loss": 0.0253, "reward": 1.1826815605163574, "reward_std": 0.17803940176963806, "rewards/accuracy_reward_stage2": 0.30768144130706787, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1043 }, { "completion_length": 10.90625, "epoch": 0.1829332398808481, "grad_norm": 24.402430870338062, "kl": 0.0732421875, "learning_rate": 8.172419835289994e-07, "loss": 0.0294, "reward": 1.543660283088684, "reward_std": 0.1427423655986786, "rewards/accuracy_reward_stage2": 0.5436602830886841, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1044 }, { "completion_length": 14.140625, "epoch": 0.18310846329069563, "grad_norm": 32.14133710365091, "kl": 0.37109375, "learning_rate": 8.170667601191519e-07, "loss": 0.149, "reward": 1.4166667461395264, "reward_std": 0.3040403723716736, "rewards/accuracy_reward_stage2": 0.5416666865348816, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1045 }, { "completion_length": 5.453125, "epoch": 0.18328368670054318, "grad_norm": 15.213673839148703, "kl": 0.13671875, "learning_rate": 8.168915367093043e-07, "loss": 0.0103, "reward": 1.5307811498641968, "reward_std": 0.07436943054199219, "rewards/accuracy_reward_stage2": 0.6714061498641968, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1046 }, { "completion_length": 10.34375, "epoch": 0.18345891011039075, "grad_norm": 19.228850233741067, "kl": 0.06298828125, "learning_rate": 8.167163132994568e-07, "loss": 0.0253, "reward": 1.7453439235687256, "reward_std": 0.1812073290348053, "rewards/accuracy_reward_stage2": 0.7453439235687256, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1047 }, { "completion_length": 9.078125, "epoch": 0.1836341335202383, "grad_norm": 23.540814203428233, "kl": 0.1689453125, "learning_rate": 8.165410898896093e-07, "loss": -0.008, "reward": 1.5110549926757812, "reward_std": 0.22013264894485474, "rewards/accuracy_reward_stage2": 0.5423049330711365, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1048 }, { "completion_length": 7.328125, "epoch": 0.18380935693008585, "grad_norm": 40.582636908844044, "kl": 0.3046875, "learning_rate": 8.163658664797617e-07, "loss": 0.0398, "reward": 1.451476812362671, "reward_std": 0.20129188895225525, "rewards/accuracy_reward_stage2": 0.4827268123626709, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1049 }, { "completion_length": 10.84375, "epoch": 0.18398458033993342, "grad_norm": 27.574486341804228, "kl": 0.07421875, "learning_rate": 8.161906430699142e-07, "loss": 0.0297, "reward": 1.5007617473602295, "reward_std": 0.35384151339530945, "rewards/accuracy_reward_stage2": 0.5007617473602295, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1050 }, { "completion_length": 11.828125, "epoch": 0.18415980374978097, "grad_norm": 32.54566477332695, "kl": 0.353515625, "learning_rate": 8.160154196600665e-07, "loss": 0.1412, "reward": 1.5478098392486572, "reward_std": 0.31436848640441895, "rewards/accuracy_reward_stage2": 0.6728098392486572, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1051 }, { "completion_length": 14.953125, "epoch": 0.18433502715962852, "grad_norm": 20.455856754457844, "kl": 0.0849609375, "learning_rate": 8.158401962502189e-07, "loss": 0.0341, "reward": 1.5930328369140625, "reward_std": 0.17691928148269653, "rewards/accuracy_reward_stage2": 0.593032956123352, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1052 }, { "completion_length": 9.65625, "epoch": 0.1845102505694761, "grad_norm": 26.652573712542743, "kl": 0.21484375, "learning_rate": 8.156649728403714e-07, "loss": 0.0573, "reward": 1.5435185432434082, "reward_std": 0.28644296526908875, "rewards/accuracy_reward_stage2": 0.6841434836387634, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1053 }, { "completion_length": 10.015625, "epoch": 0.18468547397932364, "grad_norm": 23.14445149802474, "kl": 0.060546875, "learning_rate": 8.154897494305238e-07, "loss": 0.0242, "reward": 1.634928584098816, "reward_std": 0.2715989947319031, "rewards/accuracy_reward_stage2": 0.7599285840988159, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1054 }, { "completion_length": 9.59375, "epoch": 0.18486069738917119, "grad_norm": 15.48288506366053, "kl": 0.34765625, "learning_rate": 8.153145260206763e-07, "loss": 0.0949, "reward": 1.5993139743804932, "reward_std": 0.16807261109352112, "rewards/accuracy_reward_stage2": 0.7399389743804932, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1055 }, { "completion_length": 12.4375, "epoch": 0.18503592079901876, "grad_norm": 22.38243459661095, "kl": 0.2216796875, "learning_rate": 8.151393026108288e-07, "loss": -0.0003, "reward": 1.511639952659607, "reward_std": 0.2581867277622223, "rewards/accuracy_reward_stage2": 0.5585149526596069, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1056 }, { "completion_length": 10.203125, "epoch": 0.1852111442088663, "grad_norm": 20.25981681681545, "kl": 0.0859375, "learning_rate": 8.149640792009812e-07, "loss": 0.0343, "reward": 1.7087209224700928, "reward_std": 0.14371052384376526, "rewards/accuracy_reward_stage2": 0.7087209224700928, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1057 }, { "completion_length": 12.109375, "epoch": 0.18538636761871385, "grad_norm": 22.906658516478977, "kl": 0.2890625, "learning_rate": 8.147888557911337e-07, "loss": 0.0715, "reward": 1.4852299690246582, "reward_std": 0.17032143473625183, "rewards/accuracy_reward_stage2": 0.6258548498153687, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1058 }, { "completion_length": 9.859375, "epoch": 0.18556159102856143, "grad_norm": 26.553366802136384, "kl": 0.09375, "learning_rate": 8.146136323812861e-07, "loss": 0.0159, "reward": 1.4717214107513428, "reward_std": 0.28934115171432495, "rewards/accuracy_reward_stage2": 0.48734647035598755, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1059 }, { "completion_length": 10.28125, "epoch": 0.18573681443840898, "grad_norm": 21.046621537109452, "kl": 0.056396484375, "learning_rate": 8.144384089714386e-07, "loss": 0.0225, "reward": 1.7228630781173706, "reward_std": 0.0990498960018158, "rewards/accuracy_reward_stage2": 0.7228630185127258, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1060 }, { "completion_length": 9.53125, "epoch": 0.18591203784825652, "grad_norm": 18.7875170669259, "kl": 0.072265625, "learning_rate": 8.14263185561591e-07, "loss": 0.029, "reward": 1.4805216789245605, "reward_std": 0.11815441399812698, "rewards/accuracy_reward_stage2": 0.48052167892456055, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1061 }, { "completion_length": 6.921875, "epoch": 0.18608726125810407, "grad_norm": 18.409645335365852, "kl": 0.0693359375, "learning_rate": 8.140879621517434e-07, "loss": -0.0163, "reward": 1.6867990493774414, "reward_std": 0.21753624081611633, "rewards/accuracy_reward_stage2": 0.7024240493774414, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1062 }, { "completion_length": 11.0, "epoch": 0.18626248466795164, "grad_norm": 20.982584348741664, "kl": 0.08740234375, "learning_rate": 8.139127387418959e-07, "loss": 0.0351, "reward": 1.4232040643692017, "reward_std": 0.16690698266029358, "rewards/accuracy_reward_stage2": 0.5482040643692017, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1063 }, { "completion_length": 22.359375, "epoch": 0.1864377080777992, "grad_norm": 17.957689558341478, "kl": 0.1806640625, "learning_rate": 8.137375153320484e-07, "loss": -0.0343, "reward": 1.0880229473114014, "reward_std": 0.25343039631843567, "rewards/accuracy_reward_stage2": 0.5098979473114014, "rewards/format_reward_stage1_pointerpad": 0.578125, "scores/accuracy_reward_stage2": 0.578125, "step": 1064 }, { "completion_length": 7.015625, "epoch": 0.18661293148764674, "grad_norm": 19.408175653268714, "kl": 0.05712890625, "learning_rate": 8.135622919222007e-07, "loss": 0.0229, "reward": 1.560366153717041, "reward_std": 0.19479292631149292, "rewards/accuracy_reward_stage2": 0.560366153717041, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1065 }, { "completion_length": 7.578125, "epoch": 0.1867881548974943, "grad_norm": 31.666787791274242, "kl": 0.0908203125, "learning_rate": 8.133870685123532e-07, "loss": -0.029, "reward": 1.700068473815918, "reward_std": 0.2957872152328491, "rewards/accuracy_reward_stage2": 0.7313185334205627, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1066 }, { "completion_length": 9.203125, "epoch": 0.18696337830734186, "grad_norm": 22.06110956212385, "kl": 0.0712890625, "learning_rate": 8.132118451025056e-07, "loss": 0.0286, "reward": 1.3757433891296387, "reward_std": 0.24035391211509705, "rewards/accuracy_reward_stage2": 0.5007432699203491, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1067 }, { "completion_length": 7.265625, "epoch": 0.1871386017171894, "grad_norm": 21.13342642811075, "kl": 0.12353515625, "learning_rate": 8.130366216926581e-07, "loss": 0.0495, "reward": 1.4848458766937256, "reward_std": 0.18105000257492065, "rewards/accuracy_reward_stage2": 0.48484593629837036, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1068 }, { "completion_length": 5.4375, "epoch": 0.18731382512703698, "grad_norm": 10.078001794329074, "kl": 0.03662109375, "learning_rate": 8.128613982828106e-07, "loss": 0.0146, "reward": 1.4926791191101074, "reward_std": 0.020706364884972572, "rewards/accuracy_reward_stage2": 0.4926791787147522, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1069 }, { "completion_length": 11.90625, "epoch": 0.18748904853688453, "grad_norm": 31.03697558939259, "kl": 0.09375, "learning_rate": 8.12686174872963e-07, "loss": -0.0068, "reward": 1.4946585893630981, "reward_std": 0.18645590543746948, "rewards/accuracy_reward_stage2": 0.6352835893630981, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1070 }, { "completion_length": 8.234375, "epoch": 0.18766427194673208, "grad_norm": 27.770742176690753, "kl": 0.146484375, "learning_rate": 8.125109514631154e-07, "loss": -0.03, "reward": 1.5777851343154907, "reward_std": 0.27430057525634766, "rewards/accuracy_reward_stage2": 0.6090351343154907, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1071 }, { "completion_length": 13.1875, "epoch": 0.18783949535657965, "grad_norm": 635.6301357426747, "kl": 3.203125, "learning_rate": 8.123357280532679e-07, "loss": 1.2792, "reward": 1.5925309658050537, "reward_std": 0.18635889887809753, "rewards/accuracy_reward_stage2": 0.7175308465957642, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1072 }, { "completion_length": 11.5, "epoch": 0.1880147187664272, "grad_norm": 23.2727223905994, "kl": 0.1533203125, "learning_rate": 8.121605046434203e-07, "loss": 0.0613, "reward": 1.76097571849823, "reward_std": 0.15913929045200348, "rewards/accuracy_reward_stage2": 0.76097571849823, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1073 }, { "completion_length": 5.921875, "epoch": 0.18818994217627474, "grad_norm": 20.897642898148753, "kl": 0.04052734375, "learning_rate": 8.119852812335728e-07, "loss": -0.0279, "reward": 1.6360900402069092, "reward_std": 0.19334176182746887, "rewards/accuracy_reward_stage2": 0.6517150402069092, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1074 }, { "completion_length": 9.625, "epoch": 0.18836516558612232, "grad_norm": 13.357495831373486, "kl": 0.033203125, "learning_rate": 8.118100578237252e-07, "loss": 0.0133, "reward": 1.6267361640930176, "reward_std": 0.17079266905784607, "rewards/accuracy_reward_stage2": 0.6267361044883728, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1075 }, { "completion_length": 10.109375, "epoch": 0.18854038899596987, "grad_norm": 31.228774447933233, "kl": 0.162109375, "learning_rate": 8.116348344138777e-07, "loss": 0.0648, "reward": 1.598435640335083, "reward_std": 0.2621033191680908, "rewards/accuracy_reward_stage2": 0.5984355807304382, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1076 }, { "completion_length": 11.078125, "epoch": 0.1887156124058174, "grad_norm": 18.03238472395094, "kl": 0.0849609375, "learning_rate": 8.114596110040302e-07, "loss": 0.0051, "reward": 1.6736887693405151, "reward_std": 0.14449408650398254, "rewards/accuracy_reward_stage2": 0.6893137693405151, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1077 }, { "completion_length": 13.921875, "epoch": 0.18889083581566496, "grad_norm": 31.39284474795272, "kl": 0.068359375, "learning_rate": 8.112843875941825e-07, "loss": -0.0723, "reward": 1.5917601585388184, "reward_std": 0.2058051973581314, "rewards/accuracy_reward_stage2": 0.6386352181434631, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1078 }, { "completion_length": 10.9375, "epoch": 0.18906605922551253, "grad_norm": 22.829727849729412, "kl": 0.0693359375, "learning_rate": 8.11109164184335e-07, "loss": -0.0002, "reward": 1.7911438941955566, "reward_std": 0.19995662569999695, "rewards/accuracy_reward_stage2": 0.8067688941955566, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1079 }, { "completion_length": 6.75, "epoch": 0.18924128263536008, "grad_norm": 35.82822640876689, "kl": 0.1279296875, "learning_rate": 8.109339407744873e-07, "loss": 0.0513, "reward": 1.7231206893920898, "reward_std": 0.14698222279548645, "rewards/accuracy_reward_stage2": 0.8481206297874451, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1080 }, { "completion_length": 9.359375, "epoch": 0.18941650604520763, "grad_norm": 20.760161439985456, "kl": 0.08984375, "learning_rate": 8.107587173646398e-07, "loss": -0.0082, "reward": 1.7030006647109985, "reward_std": 0.2652503252029419, "rewards/accuracy_reward_stage2": 0.7186257243156433, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1081 }, { "completion_length": 6.28125, "epoch": 0.1895917294550552, "grad_norm": 20.00055457981059, "kl": 0.013916015625, "learning_rate": 8.105834939547923e-07, "loss": -0.0362, "reward": 1.7184606790542603, "reward_std": 0.1488310694694519, "rewards/accuracy_reward_stage2": 0.7340856790542603, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1082 }, { "completion_length": 11.796875, "epoch": 0.18976695286490275, "grad_norm": 15.90440340391825, "kl": 0.0966796875, "learning_rate": 8.104082705449447e-07, "loss": 0.0218, "reward": 1.650546669960022, "reward_std": 0.1746128350496292, "rewards/accuracy_reward_stage2": 0.666171669960022, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1083 }, { "completion_length": 13.96875, "epoch": 0.1899421762747503, "grad_norm": 15.21184938869327, "kl": 0.054931640625, "learning_rate": 8.102330471350972e-07, "loss": 0.022, "reward": 1.2956702709197998, "reward_std": 0.12772494554519653, "rewards/accuracy_reward_stage2": 0.29567036032676697, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1084 }, { "completion_length": 5.984375, "epoch": 0.19011739968459787, "grad_norm": 13.046954389444721, "kl": 0.037841796875, "learning_rate": 8.100578237252497e-07, "loss": 0.0152, "reward": 1.6959822177886963, "reward_std": 0.08044600486755371, "rewards/accuracy_reward_stage2": 0.6959822177886963, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1085 }, { "completion_length": 8.765625, "epoch": 0.19029262309444542, "grad_norm": 21.808137766383577, "kl": 0.10791015625, "learning_rate": 8.098826003154021e-07, "loss": -0.0244, "reward": 1.6888558864593506, "reward_std": 0.17180359363555908, "rewards/accuracy_reward_stage2": 0.8451060056686401, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1086 }, { "completion_length": 11.671875, "epoch": 0.19046784650429296, "grad_norm": 14.883602876711144, "kl": 0.0252685546875, "learning_rate": 8.097073769055546e-07, "loss": 0.0101, "reward": 1.473452091217041, "reward_std": 0.12437894195318222, "rewards/accuracy_reward_stage2": 0.473452091217041, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1087 }, { "completion_length": 14.59375, "epoch": 0.19064306991414054, "grad_norm": 12.088597325510479, "kl": 0.1767578125, "learning_rate": 8.095321534957071e-07, "loss": 0.0708, "reward": 1.215935468673706, "reward_std": 0.06121998280286789, "rewards/accuracy_reward_stage2": 0.34093552827835083, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1088 }, { "completion_length": 8.984375, "epoch": 0.1908182933239881, "grad_norm": 15.176806742901167, "kl": 0.0947265625, "learning_rate": 8.093569300858595e-07, "loss": -0.0115, "reward": 1.570344090461731, "reward_std": 0.22428151965141296, "rewards/accuracy_reward_stage2": 0.601594090461731, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1089 }, { "completion_length": 13.96875, "epoch": 0.19099351673383563, "grad_norm": 24.956813132983317, "kl": 0.10791015625, "learning_rate": 8.09181706676012e-07, "loss": 0.0873, "reward": 1.2724158763885498, "reward_std": 0.2870734632015228, "rewards/accuracy_reward_stage2": 0.5224158763885498, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1090 }, { "completion_length": 8.8125, "epoch": 0.1911687401436832, "grad_norm": 19.027927979774955, "kl": 0.09228515625, "learning_rate": 8.090064832661642e-07, "loss": 0.0112, "reward": 1.6313860416412354, "reward_std": 0.20960725843906403, "rewards/accuracy_reward_stage2": 0.6626360416412354, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1091 }, { "completion_length": 9.6875, "epoch": 0.19134396355353075, "grad_norm": 22.860291887433835, "kl": 0.0908203125, "learning_rate": 8.088312598563167e-07, "loss": -0.0496, "reward": 1.5270261764526367, "reward_std": 0.3534308969974518, "rewards/accuracy_reward_stage2": 0.5582762360572815, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1092 }, { "completion_length": 11.421875, "epoch": 0.1915191869633783, "grad_norm": 22.108352820227587, "kl": 0.1376953125, "learning_rate": 8.086560364464692e-07, "loss": 0.011, "reward": 1.4140586853027344, "reward_std": 0.20068290829658508, "rewards/accuracy_reward_stage2": 0.4296835660934448, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1093 }, { "completion_length": 5.984375, "epoch": 0.19169441037322588, "grad_norm": 16.302770410900887, "kl": 0.10009765625, "learning_rate": 8.084808130366216e-07, "loss": -0.004, "reward": 1.6134690046310425, "reward_std": 0.1999814510345459, "rewards/accuracy_reward_stage2": 0.6290940046310425, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1094 }, { "completion_length": 19.375, "epoch": 0.19186963378307342, "grad_norm": 19.348644015651782, "kl": 0.07763671875, "learning_rate": 8.083055896267741e-07, "loss": -0.0012, "reward": 1.392347812652588, "reward_std": 0.1986556202173233, "rewards/accuracy_reward_stage2": 0.5329726934432983, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1095 }, { "completion_length": 11.34375, "epoch": 0.19204485719292097, "grad_norm": 20.1216235120964, "kl": 0.08642578125, "learning_rate": 8.081303662169265e-07, "loss": 0.0461, "reward": 1.4377480745315552, "reward_std": 0.22237080335617065, "rewards/accuracy_reward_stage2": 0.5627480745315552, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1096 }, { "completion_length": 9.875, "epoch": 0.19222008060276852, "grad_norm": 17.62212799015951, "kl": 0.0654296875, "learning_rate": 8.07955142807079e-07, "loss": 0.0262, "reward": 1.527910828590393, "reward_std": 0.1326691210269928, "rewards/accuracy_reward_stage2": 0.5279108881950378, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1097 }, { "completion_length": 8.984375, "epoch": 0.1923953040126161, "grad_norm": 20.687971074598163, "kl": 0.11328125, "learning_rate": 8.077799193972315e-07, "loss": 0.0454, "reward": 1.5507917404174805, "reward_std": 0.336439311504364, "rewards/accuracy_reward_stage2": 0.5507918000221252, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1098 }, { "completion_length": 10.421875, "epoch": 0.19257052742246364, "grad_norm": 10.686266812372155, "kl": 0.01470947265625, "learning_rate": 8.076046959873839e-07, "loss": 0.0059, "reward": 1.5433006286621094, "reward_std": 0.0748034194111824, "rewards/accuracy_reward_stage2": 0.6683006286621094, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1099 }, { "completion_length": 8.78125, "epoch": 0.19274575083231119, "grad_norm": 20.305827220265968, "kl": 0.10986328125, "learning_rate": 8.074294725775364e-07, "loss": 0.0141, "reward": 1.7699222564697266, "reward_std": 0.1974988877773285, "rewards/accuracy_reward_stage2": 0.785547137260437, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1100 }, { "completion_length": 6.78125, "epoch": 0.19292097424215876, "grad_norm": 46.005300340627144, "kl": 0.3046875, "learning_rate": 8.072542491676888e-07, "loss": 0.0777, "reward": 1.5677083730697632, "reward_std": 0.3144148588180542, "rewards/accuracy_reward_stage2": 0.5833333134651184, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1101 }, { "completion_length": 12.8125, "epoch": 0.1930961976520063, "grad_norm": 23.389305778485923, "kl": 0.076171875, "learning_rate": 8.070790257578412e-07, "loss": 0.0304, "reward": 1.4332172870635986, "reward_std": 0.20425169169902802, "rewards/accuracy_reward_stage2": 0.5582171678543091, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1102 }, { "completion_length": 8.546875, "epoch": 0.19327142106185385, "grad_norm": 25.892819741346866, "kl": 0.19921875, "learning_rate": 8.069038023479936e-07, "loss": -0.063, "reward": 1.413844347000122, "reward_std": 0.30733755230903625, "rewards/accuracy_reward_stage2": 0.4919692873954773, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 1103 }, { "completion_length": 8.203125, "epoch": 0.19344664447170143, "grad_norm": 26.958837496349805, "kl": 0.0712890625, "learning_rate": 8.06728578938146e-07, "loss": 0.0285, "reward": 1.6971174478530884, "reward_std": 0.23234084248542786, "rewards/accuracy_reward_stage2": 0.6971173882484436, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1104 }, { "completion_length": 12.140625, "epoch": 0.19362186788154898, "grad_norm": 23.112960942928332, "kl": 0.020263671875, "learning_rate": 8.065533555282985e-07, "loss": 0.0144, "reward": 1.2263470888137817, "reward_std": 0.10987623780965805, "rewards/accuracy_reward_stage2": 0.4763471186161041, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1105 }, { "completion_length": 11.515625, "epoch": 0.19379709129139652, "grad_norm": 19.83519566726814, "kl": 0.0673828125, "learning_rate": 8.06378132118451e-07, "loss": 0.027, "reward": 1.6052569150924683, "reward_std": 0.1178978905081749, "rewards/accuracy_reward_stage2": 0.6052569150924683, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1106 }, { "completion_length": 13.5625, "epoch": 0.1939723147012441, "grad_norm": 1856.6081789063355, "kl": 2.390625, "learning_rate": 8.062029087086034e-07, "loss": 0.9175, "reward": 1.3263888359069824, "reward_std": 0.12646648287773132, "rewards/accuracy_reward_stage2": 0.3420138955116272, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1107 }, { "completion_length": 11.640625, "epoch": 0.19414753811109164, "grad_norm": 22.62046556055641, "kl": 0.08447265625, "learning_rate": 8.060276852987559e-07, "loss": -0.0104, "reward": 1.5089672803878784, "reward_std": 0.24158672988414764, "rewards/accuracy_reward_stage2": 0.6495921611785889, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1108 }, { "completion_length": 19.890625, "epoch": 0.1943227615209392, "grad_norm": 21.380509612699353, "kl": 0.041259765625, "learning_rate": 8.058524618889084e-07, "loss": 0.0165, "reward": 1.525514841079712, "reward_std": 0.1627964973449707, "rewards/accuracy_reward_stage2": 0.5255147814750671, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1109 }, { "completion_length": 11.3125, "epoch": 0.19449798493078677, "grad_norm": 18.69847620677502, "kl": 0.08349609375, "learning_rate": 8.056772384790608e-07, "loss": 0.0334, "reward": 1.5814133882522583, "reward_std": 0.1631113588809967, "rewards/accuracy_reward_stage2": 0.5814133882522583, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1110 }, { "completion_length": 9.25, "epoch": 0.1946732083406343, "grad_norm": 18.246474115447, "kl": 0.04345703125, "learning_rate": 8.055020150692132e-07, "loss": -0.0166, "reward": 1.7617158889770508, "reward_std": 0.17398208379745483, "rewards/accuracy_reward_stage2": 0.7773408889770508, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1111 }, { "completion_length": 22.890625, "epoch": 0.19484843175048186, "grad_norm": 17.920499342663035, "kl": 0.11083984375, "learning_rate": 8.053267916593656e-07, "loss": -0.0287, "reward": 1.2582231760025024, "reward_std": 0.18175308406352997, "rewards/accuracy_reward_stage2": 0.41447317600250244, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1112 }, { "completion_length": 9.328125, "epoch": 0.1950236551603294, "grad_norm": 18.210669269174822, "kl": 0.03759765625, "learning_rate": 8.051515682495181e-07, "loss": -0.0606, "reward": 1.638547658920288, "reward_std": 0.2656075954437256, "rewards/accuracy_reward_stage2": 0.6697976589202881, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1113 }, { "completion_length": 9.984375, "epoch": 0.19519887857017698, "grad_norm": 22.267678059768336, "kl": 0.0703125, "learning_rate": 8.049763448396706e-07, "loss": 0.0281, "reward": 1.6199893951416016, "reward_std": 0.29019030928611755, "rewards/accuracy_reward_stage2": 0.6199893951416016, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1114 }, { "completion_length": 11.65625, "epoch": 0.19537410198002453, "grad_norm": 22.77008648581453, "kl": 0.0419921875, "learning_rate": 8.04801121429823e-07, "loss": -0.0713, "reward": 1.839109182357788, "reward_std": 0.1781412959098816, "rewards/accuracy_reward_stage2": 0.8703591823577881, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1115 }, { "completion_length": 11.0, "epoch": 0.19554932538987208, "grad_norm": 19.049510532142538, "kl": 0.08251953125, "learning_rate": 8.046258980199754e-07, "loss": -0.0112, "reward": 1.5514042377471924, "reward_std": 0.17855620384216309, "rewards/accuracy_reward_stage2": 0.6920292377471924, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1116 }, { "completion_length": 10.140625, "epoch": 0.19572454879971965, "grad_norm": 25.80724586964345, "kl": 0.103515625, "learning_rate": 8.044506746101279e-07, "loss": 0.0414, "reward": 1.433285117149353, "reward_std": 0.16345283389091492, "rewards/accuracy_reward_stage2": 0.558285117149353, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1117 }, { "completion_length": 9.6875, "epoch": 0.1958997722095672, "grad_norm": 19.673693253177163, "kl": 0.056640625, "learning_rate": 8.042754512002803e-07, "loss": 0.0227, "reward": 1.4013640880584717, "reward_std": 0.14320652186870575, "rewards/accuracy_reward_stage2": 0.7763641476631165, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 1118 }, { "completion_length": 17.109375, "epoch": 0.19607499561941474, "grad_norm": 95.22382663635376, "kl": 0.416015625, "learning_rate": 8.041002277904328e-07, "loss": 0.1661, "reward": 1.4747977256774902, "reward_std": 0.2025090903043747, "rewards/accuracy_reward_stage2": 0.5997976660728455, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1119 }, { "completion_length": 10.046875, "epoch": 0.19625021902926232, "grad_norm": 23.39398079319575, "kl": 0.0546875, "learning_rate": 8.039250043805851e-07, "loss": 0.0219, "reward": 1.6447781324386597, "reward_std": 0.22827255725860596, "rewards/accuracy_reward_stage2": 0.6447781324386597, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1120 }, { "completion_length": 15.765625, "epoch": 0.19642544243910987, "grad_norm": 18.078899714493488, "kl": 0.033447265625, "learning_rate": 8.037497809707376e-07, "loss": -0.0308, "reward": 1.4864583015441895, "reward_std": 0.2050531953573227, "rewards/accuracy_reward_stage2": 0.5020833015441895, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1121 }, { "completion_length": 8.6875, "epoch": 0.1966006658489574, "grad_norm": 21.8183949180115, "kl": 0.03271484375, "learning_rate": 8.035745575608901e-07, "loss": 0.013, "reward": 1.651584506034851, "reward_std": 0.2105352282524109, "rewards/accuracy_reward_stage2": 0.6515845060348511, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1122 }, { "completion_length": 10.15625, "epoch": 0.196775889258805, "grad_norm": 15.930285731895992, "kl": 0.0634765625, "learning_rate": 8.033993341510425e-07, "loss": 0.0252, "reward": 1.454774260520935, "reward_std": 0.1642688512802124, "rewards/accuracy_reward_stage2": 0.47039929032325745, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1123 }, { "completion_length": 5.15625, "epoch": 0.19695111266865253, "grad_norm": 21.751143246670633, "kl": 0.1025390625, "learning_rate": 8.03224110741195e-07, "loss": 0.0126, "reward": 1.7925353050231934, "reward_std": 0.1622324138879776, "rewards/accuracy_reward_stage2": 0.8081602454185486, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1124 }, { "completion_length": 10.375, "epoch": 0.19712633607850008, "grad_norm": 11.346635286919138, "kl": 0.05859375, "learning_rate": 8.030488873313475e-07, "loss": -0.0011, "reward": 1.5891244411468506, "reward_std": 0.08004673570394516, "rewards/accuracy_reward_stage2": 0.6047494411468506, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1125 }, { "completion_length": 7.421875, "epoch": 0.19730155948834766, "grad_norm": 26.259271451383956, "kl": 0.173828125, "learning_rate": 8.028736639214999e-07, "loss": 0.0304, "reward": 1.3985657691955566, "reward_std": 0.21947166323661804, "rewards/accuracy_reward_stage2": 0.5391908288002014, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1126 }, { "completion_length": 10.203125, "epoch": 0.1974767828981952, "grad_norm": 30.238017587159945, "kl": 0.3515625, "learning_rate": 8.026984405116524e-07, "loss": 0.1409, "reward": 1.228277564048767, "reward_std": 0.133104607462883, "rewards/accuracy_reward_stage2": 0.6032775640487671, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 1127 }, { "completion_length": 7.203125, "epoch": 0.19765200630804275, "grad_norm": 17.460221312721277, "kl": 0.04833984375, "learning_rate": 8.025232171018048e-07, "loss": 0.0192, "reward": 1.3928313255310059, "reward_std": 0.15808694064617157, "rewards/accuracy_reward_stage2": 0.39283138513565063, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1128 }, { "completion_length": 9.53125, "epoch": 0.19782722971789032, "grad_norm": 18.46095666098341, "kl": 0.051025390625, "learning_rate": 8.023479936919572e-07, "loss": 0.0075, "reward": 1.6616019010543823, "reward_std": 0.13137364387512207, "rewards/accuracy_reward_stage2": 0.6772269010543823, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1129 }, { "completion_length": 11.09375, "epoch": 0.19800245312773787, "grad_norm": 20.90678156081599, "kl": 0.0947265625, "learning_rate": 8.021727702821096e-07, "loss": -0.0064, "reward": 1.7562756538391113, "reward_std": 0.35668328404426575, "rewards/accuracy_reward_stage2": 0.7719005346298218, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1130 }, { "completion_length": 8.90625, "epoch": 0.19817767653758542, "grad_norm": 13.7045249582709, "kl": 0.04150390625, "learning_rate": 8.01997546872262e-07, "loss": 0.0166, "reward": 1.6348446607589722, "reward_std": 0.08211646229028702, "rewards/accuracy_reward_stage2": 0.6348447203636169, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1131 }, { "completion_length": 11.25, "epoch": 0.19835289994743296, "grad_norm": 16.81366964105771, "kl": 0.1279296875, "learning_rate": 8.018223234624145e-07, "loss": 0.0071, "reward": 1.7176910638809204, "reward_std": 0.28899407386779785, "rewards/accuracy_reward_stage2": 0.7333160042762756, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1132 }, { "completion_length": 9.4375, "epoch": 0.19852812335728054, "grad_norm": 19.902013500283253, "kl": 0.09912109375, "learning_rate": 8.01647100052567e-07, "loss": 0.0397, "reward": 1.7990940809249878, "reward_std": 0.21646492183208466, "rewards/accuracy_reward_stage2": 0.799094021320343, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1133 }, { "completion_length": 9.234375, "epoch": 0.1987033467671281, "grad_norm": 13.540574761629962, "kl": 0.040283203125, "learning_rate": 8.014718766427194e-07, "loss": 0.0161, "reward": 1.744128704071045, "reward_std": 0.05768556892871857, "rewards/accuracy_reward_stage2": 0.7441287040710449, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1134 }, { "completion_length": 13.34375, "epoch": 0.19887857017697563, "grad_norm": 25.82564634090041, "kl": 0.1923828125, "learning_rate": 8.012966532328719e-07, "loss": 0.0769, "reward": 1.1752233505249023, "reward_std": 0.09604233503341675, "rewards/accuracy_reward_stage2": 0.42522335052490234, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1135 }, { "completion_length": 10.3125, "epoch": 0.1990537935868232, "grad_norm": 19.314766378810056, "kl": 0.1220703125, "learning_rate": 8.011214298230243e-07, "loss": 0.0046, "reward": 1.4528274536132812, "reward_std": 0.18750616908073425, "rewards/accuracy_reward_stage2": 0.4684523642063141, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1136 }, { "completion_length": 14.109375, "epoch": 0.19922901699667075, "grad_norm": 16.821980930138615, "kl": 0.0615234375, "learning_rate": 8.009462064131768e-07, "loss": 0.0247, "reward": 1.4835102558135986, "reward_std": 0.166833758354187, "rewards/accuracy_reward_stage2": 0.48351022601127625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1137 }, { "completion_length": 15.625, "epoch": 0.1994042404065183, "grad_norm": 31.868072743598333, "kl": 0.109375, "learning_rate": 8.007709830033293e-07, "loss": -0.0005, "reward": 1.5000779628753662, "reward_std": 0.3490258753299713, "rewards/accuracy_reward_stage2": 0.5157029628753662, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1138 }, { "completion_length": 11.453125, "epoch": 0.19957946381636588, "grad_norm": 25.6116140210764, "kl": 0.07080078125, "learning_rate": 8.005957595934817e-07, "loss": 0.0284, "reward": 1.364166498184204, "reward_std": 0.20888856053352356, "rewards/accuracy_reward_stage2": 0.4891664981842041, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1139 }, { "completion_length": 7.890625, "epoch": 0.19975468722621342, "grad_norm": 24.85245877767008, "kl": 0.072265625, "learning_rate": 8.004205361836342e-07, "loss": 0.0288, "reward": 1.5358493328094482, "reward_std": 0.27124035358428955, "rewards/accuracy_reward_stage2": 0.5358492732048035, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1140 }, { "completion_length": 12.8125, "epoch": 0.19992991063606097, "grad_norm": 14.61945538091605, "kl": 0.0308837890625, "learning_rate": 8.002453127737866e-07, "loss": -0.0043, "reward": 1.4979475736618042, "reward_std": 0.11111369729042053, "rewards/accuracy_reward_stage2": 0.5135725736618042, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1141 }, { "completion_length": 12.53125, "epoch": 0.20010513404590854, "grad_norm": 16.880834934148936, "kl": 0.038818359375, "learning_rate": 8.000700893639389e-07, "loss": 0.0156, "reward": 1.6113297939300537, "reward_std": 0.13249364495277405, "rewards/accuracy_reward_stage2": 0.7363297343254089, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1142 }, { "completion_length": 9.140625, "epoch": 0.2002803574557561, "grad_norm": 15.358944238027423, "kl": 0.06396484375, "learning_rate": 7.998948659540914e-07, "loss": -0.0186, "reward": 1.5073845386505127, "reward_std": 0.10182757675647736, "rewards/accuracy_reward_stage2": 0.5230096578598022, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1143 }, { "completion_length": 10.6875, "epoch": 0.20045558086560364, "grad_norm": 19.06697388533954, "kl": 0.1533203125, "learning_rate": 7.997196425442438e-07, "loss": 0.0613, "reward": 1.7727296352386475, "reward_std": 0.1398918628692627, "rewards/accuracy_reward_stage2": 0.7727296948432922, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1144 }, { "completion_length": 8.390625, "epoch": 0.2006308042754512, "grad_norm": 17.057848521186898, "kl": 0.1728515625, "learning_rate": 7.995444191343963e-07, "loss": 0.0691, "reward": 1.6468448638916016, "reward_std": 0.1826433539390564, "rewards/accuracy_reward_stage2": 0.6468449234962463, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1145 }, { "completion_length": 9.703125, "epoch": 0.20080602768529876, "grad_norm": 14.97255121728084, "kl": 0.0556640625, "learning_rate": 7.993691957245488e-07, "loss": -0.0111, "reward": 1.500192642211914, "reward_std": 0.16978204250335693, "rewards/accuracy_reward_stage2": 0.5158176422119141, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1146 }, { "completion_length": 9.21875, "epoch": 0.2009812510951463, "grad_norm": 12.285037631467146, "kl": 0.08203125, "learning_rate": 7.991939723147012e-07, "loss": 0.0329, "reward": 1.5083717107772827, "reward_std": 0.09912580996751785, "rewards/accuracy_reward_stage2": 0.6333716511726379, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1147 }, { "completion_length": 9.84375, "epoch": 0.20115647450499385, "grad_norm": 10.819531810185309, "kl": 0.01275634765625, "learning_rate": 7.990187489048537e-07, "loss": 0.0051, "reward": 1.8347173929214478, "reward_std": 0.06732519716024399, "rewards/accuracy_reward_stage2": 0.8347173929214478, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1148 }, { "completion_length": 10.46875, "epoch": 0.20133169791484143, "grad_norm": 17.510513441111854, "kl": 0.08203125, "learning_rate": 7.988435254950062e-07, "loss": -0.1252, "reward": 1.4475042819976807, "reward_std": 0.3052405118942261, "rewards/accuracy_reward_stage2": 0.5100042819976807, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 1149 }, { "completion_length": 9.71875, "epoch": 0.20150692132468898, "grad_norm": 24.563211909751576, "kl": 0.11328125, "learning_rate": 7.986683020851585e-07, "loss": 0.0517, "reward": 1.265645980834961, "reward_std": 0.2737762928009033, "rewards/accuracy_reward_stage2": 0.39064595103263855, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1150 }, { "completion_length": 8.609375, "epoch": 0.20168214473453652, "grad_norm": 20.56948420996003, "kl": 0.07568359375, "learning_rate": 7.98493078675311e-07, "loss": 0.0304, "reward": 1.5552361011505127, "reward_std": 0.23706388473510742, "rewards/accuracy_reward_stage2": 0.5552360415458679, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1151 }, { "completion_length": 28.578125, "epoch": 0.2018573681443841, "grad_norm": 21.06436355277442, "kl": 0.2578125, "learning_rate": 7.983178552654634e-07, "loss": 0.1027, "reward": 1.3616572618484497, "reward_std": 0.19103842973709106, "rewards/accuracy_reward_stage2": 0.48665720224380493, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1152 }, { "completion_length": 12.359375, "epoch": 0.20203259155423164, "grad_norm": 23.185828052219872, "kl": 0.0361328125, "learning_rate": 7.981426318556159e-07, "loss": 0.0144, "reward": 1.584661602973938, "reward_std": 0.22834071516990662, "rewards/accuracy_reward_stage2": 0.5846616625785828, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1153 }, { "completion_length": 18.421875, "epoch": 0.2022078149640792, "grad_norm": 18.080209220125052, "kl": 0.06982421875, "learning_rate": 7.979674084457683e-07, "loss": 0.028, "reward": 1.4647139310836792, "reward_std": 0.16236665844917297, "rewards/accuracy_reward_stage2": 0.4647139012813568, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1154 }, { "completion_length": 12.59375, "epoch": 0.20238303837392677, "grad_norm": 41.51210090003271, "kl": 0.06982421875, "learning_rate": 7.977921850359207e-07, "loss": -0.0054, "reward": 1.5676136016845703, "reward_std": 0.2357185333967209, "rewards/accuracy_reward_stage2": 0.5832385420799255, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1155 }, { "completion_length": 10.515625, "epoch": 0.2025582617837743, "grad_norm": 12.776722404361198, "kl": 0.1318359375, "learning_rate": 7.976169616260732e-07, "loss": 0.0095, "reward": 1.0949840545654297, "reward_std": 0.2613898515701294, "rewards/accuracy_reward_stage2": 0.2356090545654297, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1156 }, { "completion_length": 11.875, "epoch": 0.20273348519362186, "grad_norm": 12.378745536969769, "kl": 0.06689453125, "learning_rate": 7.974417382162256e-07, "loss": 0.0269, "reward": 1.6979291439056396, "reward_std": 0.04445386305451393, "rewards/accuracy_reward_stage2": 0.6979291439056396, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1157 }, { "completion_length": 11.34375, "epoch": 0.20290870860346943, "grad_norm": 15.21828623768596, "kl": 0.04052734375, "learning_rate": 7.972665148063781e-07, "loss": 0.0162, "reward": 1.7189762592315674, "reward_std": 0.17833425104618073, "rewards/accuracy_reward_stage2": 0.7189762592315674, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1158 }, { "completion_length": 11.125, "epoch": 0.20308393201331698, "grad_norm": 23.90652706021063, "kl": 0.06787109375, "learning_rate": 7.970912913965306e-07, "loss": -0.0171, "reward": 1.751429796218872, "reward_std": 0.2318522334098816, "rewards/accuracy_reward_stage2": 0.7670547366142273, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1159 }, { "completion_length": 11.53125, "epoch": 0.20325915542316453, "grad_norm": 19.569111992947953, "kl": 0.0966796875, "learning_rate": 7.969160679866829e-07, "loss": 0.0385, "reward": 1.4768517017364502, "reward_std": 0.2140122354030609, "rewards/accuracy_reward_stage2": 0.6018517017364502, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1160 }, { "completion_length": 11.453125, "epoch": 0.2034343788330121, "grad_norm": 17.326516841882064, "kl": 0.0291748046875, "learning_rate": 7.967408445768354e-07, "loss": 0.0117, "reward": 1.625, "reward_std": 0.22236785292625427, "rewards/accuracy_reward_stage2": 0.625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1161 }, { "completion_length": 12.296875, "epoch": 0.20360960224285965, "grad_norm": 20.93028425320662, "kl": 0.134765625, "learning_rate": 7.965656211669879e-07, "loss": -0.0578, "reward": 1.5847158432006836, "reward_std": 0.3005771040916443, "rewards/accuracy_reward_stage2": 0.6315909624099731, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1162 }, { "completion_length": 10.75, "epoch": 0.2037848256527072, "grad_norm": 17.116689083154068, "kl": 0.36328125, "learning_rate": 7.963903977571403e-07, "loss": 0.1448, "reward": 1.4689933061599731, "reward_std": 0.18849441409111023, "rewards/accuracy_reward_stage2": 0.8439933061599731, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 1163 }, { "completion_length": 15.296875, "epoch": 0.20396004906255474, "grad_norm": 20.782040905035508, "kl": 0.04638671875, "learning_rate": 7.962151743472928e-07, "loss": 0.0186, "reward": 1.5337340831756592, "reward_std": 0.23394903540611267, "rewards/accuracy_reward_stage2": 0.533734142780304, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1164 }, { "completion_length": 10.953125, "epoch": 0.20413527247240232, "grad_norm": 20.41830434742461, "kl": 0.08984375, "learning_rate": 7.960399509374453e-07, "loss": 0.0359, "reward": 1.777416706085205, "reward_std": 0.35240471363067627, "rewards/accuracy_reward_stage2": 0.7774167060852051, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1165 }, { "completion_length": 8.765625, "epoch": 0.20431049588224987, "grad_norm": 23.215274778028444, "kl": 0.2060546875, "learning_rate": 7.958647275275977e-07, "loss": 0.0823, "reward": 1.5440936088562012, "reward_std": 0.24966692924499512, "rewards/accuracy_reward_stage2": 0.5440936088562012, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1166 }, { "completion_length": 8.59375, "epoch": 0.2044857192920974, "grad_norm": 16.453486111218634, "kl": 0.0400390625, "learning_rate": 7.956895041177501e-07, "loss": 0.016, "reward": 1.7762818336486816, "reward_std": 0.11226281523704529, "rewards/accuracy_reward_stage2": 0.7762819528579712, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1167 }, { "completion_length": 11.875, "epoch": 0.204660942701945, "grad_norm": 20.38053897005864, "kl": 0.10107421875, "learning_rate": 7.955142807079025e-07, "loss": 0.0096, "reward": 1.6993811130523682, "reward_std": 0.14725381135940552, "rewards/accuracy_reward_stage2": 0.7150062322616577, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1168 }, { "completion_length": 9.46875, "epoch": 0.20483616611179253, "grad_norm": 20.873197553058322, "kl": 0.0277099609375, "learning_rate": 7.95339057298055e-07, "loss": 0.0111, "reward": 1.6127396821975708, "reward_std": 0.1205105409026146, "rewards/accuracy_reward_stage2": 0.6127396821975708, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1169 }, { "completion_length": 24.578125, "epoch": 0.20501138952164008, "grad_norm": 22.375963993171645, "kl": 0.25, "learning_rate": 7.951638338882074e-07, "loss": 0.1002, "reward": 1.1214206218719482, "reward_std": 0.24306708574295044, "rewards/accuracy_reward_stage2": 0.24642051756381989, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1170 }, { "completion_length": 8.796875, "epoch": 0.20518661293148766, "grad_norm": 27.950636957218467, "kl": 0.027099609375, "learning_rate": 7.949886104783598e-07, "loss": 0.0108, "reward": 1.6019957065582275, "reward_std": 0.245945006608963, "rewards/accuracy_reward_stage2": 0.6019957661628723, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1171 }, { "completion_length": 6.828125, "epoch": 0.2053618363413352, "grad_norm": 15.034136443009805, "kl": 0.0751953125, "learning_rate": 7.948133870685123e-07, "loss": 0.0301, "reward": 1.8496700525283813, "reward_std": 0.05726194754242897, "rewards/accuracy_reward_stage2": 0.8496700525283813, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1172 }, { "completion_length": 9.03125, "epoch": 0.20553705975118275, "grad_norm": 24.505177220334467, "kl": 0.109375, "learning_rate": 7.946381636586647e-07, "loss": 0.0438, "reward": 1.6173192262649536, "reward_std": 0.3106708824634552, "rewards/accuracy_reward_stage2": 0.6173191666603088, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1173 }, { "completion_length": 14.28125, "epoch": 0.20571228316103032, "grad_norm": 21.066084623597465, "kl": 0.0250244140625, "learning_rate": 7.944629402488172e-07, "loss": 0.01, "reward": 1.5202445983886719, "reward_std": 0.15491852164268494, "rewards/accuracy_reward_stage2": 0.5202445983886719, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1174 }, { "completion_length": 11.0625, "epoch": 0.20588750657087787, "grad_norm": 20.666768324535514, "kl": 0.032470703125, "learning_rate": 7.942877168389697e-07, "loss": 0.013, "reward": 1.59375, "reward_std": 0.29143062233924866, "rewards/accuracy_reward_stage2": 0.59375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1175 }, { "completion_length": 7.59375, "epoch": 0.20606272998072542, "grad_norm": 18.042454348711036, "kl": 0.107421875, "learning_rate": 7.941124934291221e-07, "loss": 0.0145, "reward": 1.7791666984558105, "reward_std": 0.20184138417243958, "rewards/accuracy_reward_stage2": 0.9197916984558105, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1176 }, { "completion_length": 11.78125, "epoch": 0.206237953390573, "grad_norm": 7.186882587218823, "kl": 0.035400390625, "learning_rate": 7.939372700192746e-07, "loss": 0.0142, "reward": 1.4192759990692139, "reward_std": 0.02953476831316948, "rewards/accuracy_reward_stage2": 0.41927602887153625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1177 }, { "completion_length": 7.8125, "epoch": 0.20641317680042054, "grad_norm": 23.64243820773421, "kl": 0.058837890625, "learning_rate": 7.937620466094271e-07, "loss": -0.051, "reward": 1.6805853843688965, "reward_std": 0.2879638075828552, "rewards/accuracy_reward_stage2": 0.7274603247642517, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1178 }, { "completion_length": 12.90625, "epoch": 0.20658840021026809, "grad_norm": 19.158613745103754, "kl": 0.08056640625, "learning_rate": 7.935868231995795e-07, "loss": -0.0373, "reward": 1.4834885597229004, "reward_std": 0.23706209659576416, "rewards/accuracy_reward_stage2": 0.5303636193275452, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1179 }, { "completion_length": 11.453125, "epoch": 0.20676362362011566, "grad_norm": 17.980312882318678, "kl": 0.11328125, "learning_rate": 7.934115997897318e-07, "loss": -0.031, "reward": 1.6598703861236572, "reward_std": 0.1916605830192566, "rewards/accuracy_reward_stage2": 0.691120445728302, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1180 }, { "completion_length": 12.734375, "epoch": 0.2069388470299632, "grad_norm": 23.35800848338547, "kl": 0.453125, "learning_rate": 7.932363763798842e-07, "loss": 0.1307, "reward": 1.2424618005752563, "reward_std": 0.17332936823368073, "rewards/accuracy_reward_stage2": 0.39871180057525635, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1181 }, { "completion_length": 10.03125, "epoch": 0.20711407043981075, "grad_norm": 34.072947702124445, "kl": 0.32421875, "learning_rate": 7.930611529700367e-07, "loss": 0.1296, "reward": 1.5852689743041992, "reward_std": 0.29314032196998596, "rewards/accuracy_reward_stage2": 0.7102688550949097, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1182 }, { "completion_length": 12.03125, "epoch": 0.2072892938496583, "grad_norm": 22.76788574523607, "kl": 0.07568359375, "learning_rate": 7.928859295601892e-07, "loss": -0.0027, "reward": 1.4920098781585693, "reward_std": 0.2621656060218811, "rewards/accuracy_reward_stage2": 0.5076348781585693, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1183 }, { "completion_length": 10.015625, "epoch": 0.20746451725950588, "grad_norm": 17.1175348012611, "kl": 0.09130859375, "learning_rate": 7.927107061503416e-07, "loss": -0.0052, "reward": 1.480435848236084, "reward_std": 0.1929093450307846, "rewards/accuracy_reward_stage2": 0.621060848236084, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1184 }, { "completion_length": 11.046875, "epoch": 0.20763974066935342, "grad_norm": 22.068041434728716, "kl": 0.0341796875, "learning_rate": 7.925354827404941e-07, "loss": -0.0369, "reward": 1.7756726741790771, "reward_std": 0.11437465250492096, "rewards/accuracy_reward_stage2": 0.8069226741790771, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1185 }, { "completion_length": 9.0625, "epoch": 0.20781496407920097, "grad_norm": 18.76005311417912, "kl": 0.054931640625, "learning_rate": 7.923602593306466e-07, "loss": 0.022, "reward": 1.5227527618408203, "reward_std": 0.2675768733024597, "rewards/accuracy_reward_stage2": 0.5227527618408203, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1186 }, { "completion_length": 8.953125, "epoch": 0.20799018748904854, "grad_norm": 14.812565593998304, "kl": 0.042724609375, "learning_rate": 7.92185035920799e-07, "loss": -0.0246, "reward": 1.7401741743087769, "reward_std": 0.17801398038864136, "rewards/accuracy_reward_stage2": 0.7557991147041321, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1187 }, { "completion_length": 11.75, "epoch": 0.2081654108988961, "grad_norm": 21.42100080905678, "kl": 0.03857421875, "learning_rate": 7.920098125109515e-07, "loss": 0.0154, "reward": 1.7521522045135498, "reward_std": 0.2301727831363678, "rewards/accuracy_reward_stage2": 0.7521520853042603, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1188 }, { "completion_length": 16.96875, "epoch": 0.20834063430874364, "grad_norm": 25.1811684105259, "kl": 0.10009765625, "learning_rate": 7.918345891011039e-07, "loss": 0.0168, "reward": 1.161747694015503, "reward_std": 0.22458958625793457, "rewards/accuracy_reward_stage2": 0.4273727238178253, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1189 }, { "completion_length": 9.609375, "epoch": 0.2085158577185912, "grad_norm": 26.014563985480596, "kl": 0.271484375, "learning_rate": 7.916593656912563e-07, "loss": 0.0525, "reward": 1.4985435009002686, "reward_std": 0.34751009941101074, "rewards/accuracy_reward_stage2": 0.6547934412956238, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1190 }, { "completion_length": 30.4375, "epoch": 0.20869108112843876, "grad_norm": 20.238894284968815, "kl": 0.07763671875, "learning_rate": 7.914841422814088e-07, "loss": -0.0129, "reward": 1.6505463123321533, "reward_std": 0.21846714615821838, "rewards/accuracy_reward_stage2": 0.6661714315414429, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1191 }, { "completion_length": 9.4375, "epoch": 0.2088663045382863, "grad_norm": 18.85616159916294, "kl": 0.08544921875, "learning_rate": 7.913089188715612e-07, "loss": -0.0099, "reward": 1.4091585874557495, "reward_std": 0.20862269401550293, "rewards/accuracy_reward_stage2": 0.4404085874557495, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1192 }, { "completion_length": 11.34375, "epoch": 0.20904152794813388, "grad_norm": 20.685152739147835, "kl": 0.1259765625, "learning_rate": 7.911336954617136e-07, "loss": 0.0132, "reward": 1.276153326034546, "reward_std": 0.15010762214660645, "rewards/accuracy_reward_stage2": 0.5417782664299011, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1193 }, { "completion_length": 12.796875, "epoch": 0.20921675135798143, "grad_norm": 31.895420228966728, "kl": 0.1953125, "learning_rate": 7.909584720518661e-07, "loss": 0.0562, "reward": 1.4700446128845215, "reward_std": 0.21943055093288422, "rewards/accuracy_reward_stage2": 0.4856695532798767, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1194 }, { "completion_length": 11.90625, "epoch": 0.20939197476782898, "grad_norm": 22.69955377588612, "kl": 0.0439453125, "learning_rate": 7.907832486420185e-07, "loss": -0.0104, "reward": 1.5846116542816162, "reward_std": 0.22462745010852814, "rewards/accuracy_reward_stage2": 0.6002365946769714, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1195 }, { "completion_length": 19.3125, "epoch": 0.20956719817767655, "grad_norm": 34.454333953277235, "kl": 0.1357421875, "learning_rate": 7.90608025232171e-07, "loss": 0.0543, "reward": 1.2979505062103271, "reward_std": 0.309200644493103, "rewards/accuracy_reward_stage2": 0.42295050621032715, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1196 }, { "completion_length": 8.125, "epoch": 0.2097424215875241, "grad_norm": 18.722270676400463, "kl": 0.07470703125, "learning_rate": 7.904328018223234e-07, "loss": 0.03, "reward": 1.7971199750900269, "reward_std": 0.13342170417308807, "rewards/accuracy_reward_stage2": 0.7971200942993164, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1197 }, { "completion_length": 16.125, "epoch": 0.20991764499737164, "grad_norm": 24.03594003478929, "kl": 0.08447265625, "learning_rate": 7.902575784124759e-07, "loss": -0.0022, "reward": 1.4187324047088623, "reward_std": 0.24392619729042053, "rewards/accuracy_reward_stage2": 0.43435734510421753, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1198 }, { "completion_length": 8.6875, "epoch": 0.2100928684072192, "grad_norm": 25.46547914674772, "kl": 0.09716796875, "learning_rate": 7.900823550026284e-07, "loss": 0.0174, "reward": 1.6383514404296875, "reward_std": 0.3255687654018402, "rewards/accuracy_reward_stage2": 0.6539763808250427, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1199 }, { "completion_length": 7.234375, "epoch": 0.21026809181706677, "grad_norm": 16.712020493406808, "kl": 0.06298828125, "learning_rate": 7.899071315927807e-07, "loss": -0.0016, "reward": 1.3724149465560913, "reward_std": 0.25119179487228394, "rewards/accuracy_reward_stage2": 0.3880399465560913, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1200 }, { "completion_length": 10.859375, "epoch": 0.2104433152269143, "grad_norm": 22.661832595121936, "kl": 0.072265625, "learning_rate": 7.897319081829332e-07, "loss": -0.0128, "reward": 1.7846884727478027, "reward_std": 0.2079046070575714, "rewards/accuracy_reward_stage2": 0.800313413143158, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1201 }, { "completion_length": 9.421875, "epoch": 0.21061853863676186, "grad_norm": 27.33161021288942, "kl": 0.041259765625, "learning_rate": 7.895566847730857e-07, "loss": 0.0165, "reward": 1.3854670524597168, "reward_std": 0.2343364655971527, "rewards/accuracy_reward_stage2": 0.3854671120643616, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1202 }, { "completion_length": 6.78125, "epoch": 0.21079376204660943, "grad_norm": 23.866240175046933, "kl": 0.020751953125, "learning_rate": 7.893814613632381e-07, "loss": 0.0083, "reward": 1.6647517681121826, "reward_std": 0.19862306118011475, "rewards/accuracy_reward_stage2": 0.6647517085075378, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1203 }, { "completion_length": 9.125, "epoch": 0.21096898545645698, "grad_norm": 18.7644930982759, "kl": 0.07763671875, "learning_rate": 7.892062379533906e-07, "loss": 0.0309, "reward": 1.561574101448059, "reward_std": 0.16064518690109253, "rewards/accuracy_reward_stage2": 0.5615741610527039, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1204 }, { "completion_length": 8.515625, "epoch": 0.21114420886630453, "grad_norm": 16.056818653041567, "kl": 0.0419921875, "learning_rate": 7.890310145435429e-07, "loss": 0.0169, "reward": 1.705617904663086, "reward_std": 0.09264673292636871, "rewards/accuracy_reward_stage2": 0.8306180238723755, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1205 }, { "completion_length": 12.6875, "epoch": 0.2113194322761521, "grad_norm": 15.735374459387911, "kl": 0.034423828125, "learning_rate": 7.888557911336954e-07, "loss": 0.0138, "reward": 1.639738917350769, "reward_std": 0.12377573549747467, "rewards/accuracy_reward_stage2": 0.6397388577461243, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1206 }, { "completion_length": 7.40625, "epoch": 0.21149465568599965, "grad_norm": 18.19948266499784, "kl": 0.08935546875, "learning_rate": 7.886805677238479e-07, "loss": 0.0356, "reward": 1.5436455011367798, "reward_std": 0.17799049615859985, "rewards/accuracy_reward_stage2": 0.5436455011367798, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1207 }, { "completion_length": 10.875, "epoch": 0.2116698790958472, "grad_norm": 18.79429944475187, "kl": 0.11865234375, "learning_rate": 7.885053443140003e-07, "loss": 0.0174, "reward": 1.5807538032531738, "reward_std": 0.2054576575756073, "rewards/accuracy_reward_stage2": 0.5963788628578186, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1208 }, { "completion_length": 12.5625, "epoch": 0.21184510250569477, "grad_norm": 24.119986223232797, "kl": 0.16796875, "learning_rate": 7.883301209041528e-07, "loss": 0.0291, "reward": 1.428783655166626, "reward_std": 0.3158418834209442, "rewards/accuracy_reward_stage2": 0.44440874457359314, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1209 }, { "completion_length": 18.4375, "epoch": 0.21202032591554232, "grad_norm": 20.70340367230994, "kl": 0.11181640625, "learning_rate": 7.881548974943052e-07, "loss": 0.0446, "reward": 1.3978643417358398, "reward_std": 0.23703636229038239, "rewards/accuracy_reward_stage2": 0.39786434173583984, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1210 }, { "completion_length": 11.09375, "epoch": 0.21219554932538987, "grad_norm": 17.57744479600521, "kl": 0.08642578125, "learning_rate": 7.879796740844576e-07, "loss": -0.0095, "reward": 1.2551759481430054, "reward_std": 0.1964961737394333, "rewards/accuracy_reward_stage2": 0.27080094814300537, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1211 }, { "completion_length": 6.140625, "epoch": 0.21237077273523744, "grad_norm": 14.552088548686209, "kl": 0.0260009765625, "learning_rate": 7.878044506746101e-07, "loss": -0.023, "reward": 1.8353174924850464, "reward_std": 0.20921599864959717, "rewards/accuracy_reward_stage2": 0.8509424924850464, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1212 }, { "completion_length": 15.125, "epoch": 0.212545996145085, "grad_norm": 13.133946279031946, "kl": 0.022705078125, "learning_rate": 7.876292272647625e-07, "loss": -0.0351, "reward": 1.4045956134796143, "reward_std": 0.1509314924478531, "rewards/accuracy_reward_stage2": 0.42022058367729187, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1213 }, { "completion_length": 10.421875, "epoch": 0.21272121955493253, "grad_norm": 30.50049286912514, "kl": 0.04736328125, "learning_rate": 7.87454003854915e-07, "loss": 0.0189, "reward": 1.411908507347107, "reward_std": 0.18127146363258362, "rewards/accuracy_reward_stage2": 0.41190850734710693, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1214 }, { "completion_length": 15.3125, "epoch": 0.2128964429647801, "grad_norm": 18.48778725455288, "kl": 0.0269775390625, "learning_rate": 7.872787804450675e-07, "loss": -0.0334, "reward": 1.590174913406372, "reward_std": 0.20116549730300903, "rewards/accuracy_reward_stage2": 0.6057999134063721, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1215 }, { "completion_length": 11.296875, "epoch": 0.21307166637462766, "grad_norm": 20.392004552064943, "kl": 0.134765625, "learning_rate": 7.871035570352199e-07, "loss": 0.0098, "reward": 1.435154914855957, "reward_std": 0.1450117528438568, "rewards/accuracy_reward_stage2": 0.4507799446582794, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1216 }, { "completion_length": 12.21875, "epoch": 0.2132468897844752, "grad_norm": 13.498308548221969, "kl": 0.12060546875, "learning_rate": 7.869283336253724e-07, "loss": 0.0171, "reward": 1.0430498123168945, "reward_std": 0.09386852383613586, "rewards/accuracy_reward_stage2": 0.18367479741573334, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1217 }, { "completion_length": 8.34375, "epoch": 0.21342211319432275, "grad_norm": 22.306457599821844, "kl": 0.2197265625, "learning_rate": 7.867531102155247e-07, "loss": 0.0879, "reward": 1.6171928644180298, "reward_std": 0.2317761927843094, "rewards/accuracy_reward_stage2": 0.6171928644180298, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1218 }, { "completion_length": 9.546875, "epoch": 0.21359733660417032, "grad_norm": 11.867011647782718, "kl": 0.044189453125, "learning_rate": 7.865778868056771e-07, "loss": 0.0177, "reward": 1.38582181930542, "reward_std": 0.06935185939073563, "rewards/accuracy_reward_stage2": 0.5108217000961304, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1219 }, { "completion_length": 10.890625, "epoch": 0.21377256001401787, "grad_norm": 15.016949281265788, "kl": 0.051513671875, "learning_rate": 7.864026633958296e-07, "loss": 0.0206, "reward": 1.1094141006469727, "reward_std": 0.1364666223526001, "rewards/accuracy_reward_stage2": 0.10941408574581146, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1220 }, { "completion_length": 20.1875, "epoch": 0.21394778342386542, "grad_norm": 18.42504754043461, "kl": 0.068359375, "learning_rate": 7.86227439985982e-07, "loss": -0.0826, "reward": 1.5647317171096802, "reward_std": 0.17734460532665253, "rewards/accuracy_reward_stage2": 0.6116067171096802, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1221 }, { "completion_length": 8.0625, "epoch": 0.214123006833713, "grad_norm": 21.446315689231362, "kl": 0.032470703125, "learning_rate": 7.860522165761345e-07, "loss": 0.013, "reward": 1.7758066654205322, "reward_std": 0.19553758203983307, "rewards/accuracy_reward_stage2": 0.7758066058158875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1222 }, { "completion_length": 14.265625, "epoch": 0.21429823024356054, "grad_norm": 23.97497416542969, "kl": 0.07666015625, "learning_rate": 7.85876993166287e-07, "loss": 0.0306, "reward": 1.5564537048339844, "reward_std": 0.21574008464813232, "rewards/accuracy_reward_stage2": 0.5564536452293396, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1223 }, { "completion_length": 10.875, "epoch": 0.21447345365340809, "grad_norm": 18.180349359021452, "kl": 0.10205078125, "learning_rate": 7.857017697564394e-07, "loss": 0.0407, "reward": 1.3234200477600098, "reward_std": 0.18061794340610504, "rewards/accuracy_reward_stage2": 0.44842010736465454, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1224 }, { "completion_length": 13.578125, "epoch": 0.21464867706325566, "grad_norm": 18.325128493418834, "kl": 0.07373046875, "learning_rate": 7.855265463465919e-07, "loss": 0.0296, "reward": 1.7562847137451172, "reward_std": 0.22711655497550964, "rewards/accuracy_reward_stage2": 0.7562847137451172, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1225 }, { "completion_length": 11.859375, "epoch": 0.2148239004731032, "grad_norm": 17.05566843473295, "kl": 0.040283203125, "learning_rate": 7.853513229367444e-07, "loss": -0.0987, "reward": 1.5513169765472412, "reward_std": 0.16350050270557404, "rewards/accuracy_reward_stage2": 0.7231919765472412, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1226 }, { "completion_length": 12.59375, "epoch": 0.21499912388295075, "grad_norm": 26.692888583630907, "kl": 0.296875, "learning_rate": 7.851760995268968e-07, "loss": 0.114, "reward": 1.2322908639907837, "reward_std": 0.3121843636035919, "rewards/accuracy_reward_stage2": 0.4822908937931061, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1227 }, { "completion_length": 9.828125, "epoch": 0.21517434729279833, "grad_norm": 21.447089137118855, "kl": 0.0947265625, "learning_rate": 7.850008761170493e-07, "loss": 0.0378, "reward": 1.6512415409088135, "reward_std": 0.2510579824447632, "rewards/accuracy_reward_stage2": 0.6512414813041687, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1228 }, { "completion_length": 13.859375, "epoch": 0.21534957070264588, "grad_norm": 64.9879149157075, "kl": 0.33203125, "learning_rate": 7.848256527072016e-07, "loss": 0.1329, "reward": 1.4411249160766602, "reward_std": 0.1767362356185913, "rewards/accuracy_reward_stage2": 0.6911249160766602, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1229 }, { "completion_length": 9.421875, "epoch": 0.21552479411249342, "grad_norm": 73.17992385714928, "kl": 0.404296875, "learning_rate": 7.846504292973541e-07, "loss": 0.0985, "reward": 1.5135424137115479, "reward_std": 0.28798261284828186, "rewards/accuracy_reward_stage2": 0.6697924733161926, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1230 }, { "completion_length": 6.25, "epoch": 0.215700017522341, "grad_norm": 17.784801022236678, "kl": 0.1298828125, "learning_rate": 7.844752058875065e-07, "loss": 0.008, "reward": 1.4939236640930176, "reward_std": 0.21711787581443787, "rewards/accuracy_reward_stage2": 0.5095486640930176, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1231 }, { "completion_length": 8.671875, "epoch": 0.21587524093218854, "grad_norm": 24.758583579901067, "kl": 0.1044921875, "learning_rate": 7.842999824776589e-07, "loss": -0.0597, "reward": 1.3905413150787354, "reward_std": 0.24760988354682922, "rewards/accuracy_reward_stage2": 0.4374162256717682, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1232 }, { "completion_length": 11.640625, "epoch": 0.2160504643420361, "grad_norm": 21.77876149803177, "kl": 0.20703125, "learning_rate": 7.841247590678114e-07, "loss": 0.0494, "reward": 1.3181720972061157, "reward_std": 0.32347288727760315, "rewards/accuracy_reward_stage2": 0.4587971866130829, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1233 }, { "completion_length": 7.390625, "epoch": 0.21622568775188364, "grad_norm": 22.39564970611295, "kl": 0.1357421875, "learning_rate": 7.839495356579638e-07, "loss": 0.0376, "reward": 1.6510417461395264, "reward_std": 0.22779880464076996, "rewards/accuracy_reward_stage2": 0.6666666865348816, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1234 }, { "completion_length": 8.265625, "epoch": 0.2164009111617312, "grad_norm": 25.959874464914165, "kl": 0.027099609375, "learning_rate": 7.837743122481163e-07, "loss": 0.0109, "reward": 1.3759396076202393, "reward_std": 0.29031646251678467, "rewards/accuracy_reward_stage2": 0.3759395480155945, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1235 }, { "completion_length": 7.234375, "epoch": 0.21657613457157876, "grad_norm": 20.21760185077339, "kl": 0.1298828125, "learning_rate": 7.835990888382688e-07, "loss": 0.0519, "reward": 1.5539495944976807, "reward_std": 0.2241932451725006, "rewards/accuracy_reward_stage2": 0.5539496541023254, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1236 }, { "completion_length": 20.5625, "epoch": 0.2167513579814263, "grad_norm": 35.69379614088005, "kl": 0.142578125, "learning_rate": 7.834238654284212e-07, "loss": 0.0634, "reward": 1.6434786319732666, "reward_std": 0.18241646885871887, "rewards/accuracy_reward_stage2": 0.7684785723686218, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1237 }, { "completion_length": 11.265625, "epoch": 0.21692658139127388, "grad_norm": 14.892077914037673, "kl": 0.044921875, "learning_rate": 7.832486420185737e-07, "loss": 0.018, "reward": 1.382194995880127, "reward_std": 0.12057159096002579, "rewards/accuracy_reward_stage2": 0.3821950852870941, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1238 }, { "completion_length": 8.421875, "epoch": 0.21710180480112143, "grad_norm": 17.29611686120686, "kl": 0.09765625, "learning_rate": 7.830734186087262e-07, "loss": 0.039, "reward": 1.4690710306167603, "reward_std": 0.15366077423095703, "rewards/accuracy_reward_stage2": 0.46907100081443787, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1239 }, { "completion_length": 12.59375, "epoch": 0.21727702821096898, "grad_norm": 15.11141683071002, "kl": 0.10009765625, "learning_rate": 7.828981951988785e-07, "loss": -0.0029, "reward": 1.627720832824707, "reward_std": 0.15333834290504456, "rewards/accuracy_reward_stage2": 0.643345832824707, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1240 }, { "completion_length": 14.609375, "epoch": 0.21745225162081655, "grad_norm": 21.22067711348212, "kl": 0.024169921875, "learning_rate": 7.82722971789031e-07, "loss": 0.0097, "reward": 1.6614583730697632, "reward_std": 0.1928693801164627, "rewards/accuracy_reward_stage2": 0.6614582538604736, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1241 }, { "completion_length": 7.609375, "epoch": 0.2176274750306641, "grad_norm": 13.858555219264197, "kl": 0.0849609375, "learning_rate": 7.825477483791834e-07, "loss": -0.0452, "reward": 1.7296041250228882, "reward_std": 0.1613890528678894, "rewards/accuracy_reward_stage2": 0.760854184627533, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1242 }, { "completion_length": 10.5, "epoch": 0.21780269844051164, "grad_norm": 23.196623299024466, "kl": 0.208984375, "learning_rate": 7.823725249693359e-07, "loss": 0.0503, "reward": 1.2230807542800903, "reward_std": 0.24334552884101868, "rewards/accuracy_reward_stage2": 0.36370575428009033, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1243 }, { "completion_length": 14.28125, "epoch": 0.21797792185035922, "grad_norm": 12.800626978253057, "kl": 0.08154296875, "learning_rate": 7.821973015594883e-07, "loss": -0.0013, "reward": 1.4294836521148682, "reward_std": 0.10467779636383057, "rewards/accuracy_reward_stage2": 0.6951085925102234, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1244 }, { "completion_length": 34.484375, "epoch": 0.21815314526020677, "grad_norm": 11.287570658853458, "kl": 0.0517578125, "learning_rate": 7.820220781496407e-07, "loss": 0.0207, "reward": 1.5413293838500977, "reward_std": 0.17220129072666168, "rewards/accuracy_reward_stage2": 0.5413292646408081, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1245 }, { "completion_length": 8.890625, "epoch": 0.2183283686700543, "grad_norm": 86.19789461067052, "kl": 0.1123046875, "learning_rate": 7.818468547397932e-07, "loss": 0.0449, "reward": 1.6259727478027344, "reward_std": 0.21742461621761322, "rewards/accuracy_reward_stage2": 0.6259727478027344, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1246 }, { "completion_length": 12.125, "epoch": 0.2185035920799019, "grad_norm": 19.16149566715807, "kl": 0.06787109375, "learning_rate": 7.816716313299457e-07, "loss": -0.0171, "reward": 1.2638311386108398, "reward_std": 0.20646998286247253, "rewards/accuracy_reward_stage2": 0.2794560194015503, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1247 }, { "completion_length": 14.609375, "epoch": 0.21867881548974943, "grad_norm": 25.56266166845523, "kl": 0.059814453125, "learning_rate": 7.814964079200981e-07, "loss": 0.024, "reward": 1.5029523372650146, "reward_std": 0.205928772687912, "rewards/accuracy_reward_stage2": 0.5029522776603699, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1248 }, { "completion_length": 13.171875, "epoch": 0.21885403889959698, "grad_norm": 17.61052129867045, "kl": 0.10400390625, "learning_rate": 7.813211845102505e-07, "loss": -0.0468, "reward": 1.2945280075073242, "reward_std": 0.2459794282913208, "rewards/accuracy_reward_stage2": 0.3257780075073242, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1249 }, { "completion_length": 11.71875, "epoch": 0.21902926230944456, "grad_norm": 11.221758396438073, "kl": 0.0120849609375, "learning_rate": 7.811459611004029e-07, "loss": 0.0048, "reward": 1.6181175708770752, "reward_std": 0.01946648210287094, "rewards/accuracy_reward_stage2": 0.6181175708770752, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1250 }, { "completion_length": 7.0625, "epoch": 0.2192044857192921, "grad_norm": 16.126607670282013, "kl": 0.07958984375, "learning_rate": 7.809707376905554e-07, "loss": -0.0123, "reward": 1.7962268590927124, "reward_std": 0.10672344267368317, "rewards/accuracy_reward_stage2": 0.8118518590927124, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1251 }, { "completion_length": 12.671875, "epoch": 0.21937970912913965, "grad_norm": 29.519790301106305, "kl": 0.0277099609375, "learning_rate": 7.807955142807079e-07, "loss": 0.0111, "reward": 1.421875, "reward_std": 0.38664889335632324, "rewards/accuracy_reward_stage2": 0.421875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1252 }, { "completion_length": 9.4375, "epoch": 0.2195549325389872, "grad_norm": 31.182060614934088, "kl": 0.056884765625, "learning_rate": 7.806202908708603e-07, "loss": 0.0228, "reward": 1.4860175848007202, "reward_std": 0.2832135558128357, "rewards/accuracy_reward_stage2": 0.4860175848007202, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1253 }, { "completion_length": 7.921875, "epoch": 0.21973015594883477, "grad_norm": 11.564218986630497, "kl": 0.033935546875, "learning_rate": 7.804450674610128e-07, "loss": 0.0136, "reward": 1.6184473037719727, "reward_std": 0.11314624547958374, "rewards/accuracy_reward_stage2": 0.6184473633766174, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1254 }, { "completion_length": 11.9375, "epoch": 0.21990537935868232, "grad_norm": 6.513019818636513, "kl": 0.039306640625, "learning_rate": 7.802698440511653e-07, "loss": 0.0158, "reward": 1.671875, "reward_std": 0.0646936446428299, "rewards/accuracy_reward_stage2": 0.671875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1255 }, { "completion_length": 9.203125, "epoch": 0.22008060276852986, "grad_norm": 20.747896613755444, "kl": 0.059814453125, "learning_rate": 7.800946206413176e-07, "loss": 0.024, "reward": 1.3773555755615234, "reward_std": 0.25220632553100586, "rewards/accuracy_reward_stage2": 0.3773554861545563, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1256 }, { "completion_length": 9.265625, "epoch": 0.22025582617837744, "grad_norm": 24.392216642627467, "kl": 0.1396484375, "learning_rate": 7.799193972314701e-07, "loss": 0.0117, "reward": 1.505408763885498, "reward_std": 0.16076895594596863, "rewards/accuracy_reward_stage2": 0.6460338234901428, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1257 }, { "completion_length": 11.015625, "epoch": 0.220431049588225, "grad_norm": 187.6336735161918, "kl": 1.0, "learning_rate": 7.797441738216225e-07, "loss": 0.3622, "reward": 1.6911745071411133, "reward_std": 0.21796134114265442, "rewards/accuracy_reward_stage2": 0.8317995071411133, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1258 }, { "completion_length": 9.1875, "epoch": 0.22060627299807253, "grad_norm": 17.763683470304855, "kl": 0.10595703125, "learning_rate": 7.795689504117749e-07, "loss": 0.0424, "reward": 1.6380715370178223, "reward_std": 0.17473512887954712, "rewards/accuracy_reward_stage2": 0.6380715370178223, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1259 }, { "completion_length": 16.09375, "epoch": 0.2207814964079201, "grad_norm": 18.883167626689094, "kl": 0.09521484375, "learning_rate": 7.793937270019274e-07, "loss": 0.0012, "reward": 1.2897546291351318, "reward_std": 0.21671342849731445, "rewards/accuracy_reward_stage2": 0.30537962913513184, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1260 }, { "completion_length": 7.859375, "epoch": 0.22095671981776766, "grad_norm": 24.10659942453599, "kl": 0.09375, "learning_rate": 7.792185035920798e-07, "loss": 0.0375, "reward": 1.5981502532958984, "reward_std": 0.27921926975250244, "rewards/accuracy_reward_stage2": 0.5981503129005432, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1261 }, { "completion_length": 7.390625, "epoch": 0.2211319432276152, "grad_norm": 14.815017372927116, "kl": 0.05908203125, "learning_rate": 7.790432801822323e-07, "loss": 0.0237, "reward": 1.590078592300415, "reward_std": 0.21127043664455414, "rewards/accuracy_reward_stage2": 0.590078592300415, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1262 }, { "completion_length": 8.875, "epoch": 0.22130716663746278, "grad_norm": 18.08682737782561, "kl": 0.10302734375, "learning_rate": 7.788680567723848e-07, "loss": 0.0014, "reward": 1.4229505062103271, "reward_std": 0.25344976782798767, "rewards/accuracy_reward_stage2": 0.5635755062103271, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1263 }, { "completion_length": 9.8125, "epoch": 0.22148239004731032, "grad_norm": 17.866828523480077, "kl": 0.06103515625, "learning_rate": 7.786928333625372e-07, "loss": 0.0244, "reward": 1.3703603744506836, "reward_std": 0.19101378321647644, "rewards/accuracy_reward_stage2": 0.495360404253006, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1264 }, { "completion_length": 12.46875, "epoch": 0.22165761345715787, "grad_norm": 17.874063479543306, "kl": 0.1728515625, "learning_rate": 7.785176099526897e-07, "loss": 0.0245, "reward": 1.5815012454986572, "reward_std": 0.23980316519737244, "rewards/accuracy_reward_stage2": 0.7221262454986572, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1265 }, { "completion_length": 18.40625, "epoch": 0.22183283686700545, "grad_norm": 25.11799851246929, "kl": 0.1953125, "learning_rate": 7.783423865428421e-07, "loss": 0.0466, "reward": 1.4459002017974854, "reward_std": 0.24602550268173218, "rewards/accuracy_reward_stage2": 0.5865253210067749, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1266 }, { "completion_length": 13.265625, "epoch": 0.222008060276853, "grad_norm": 21.582078379159743, "kl": 0.07275390625, "learning_rate": 7.781671631329946e-07, "loss": 0.029, "reward": 1.214440107345581, "reward_std": 0.16473287343978882, "rewards/accuracy_reward_stage2": 0.3394400477409363, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1267 }, { "completion_length": 9.75, "epoch": 0.22218328368670054, "grad_norm": 24.881310227565013, "kl": 0.62890625, "learning_rate": 7.779919397231471e-07, "loss": 0.2498, "reward": 1.3854460716247559, "reward_std": 0.29034459590911865, "rewards/accuracy_reward_stage2": 0.5104460716247559, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1268 }, { "completion_length": 7.34375, "epoch": 0.22235850709654809, "grad_norm": 14.940564340974653, "kl": 0.0810546875, "learning_rate": 7.778167163132993e-07, "loss": 0.0323, "reward": 1.6923959255218506, "reward_std": 0.09215311706066132, "rewards/accuracy_reward_stage2": 0.6923958659172058, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1269 }, { "completion_length": 12.25, "epoch": 0.22253373050639566, "grad_norm": 14.231371081239205, "kl": 0.045654296875, "learning_rate": 7.776414929034518e-07, "loss": 0.0182, "reward": 1.698338270187378, "reward_std": 0.13502109050750732, "rewards/accuracy_reward_stage2": 0.6983382701873779, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1270 }, { "completion_length": 17.015625, "epoch": 0.2227089539162432, "grad_norm": 17.951746853772647, "kl": 0.0537109375, "learning_rate": 7.774662694936043e-07, "loss": 0.0215, "reward": 1.1886450052261353, "reward_std": 0.17170041799545288, "rewards/accuracy_reward_stage2": 0.31364506483078003, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1271 }, { "completion_length": 13.234375, "epoch": 0.22288417732609075, "grad_norm": 20.725766439084733, "kl": 0.08154296875, "learning_rate": 7.772910460837567e-07, "loss": 0.0159, "reward": 1.6198360919952393, "reward_std": 0.25739431381225586, "rewards/accuracy_reward_stage2": 0.6354610919952393, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1272 }, { "completion_length": 7.28125, "epoch": 0.22305940073593833, "grad_norm": 23.434106099546728, "kl": 0.1669921875, "learning_rate": 7.771158226739092e-07, "loss": 0.0227, "reward": 1.688348650932312, "reward_std": 0.2744218707084656, "rewards/accuracy_reward_stage2": 0.719598650932312, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1273 }, { "completion_length": 11.734375, "epoch": 0.22323462414578588, "grad_norm": 23.463084611699912, "kl": 0.0498046875, "learning_rate": 7.769405992640616e-07, "loss": -0.0242, "reward": 1.5922805070877075, "reward_std": 0.2871755361557007, "rewards/accuracy_reward_stage2": 0.7329055666923523, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1274 }, { "completion_length": 6.921875, "epoch": 0.22340984755563342, "grad_norm": 22.14878463098126, "kl": 0.134765625, "learning_rate": 7.767653758542141e-07, "loss": 0.0096, "reward": 1.6665723323822021, "reward_std": 0.27279797196388245, "rewards/accuracy_reward_stage2": 0.6821973323822021, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1275 }, { "completion_length": 6.96875, "epoch": 0.223585070965481, "grad_norm": 17.98879927843346, "kl": 0.09619140625, "learning_rate": 7.765901524443666e-07, "loss": -0.0058, "reward": 1.4087541103363037, "reward_std": 0.24091657996177673, "rewards/accuracy_reward_stage2": 0.4243791103363037, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1276 }, { "completion_length": 13.03125, "epoch": 0.22376029437532854, "grad_norm": 22.424179520437004, "kl": 0.080078125, "learning_rate": 7.76414929034519e-07, "loss": 0.011, "reward": 1.6969799995422363, "reward_std": 0.1915295571088791, "rewards/accuracy_reward_stage2": 0.7126048803329468, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1277 }, { "completion_length": 7.765625, "epoch": 0.2239355177851761, "grad_norm": 17.401912665765387, "kl": 0.091796875, "learning_rate": 7.762397056246715e-07, "loss": -0.0055, "reward": 1.5669753551483154, "reward_std": 0.09806131571531296, "rewards/accuracy_reward_stage2": 0.5826001763343811, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1278 }, { "completion_length": 16.234375, "epoch": 0.22411074119502367, "grad_norm": 19.61910716871486, "kl": 0.08349609375, "learning_rate": 7.76064482214824e-07, "loss": -0.0385, "reward": 1.4222311973571777, "reward_std": 0.29086655378341675, "rewards/accuracy_reward_stage2": 0.45348113775253296, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1279 }, { "completion_length": 15.765625, "epoch": 0.2242859646048712, "grad_norm": 20.170390141515178, "kl": 0.384765625, "learning_rate": 7.758892588049763e-07, "loss": 0.1094, "reward": 1.4278593063354492, "reward_std": 0.14910349249839783, "rewards/accuracy_reward_stage2": 0.5684843063354492, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1280 }, { "completion_length": 11.40625, "epoch": 0.22446118801471876, "grad_norm": 17.25825844167911, "kl": 0.0615234375, "learning_rate": 7.757140353951288e-07, "loss": -0.0195, "reward": 1.255530595779419, "reward_std": 0.2321069836616516, "rewards/accuracy_reward_stage2": 0.39615553617477417, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1281 }, { "completion_length": 10.6875, "epoch": 0.22463641142456633, "grad_norm": 16.864030485310032, "kl": 0.0673828125, "learning_rate": 7.755388119852811e-07, "loss": -0.0173, "reward": 1.738398551940918, "reward_std": 0.1555314064025879, "rewards/accuracy_reward_stage2": 0.7540234923362732, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1282 }, { "completion_length": 9.59375, "epoch": 0.22481163483441388, "grad_norm": 21.18331323146374, "kl": 0.14453125, "learning_rate": 7.753635885754336e-07, "loss": 0.0577, "reward": 1.6540005207061768, "reward_std": 0.1803339272737503, "rewards/accuracy_reward_stage2": 0.6540004014968872, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1283 }, { "completion_length": 12.9375, "epoch": 0.22498685824426143, "grad_norm": 23.49818812479695, "kl": 0.056884765625, "learning_rate": 7.751883651655861e-07, "loss": -0.0179, "reward": 1.664846658706665, "reward_std": 0.19526709616184235, "rewards/accuracy_reward_stage2": 0.680471658706665, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1284 }, { "completion_length": 13.890625, "epoch": 0.22516208165410898, "grad_norm": 23.28590729981405, "kl": 0.057373046875, "learning_rate": 7.750131417557385e-07, "loss": -0.0213, "reward": 1.4284430742263794, "reward_std": 0.2543635070323944, "rewards/accuracy_reward_stage2": 0.4440680146217346, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1285 }, { "completion_length": 22.96875, "epoch": 0.22533730506395655, "grad_norm": 19.21656824927377, "kl": 0.09912109375, "learning_rate": 7.74837918345891e-07, "loss": 0.0395, "reward": 1.3398176431655884, "reward_std": 0.16608867049217224, "rewards/accuracy_reward_stage2": 0.4648175835609436, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1286 }, { "completion_length": 13.171875, "epoch": 0.2255125284738041, "grad_norm": 23.36717503355835, "kl": 0.083984375, "learning_rate": 7.746626949360435e-07, "loss": 0.0335, "reward": 1.4650541543960571, "reward_std": 0.22038257122039795, "rewards/accuracy_reward_stage2": 0.46505406498908997, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1287 }, { "completion_length": 11.65625, "epoch": 0.22568775188365164, "grad_norm": 19.443791282030777, "kl": 0.0203857421875, "learning_rate": 7.744874715261959e-07, "loss": 0.0081, "reward": 1.7337589263916016, "reward_std": 0.21359241008758545, "rewards/accuracy_reward_stage2": 0.7337589263916016, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1288 }, { "completion_length": 12.578125, "epoch": 0.22586297529349922, "grad_norm": 14.696962381102287, "kl": 0.0546875, "learning_rate": 7.743122481163483e-07, "loss": 0.0219, "reward": 1.4507033824920654, "reward_std": 0.11517933756113052, "rewards/accuracy_reward_stage2": 0.45070335268974304, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1289 }, { "completion_length": 12.140625, "epoch": 0.22603819870334677, "grad_norm": 18.039059813289775, "kl": 0.06591796875, "learning_rate": 7.741370247065007e-07, "loss": 0.0135, "reward": 1.5439950227737427, "reward_std": 0.13915899395942688, "rewards/accuracy_reward_stage2": 0.6689950227737427, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1290 }, { "completion_length": 13.875, "epoch": 0.2262134221131943, "grad_norm": 14.804979613619244, "kl": 0.05859375, "learning_rate": 7.739618012966532e-07, "loss": 0.0234, "reward": 1.4256266355514526, "reward_std": 0.12437914311885834, "rewards/accuracy_reward_stage2": 0.42562660574913025, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1291 }, { "completion_length": 8.71875, "epoch": 0.2263886455230419, "grad_norm": 27.679772931226807, "kl": 0.251953125, "learning_rate": 7.737865778868057e-07, "loss": 0.1072, "reward": 1.5422618389129639, "reward_std": 0.14960823953151703, "rewards/accuracy_reward_stage2": 0.7922618985176086, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1292 }, { "completion_length": 8.796875, "epoch": 0.22656386893288943, "grad_norm": 16.907333227979905, "kl": 0.12109375, "learning_rate": 7.736113544769581e-07, "loss": 0.014, "reward": 1.551778793334961, "reward_std": 0.12833553552627563, "rewards/accuracy_reward_stage2": 0.5674037337303162, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1293 }, { "completion_length": 7.765625, "epoch": 0.22673909234273698, "grad_norm": 20.22861234970417, "kl": 0.048583984375, "learning_rate": 7.734361310671105e-07, "loss": 0.0195, "reward": 1.5390467643737793, "reward_std": 0.2336047738790512, "rewards/accuracy_reward_stage2": 0.6640467047691345, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1294 }, { "completion_length": 11.203125, "epoch": 0.22691431575258456, "grad_norm": 26.05748474925187, "kl": 0.07666015625, "learning_rate": 7.732609076572629e-07, "loss": 0.014, "reward": 1.462762713432312, "reward_std": 0.2649151384830475, "rewards/accuracy_reward_stage2": 0.603387713432312, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1295 }, { "completion_length": 11.0, "epoch": 0.2270895391624321, "grad_norm": 25.09086900081551, "kl": 0.57421875, "learning_rate": 7.730856842474154e-07, "loss": 0.2078, "reward": 1.268276572227478, "reward_std": 0.2907959818840027, "rewards/accuracy_reward_stage2": 0.518276572227478, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1296 }, { "completion_length": 13.515625, "epoch": 0.22726476257227965, "grad_norm": 19.14340541773585, "kl": 0.037109375, "learning_rate": 7.729104608375679e-07, "loss": 0.0149, "reward": 1.7883508205413818, "reward_std": 0.16622239351272583, "rewards/accuracy_reward_stage2": 0.7883508801460266, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1297 }, { "completion_length": 11.15625, "epoch": 0.22743998598212722, "grad_norm": 21.16259113723349, "kl": 0.0751953125, "learning_rate": 7.727352374277202e-07, "loss": -0.0475, "reward": 1.6002087593078613, "reward_std": 0.25808942317962646, "rewards/accuracy_reward_stage2": 0.6314586997032166, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1298 }, { "completion_length": 10.765625, "epoch": 0.22761520939197477, "grad_norm": 19.104041721217694, "kl": 0.1318359375, "learning_rate": 7.725600140178727e-07, "loss": 0.0527, "reward": 1.3541667461395264, "reward_std": 0.26745420694351196, "rewards/accuracy_reward_stage2": 0.4791666567325592, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1299 }, { "completion_length": 9.3125, "epoch": 0.22779043280182232, "grad_norm": 9.065751220738832, "kl": 0.00518798828125, "learning_rate": 7.723847906080252e-07, "loss": 0.0021, "reward": 1.7539682388305664, "reward_std": 0.01122391689568758, "rewards/accuracy_reward_stage2": 0.7539682388305664, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1300 }, { "completion_length": 10.796875, "epoch": 0.2279656562116699, "grad_norm": 16.62573543229576, "kl": 0.07958984375, "learning_rate": 7.722095671981776e-07, "loss": 0.0003, "reward": 1.459397792816162, "reward_std": 0.28754448890686035, "rewards/accuracy_reward_stage2": 0.6000228524208069, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1301 }, { "completion_length": 13.859375, "epoch": 0.22814087962151744, "grad_norm": 18.95066625577687, "kl": 0.08203125, "learning_rate": 7.720343437883301e-07, "loss": 0.0327, "reward": 1.5195292234420776, "reward_std": 0.1240844875574112, "rewards/accuracy_reward_stage2": 0.5195292234420776, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1302 }, { "completion_length": 7.234375, "epoch": 0.228316103031365, "grad_norm": 27.896227499752165, "kl": 0.022216796875, "learning_rate": 7.718591203784826e-07, "loss": 0.0089, "reward": 1.6503667831420898, "reward_std": 0.19524559378623962, "rewards/accuracy_reward_stage2": 0.7753667831420898, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1303 }, { "completion_length": 9.5625, "epoch": 0.22849132644121253, "grad_norm": 21.168475667163364, "kl": 0.138671875, "learning_rate": 7.71683896968635e-07, "loss": -0.0267, "reward": 1.5251177549362183, "reward_std": 0.23324428498744965, "rewards/accuracy_reward_stage2": 0.5563677549362183, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1304 }, { "completion_length": 11.03125, "epoch": 0.2286665498510601, "grad_norm": 17.068660299280747, "kl": 0.1181640625, "learning_rate": 7.715086735587875e-07, "loss": 0.0471, "reward": 1.2996182441711426, "reward_std": 0.19639158248901367, "rewards/accuracy_reward_stage2": 0.2996181845664978, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1305 }, { "completion_length": 10.859375, "epoch": 0.22884177326090765, "grad_norm": 22.08925315478146, "kl": 0.2216796875, "learning_rate": 7.713334501489399e-07, "loss": 0.0091, "reward": 1.5402624607086182, "reward_std": 0.22633978724479675, "rewards/accuracy_reward_stage2": 0.5871374607086182, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1306 }, { "completion_length": 13.625, "epoch": 0.2290169966707552, "grad_norm": 18.914502186490715, "kl": 0.0888671875, "learning_rate": 7.711582267390923e-07, "loss": -0.0086, "reward": 1.4000566005706787, "reward_std": 0.14851221442222595, "rewards/accuracy_reward_stage2": 0.5406815409660339, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1307 }, { "completion_length": 13.140625, "epoch": 0.22919222008060278, "grad_norm": 24.534198975775887, "kl": 0.058837890625, "learning_rate": 7.709830033292448e-07, "loss": 0.0236, "reward": 1.3969494104385376, "reward_std": 0.3108880817890167, "rewards/accuracy_reward_stage2": 0.3969494104385376, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1308 }, { "completion_length": 15.03125, "epoch": 0.22936744349045032, "grad_norm": 24.113349660176898, "kl": 0.06982421875, "learning_rate": 7.708077799193971e-07, "loss": -0.0163, "reward": 1.5292431116104126, "reward_std": 0.2674151659011841, "rewards/accuracy_reward_stage2": 0.5448680520057678, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1309 }, { "completion_length": 11.625, "epoch": 0.22954266690029787, "grad_norm": 17.911011858964358, "kl": 0.0888671875, "learning_rate": 7.706325565095496e-07, "loss": -0.0087, "reward": 1.62459397315979, "reward_std": 0.20538245141506195, "rewards/accuracy_reward_stage2": 0.6402188539505005, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1310 }, { "completion_length": 6.0625, "epoch": 0.22971789031014545, "grad_norm": 19.31570319495111, "kl": 0.0546875, "learning_rate": 7.70457333099702e-07, "loss": -0.0115, "reward": 1.784255862236023, "reward_std": 0.25300198793411255, "rewards/accuracy_reward_stage2": 0.799880862236023, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1311 }, { "completion_length": 10.828125, "epoch": 0.229893113719993, "grad_norm": 20.540568906206264, "kl": 0.05224609375, "learning_rate": 7.702821096898545e-07, "loss": 0.0209, "reward": 1.6209710836410522, "reward_std": 0.1423826813697815, "rewards/accuracy_reward_stage2": 0.6209710836410522, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1312 }, { "completion_length": 9.703125, "epoch": 0.23006833712984054, "grad_norm": 28.448715831191063, "kl": 0.11083984375, "learning_rate": 7.70106886280007e-07, "loss": 0.0443, "reward": 1.6625604629516602, "reward_std": 0.336778849363327, "rewards/accuracy_reward_stage2": 0.6625604629516602, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1313 }, { "completion_length": 9.75, "epoch": 0.2302435605396881, "grad_norm": 23.162356470137542, "kl": 0.150390625, "learning_rate": 7.699316628701594e-07, "loss": 0.0108, "reward": 1.3917927742004395, "reward_std": 0.36611586809158325, "rewards/accuracy_reward_stage2": 0.5480427742004395, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1314 }, { "completion_length": 8.015625, "epoch": 0.23041878394953566, "grad_norm": 18.946663303104412, "kl": 0.041259765625, "learning_rate": 7.697564394603119e-07, "loss": -0.0277, "reward": 1.6170215606689453, "reward_std": 0.20247286558151245, "rewards/accuracy_reward_stage2": 0.6326465606689453, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1315 }, { "completion_length": 15.703125, "epoch": 0.2305940073593832, "grad_norm": 24.83974962007514, "kl": 0.046142578125, "learning_rate": 7.695812160504644e-07, "loss": 0.0184, "reward": 1.5687530040740967, "reward_std": 0.29624682664871216, "rewards/accuracy_reward_stage2": 0.5687530040740967, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1316 }, { "completion_length": 7.578125, "epoch": 0.23076923076923078, "grad_norm": 18.37249053762782, "kl": 0.0849609375, "learning_rate": 7.694059926406168e-07, "loss": -0.0102, "reward": 1.5225424766540527, "reward_std": 0.23137205839157104, "rewards/accuracy_reward_stage2": 0.5381674766540527, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1317 }, { "completion_length": 8.265625, "epoch": 0.23094445417907833, "grad_norm": 24.39965195019764, "kl": 0.10302734375, "learning_rate": 7.692307692307693e-07, "loss": 0.0412, "reward": 1.7179219722747803, "reward_std": 0.25506922602653503, "rewards/accuracy_reward_stage2": 0.7179219126701355, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1318 }, { "completion_length": 10.90625, "epoch": 0.23111967758892588, "grad_norm": 22.134302049761775, "kl": 0.10498046875, "learning_rate": 7.690555458209216e-07, "loss": -0.0022, "reward": 1.3819736242294312, "reward_std": 0.24998074769973755, "rewards/accuracy_reward_stage2": 0.3975986838340759, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1319 }, { "completion_length": 13.703125, "epoch": 0.23129490099877342, "grad_norm": 28.60472856199761, "kl": 0.048583984375, "learning_rate": 7.68880322411074e-07, "loss": 0.0194, "reward": 1.5254038572311401, "reward_std": 0.2446960210800171, "rewards/accuracy_reward_stage2": 0.5254038572311401, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1320 }, { "completion_length": 9.9375, "epoch": 0.231470124408621, "grad_norm": 13.988686696055257, "kl": 0.03271484375, "learning_rate": 7.687050990012265e-07, "loss": -0.0312, "reward": 1.84375, "reward_std": 0.1462520956993103, "rewards/accuracy_reward_stage2": 0.859375, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1321 }, { "completion_length": 11.578125, "epoch": 0.23164534781846854, "grad_norm": 18.07158760655491, "kl": 0.1171875, "learning_rate": 7.685298755913789e-07, "loss": 0.0033, "reward": 1.8339933156967163, "reward_std": 0.2281760275363922, "rewards/accuracy_reward_stage2": 0.8496183156967163, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1322 }, { "completion_length": 7.75, "epoch": 0.2318205712283161, "grad_norm": 26.39845456134637, "kl": 0.0947265625, "learning_rate": 7.683546521815314e-07, "loss": 0.0023, "reward": 1.6090365648269653, "reward_std": 0.2769169807434082, "rewards/accuracy_reward_stage2": 0.6246616244316101, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1323 }, { "completion_length": 9.8125, "epoch": 0.23199579463816367, "grad_norm": 22.387896866550953, "kl": 0.08544921875, "learning_rate": 7.681794287716839e-07, "loss": 0.0341, "reward": 1.5603952407836914, "reward_std": 0.253578782081604, "rewards/accuracy_reward_stage2": 0.5603952407836914, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1324 }, { "completion_length": 10.96875, "epoch": 0.2321710180480112, "grad_norm": 17.495621738171565, "kl": 0.01275634765625, "learning_rate": 7.680042053618363e-07, "loss": 0.0051, "reward": 1.5471508502960205, "reward_std": 0.09170855581760406, "rewards/accuracy_reward_stage2": 0.5471509099006653, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1325 }, { "completion_length": 7.453125, "epoch": 0.23234624145785876, "grad_norm": 20.347795944323746, "kl": 0.083984375, "learning_rate": 7.678289819519888e-07, "loss": 0.0336, "reward": 1.5921326875686646, "reward_std": 0.21103455126285553, "rewards/accuracy_reward_stage2": 0.7171327471733093, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1326 }, { "completion_length": 8.140625, "epoch": 0.23252146486770633, "grad_norm": 18.193345428480434, "kl": 0.1015625, "learning_rate": 7.676537585421412e-07, "loss": 0.0072, "reward": 1.6968261003494263, "reward_std": 0.2153952419757843, "rewards/accuracy_reward_stage2": 0.7124510407447815, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1327 }, { "completion_length": 11.46875, "epoch": 0.23269668827755388, "grad_norm": 26.928705144530184, "kl": 0.056640625, "learning_rate": 7.674785351322936e-07, "loss": 0.0227, "reward": 1.645999789237976, "reward_std": 0.19765979051589966, "rewards/accuracy_reward_stage2": 0.6459997892379761, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1328 }, { "completion_length": 11.828125, "epoch": 0.23287191168740143, "grad_norm": 31.28999988787986, "kl": 0.1591796875, "learning_rate": 7.673033117224461e-07, "loss": 0.03, "reward": 1.1927083730697632, "reward_std": 0.2777610421180725, "rewards/accuracy_reward_stage2": 0.4583333134651184, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1329 }, { "completion_length": 7.453125, "epoch": 0.233047135097249, "grad_norm": 12.114613846952071, "kl": 0.14453125, "learning_rate": 7.671280883125985e-07, "loss": 0.0579, "reward": 1.640345811843872, "reward_std": 0.20387586951255798, "rewards/accuracy_reward_stage2": 0.6403458118438721, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1330 }, { "completion_length": 6.4375, "epoch": 0.23322235850709655, "grad_norm": 36.1777020029657, "kl": 0.04833984375, "learning_rate": 7.66952864902751e-07, "loss": 0.0194, "reward": 1.5745203495025635, "reward_std": 0.12884950637817383, "rewards/accuracy_reward_stage2": 0.5745203495025635, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1331 }, { "completion_length": 5.765625, "epoch": 0.2333975819169441, "grad_norm": 21.644659188058338, "kl": 0.0390625, "learning_rate": 7.667776414929035e-07, "loss": 0.0156, "reward": 1.721125602722168, "reward_std": 0.21974197030067444, "rewards/accuracy_reward_stage2": 0.7211256623268127, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1332 }, { "completion_length": 8.046875, "epoch": 0.23357280532679167, "grad_norm": 21.40238993393703, "kl": 0.11376953125, "learning_rate": 7.666024180830558e-07, "loss": 0.0455, "reward": 1.632363200187683, "reward_std": 0.19989193975925446, "rewards/accuracy_reward_stage2": 0.6323632001876831, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1333 }, { "completion_length": 10.4375, "epoch": 0.23374802873663922, "grad_norm": 17.30648927448642, "kl": 0.1318359375, "learning_rate": 7.664271946732083e-07, "loss": 0.0529, "reward": 1.3868898153305054, "reward_std": 0.08634155243635178, "rewards/accuracy_reward_stage2": 0.6368898153305054, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1334 }, { "completion_length": 7.875, "epoch": 0.23392325214648677, "grad_norm": 17.350471710700887, "kl": 0.0625, "learning_rate": 7.662519712633607e-07, "loss": 0.025, "reward": 1.398812174797058, "reward_std": 0.14234712719917297, "rewards/accuracy_reward_stage2": 0.3988121747970581, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1335 }, { "completion_length": 12.3125, "epoch": 0.23409847555633434, "grad_norm": 3030.131469498715, "kl": 10.0, "learning_rate": 7.660767478535132e-07, "loss": 3.9625, "reward": 1.5227144956588745, "reward_std": 0.24410173296928406, "rewards/accuracy_reward_stage2": 0.5383394956588745, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1336 }, { "completion_length": 12.609375, "epoch": 0.2342736989661819, "grad_norm": 15.117023305589338, "kl": 0.07958984375, "learning_rate": 7.659015244436657e-07, "loss": -0.0124, "reward": 1.3785531520843506, "reward_std": 0.16757111251354218, "rewards/accuracy_reward_stage2": 0.5191780924797058, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1337 }, { "completion_length": 8.03125, "epoch": 0.23444892237602943, "grad_norm": 22.17685357158358, "kl": 0.036865234375, "learning_rate": 7.65726301033818e-07, "loss": 0.0147, "reward": 1.6041667461395264, "reward_std": 0.26043471693992615, "rewards/accuracy_reward_stage2": 0.6041666269302368, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1338 }, { "completion_length": 10.921875, "epoch": 0.23462414578587698, "grad_norm": 22.342387603619894, "kl": 0.177734375, "learning_rate": 7.655510776239705e-07, "loss": 0.0711, "reward": 1.3732129335403442, "reward_std": 0.24331887066364288, "rewards/accuracy_reward_stage2": 0.49821293354034424, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1339 }, { "completion_length": 21.734375, "epoch": 0.23479936919572456, "grad_norm": 16.78143151695997, "kl": 0.05615234375, "learning_rate": 7.65375854214123e-07, "loss": 0.0225, "reward": 1.254507064819336, "reward_std": 0.12991392612457275, "rewards/accuracy_reward_stage2": 0.3795071244239807, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1340 }, { "completion_length": 10.90625, "epoch": 0.2349745926055721, "grad_norm": 22.350945720327807, "kl": 0.0986328125, "learning_rate": 7.652006308042754e-07, "loss": 0.0396, "reward": 1.622948408126831, "reward_std": 0.242707759141922, "rewards/accuracy_reward_stage2": 0.622948408126831, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1341 }, { "completion_length": 9.578125, "epoch": 0.23514981601541965, "grad_norm": 9.096876079815528, "kl": 0.0308837890625, "learning_rate": 7.650254073944279e-07, "loss": 0.0124, "reward": 1.5761775970458984, "reward_std": 0.022409232333302498, "rewards/accuracy_reward_stage2": 0.7011775970458984, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1342 }, { "completion_length": 7.984375, "epoch": 0.23532503942526722, "grad_norm": 17.327402970574514, "kl": 0.10009765625, "learning_rate": 7.648501839845803e-07, "loss": -0.0041, "reward": 1.7431111335754395, "reward_std": 0.2279052436351776, "rewards/accuracy_reward_stage2": 0.7587360143661499, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1343 }, { "completion_length": 11.03125, "epoch": 0.23550026283511477, "grad_norm": 20.08448032414623, "kl": 0.10009765625, "learning_rate": 7.646749605747328e-07, "loss": 0.0066, "reward": 1.4869825839996338, "reward_std": 0.26277047395706177, "rewards/accuracy_reward_stage2": 0.5026075839996338, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1344 }, { "completion_length": 6.359375, "epoch": 0.23567548624496232, "grad_norm": 16.92679779383406, "kl": 0.078125, "learning_rate": 7.644997371648852e-07, "loss": -0.013, "reward": 1.6867897510528564, "reward_std": 0.15283793210983276, "rewards/accuracy_reward_stage2": 0.7024147510528564, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1345 }, { "completion_length": 10.765625, "epoch": 0.2358507096548099, "grad_norm": 14.929758046595353, "kl": 0.037109375, "learning_rate": 7.643245137550376e-07, "loss": -0.0293, "reward": 1.4871182441711426, "reward_std": 0.12831583619117737, "rewards/accuracy_reward_stage2": 0.5027433037757874, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1346 }, { "completion_length": 15.0625, "epoch": 0.23602593306465744, "grad_norm": 26.23246360003974, "kl": 0.052978515625, "learning_rate": 7.641492903451901e-07, "loss": 0.0211, "reward": 1.2774174213409424, "reward_std": 0.26563411951065063, "rewards/accuracy_reward_stage2": 0.4024173617362976, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1347 }, { "completion_length": 14.171875, "epoch": 0.236201156474505, "grad_norm": 26.44971956693815, "kl": 0.08056640625, "learning_rate": 7.639740669353425e-07, "loss": 0.0322, "reward": 1.3588321208953857, "reward_std": 0.3321700990200043, "rewards/accuracy_reward_stage2": 0.48383209109306335, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1348 }, { "completion_length": 6.609375, "epoch": 0.23637637988435256, "grad_norm": 17.11005799023166, "kl": 0.0849609375, "learning_rate": 7.637988435254949e-07, "loss": -0.0103, "reward": 1.85406494140625, "reward_std": 0.1711689531803131, "rewards/accuracy_reward_stage2": 0.8696897625923157, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1349 }, { "completion_length": 11.359375, "epoch": 0.2365516032942001, "grad_norm": 5.892884889471868, "kl": 0.0179443359375, "learning_rate": 7.636236201156474e-07, "loss": 0.0072, "reward": 1.640625, "reward_std": 0.04419417306780815, "rewards/accuracy_reward_stage2": 0.640625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1350 }, { "completion_length": 9.171875, "epoch": 0.23672682670404765, "grad_norm": 20.400597342361245, "kl": 0.0966796875, "learning_rate": 7.634483967057998e-07, "loss": 0.0386, "reward": 1.3030681610107422, "reward_std": 0.18530681729316711, "rewards/accuracy_reward_stage2": 0.5530681610107422, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1351 }, { "completion_length": 6.84375, "epoch": 0.23690205011389523, "grad_norm": 26.95413948025918, "kl": 0.150390625, "learning_rate": 7.632731732959523e-07, "loss": 0.0602, "reward": 1.445420742034912, "reward_std": 0.3342668116092682, "rewards/accuracy_reward_stage2": 0.5860457420349121, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1352 }, { "completion_length": 11.703125, "epoch": 0.23707727352374278, "grad_norm": 26.432871876528985, "kl": 0.057373046875, "learning_rate": 7.630979498861048e-07, "loss": 0.0229, "reward": 1.5638206005096436, "reward_std": 0.2039935439825058, "rewards/accuracy_reward_stage2": 0.5638206005096436, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1353 }, { "completion_length": 23.75, "epoch": 0.23725249693359032, "grad_norm": 24.686417389604138, "kl": 0.09716796875, "learning_rate": 7.629227264762572e-07, "loss": 0.0389, "reward": 1.6011197566986084, "reward_std": 0.13856875896453857, "rewards/accuracy_reward_stage2": 0.601119875907898, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1354 }, { "completion_length": 12.828125, "epoch": 0.23742772034343787, "grad_norm": 21.37557254466655, "kl": 0.064453125, "learning_rate": 7.627475030664097e-07, "loss": 0.0259, "reward": 1.537844181060791, "reward_std": 0.2930099666118622, "rewards/accuracy_reward_stage2": 0.5378442406654358, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1355 }, { "completion_length": 14.71875, "epoch": 0.23760294375328544, "grad_norm": 23.396625871669126, "kl": 0.251953125, "learning_rate": 7.625722796565622e-07, "loss": 0.062, "reward": 1.6967604160308838, "reward_std": 0.1177949458360672, "rewards/accuracy_reward_stage2": 0.8373852968215942, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1356 }, { "completion_length": 9.09375, "epoch": 0.237778167163133, "grad_norm": 17.31944187214885, "kl": 0.12255859375, "learning_rate": 7.623970562467146e-07, "loss": 0.0274, "reward": 1.5858019590377808, "reward_std": 0.09252850711345673, "rewards/accuracy_reward_stage2": 0.7264269590377808, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1357 }, { "completion_length": 7.328125, "epoch": 0.23795339057298054, "grad_norm": 30.80854639682047, "kl": 0.11474609375, "learning_rate": 7.622218328368669e-07, "loss": 0.0459, "reward": 1.5312702655792236, "reward_std": 0.3283507525920868, "rewards/accuracy_reward_stage2": 0.5312702655792236, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1358 }, { "completion_length": 14.46875, "epoch": 0.2381286139828281, "grad_norm": 23.954151165638518, "kl": 0.1103515625, "learning_rate": 7.620466094270193e-07, "loss": 0.0442, "reward": 1.4710814952850342, "reward_std": 0.2931392788887024, "rewards/accuracy_reward_stage2": 0.5960814952850342, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1359 }, { "completion_length": 6.703125, "epoch": 0.23830383739267566, "grad_norm": 17.772351813432653, "kl": 0.03369140625, "learning_rate": 7.618713860171718e-07, "loss": 0.0135, "reward": 1.5546684265136719, "reward_std": 0.2100159227848053, "rewards/accuracy_reward_stage2": 0.5546684265136719, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1360 }, { "completion_length": 14.09375, "epoch": 0.2384790608025232, "grad_norm": 26.058587993641353, "kl": 0.080078125, "learning_rate": 7.616961626073243e-07, "loss": 0.032, "reward": 1.7208819389343262, "reward_std": 0.2127433568239212, "rewards/accuracy_reward_stage2": 0.7208819389343262, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1361 }, { "completion_length": 5.625, "epoch": 0.23865428421237078, "grad_norm": 12.00886325535751, "kl": 0.035400390625, "learning_rate": 7.615209391974767e-07, "loss": 0.0142, "reward": 1.6302083730697632, "reward_std": 0.16082212328910828, "rewards/accuracy_reward_stage2": 0.6302083730697632, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1362 }, { "completion_length": 10.0625, "epoch": 0.23882950762221833, "grad_norm": 18.772731912132876, "kl": 0.0673828125, "learning_rate": 7.613457157876292e-07, "loss": 0.0269, "reward": 1.3479543924331665, "reward_std": 0.2074214518070221, "rewards/accuracy_reward_stage2": 0.47295433282852173, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1363 }, { "completion_length": 10.3125, "epoch": 0.23900473103206588, "grad_norm": 34.46678613000854, "kl": 0.11865234375, "learning_rate": 7.611704923777817e-07, "loss": 0.0475, "reward": 1.659529209136963, "reward_std": 0.219321608543396, "rewards/accuracy_reward_stage2": 0.6595291495323181, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1364 }, { "completion_length": 10.40625, "epoch": 0.23917995444191345, "grad_norm": 17.89859350913036, "kl": 0.049560546875, "learning_rate": 7.609952689679341e-07, "loss": 0.0198, "reward": 1.6767587661743164, "reward_std": 0.16518397629261017, "rewards/accuracy_reward_stage2": 0.6767587661743164, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1365 }, { "completion_length": 10.171875, "epoch": 0.239355177851761, "grad_norm": 19.233591594971617, "kl": 0.1337890625, "learning_rate": 7.608200455580866e-07, "loss": 0.0176, "reward": 1.380042314529419, "reward_std": 0.17864108085632324, "rewards/accuracy_reward_stage2": 0.3956674337387085, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1366 }, { "completion_length": 19.609375, "epoch": 0.23953040126160854, "grad_norm": 23.0646300156008, "kl": 0.1943359375, "learning_rate": 7.60644822148239e-07, "loss": 0.0336, "reward": 1.3207111358642578, "reward_std": 0.2439693659543991, "rewards/accuracy_reward_stage2": 0.46133607625961304, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1367 }, { "completion_length": 6.234375, "epoch": 0.23970562467145612, "grad_norm": 31.815325142177, "kl": 0.04248046875, "learning_rate": 7.604695987383914e-07, "loss": 0.017, "reward": 1.5883839130401611, "reward_std": 0.17570313811302185, "rewards/accuracy_reward_stage2": 0.5883838534355164, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1368 }, { "completion_length": 11.6875, "epoch": 0.23988084808130367, "grad_norm": 21.302564625440397, "kl": 0.11279296875, "learning_rate": 7.602943753285439e-07, "loss": 0.045, "reward": 1.4860469102859497, "reward_std": 0.26849985122680664, "rewards/accuracy_reward_stage2": 0.4860469698905945, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1369 }, { "completion_length": 10.890625, "epoch": 0.2400560714911512, "grad_norm": 24.77217627681895, "kl": 0.09228515625, "learning_rate": 7.601191519186963e-07, "loss": -0.0067, "reward": 1.4838837385177612, "reward_std": 0.2148168683052063, "rewards/accuracy_reward_stage2": 0.49950873851776123, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1370 }, { "completion_length": 9.28125, "epoch": 0.24023129490099876, "grad_norm": 17.564101825345425, "kl": 0.07275390625, "learning_rate": 7.599439285088487e-07, "loss": -0.0152, "reward": 1.560983657836914, "reward_std": 0.17692884802818298, "rewards/accuracy_reward_stage2": 0.5766085982322693, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1371 }, { "completion_length": 16.21875, "epoch": 0.24040651831084633, "grad_norm": 26.161195250356116, "kl": 0.0289306640625, "learning_rate": 7.597687050990011e-07, "loss": -0.0326, "reward": 1.6595048904418945, "reward_std": 0.16430558264255524, "rewards/accuracy_reward_stage2": 0.6751298308372498, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1372 }, { "completion_length": 10.515625, "epoch": 0.24058174172069388, "grad_norm": 18.986393795275184, "kl": 0.09375, "learning_rate": 7.595934816891536e-07, "loss": 0.0072, "reward": 1.644540548324585, "reward_std": 0.18388310074806213, "rewards/accuracy_reward_stage2": 0.6601656675338745, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1373 }, { "completion_length": 20.140625, "epoch": 0.24075696513054143, "grad_norm": 24.46065165314992, "kl": 0.1103515625, "learning_rate": 7.594182582793061e-07, "loss": 0.0441, "reward": 1.6360423564910889, "reward_std": 0.13997207581996918, "rewards/accuracy_reward_stage2": 0.6360422372817993, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1374 }, { "completion_length": 12.078125, "epoch": 0.240932188540389, "grad_norm": 33.907210977500775, "kl": 0.0947265625, "learning_rate": 7.592430348694585e-07, "loss": 0.0379, "reward": 1.600438117980957, "reward_std": 0.2929736375808716, "rewards/accuracy_reward_stage2": 0.6004381775856018, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1375 }, { "completion_length": 8.40625, "epoch": 0.24110741195023655, "grad_norm": 47.093239141282, "kl": 0.298828125, "learning_rate": 7.59067811459611e-07, "loss": 0.0861, "reward": 1.4094713926315308, "reward_std": 0.1661936342716217, "rewards/accuracy_reward_stage2": 0.5500965118408203, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1376 }, { "completion_length": 9.734375, "epoch": 0.2412826353600841, "grad_norm": 27.281382236546175, "kl": 0.0859375, "learning_rate": 7.588925880497635e-07, "loss": 0.0343, "reward": 1.6173287630081177, "reward_std": 0.25000786781311035, "rewards/accuracy_reward_stage2": 0.6173287630081177, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1377 }, { "completion_length": 18.359375, "epoch": 0.24145785876993167, "grad_norm": 25.230940558343345, "kl": 0.1435546875, "learning_rate": 7.587173646399158e-07, "loss": 0.0576, "reward": 1.389910340309143, "reward_std": 0.24498048424720764, "rewards/accuracy_reward_stage2": 0.5149103999137878, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1378 }, { "completion_length": 19.234375, "epoch": 0.24163308217977922, "grad_norm": 16.820536169236927, "kl": 0.01904296875, "learning_rate": 7.585421412300683e-07, "loss": 0.0076, "reward": 1.838128685951233, "reward_std": 0.15188559889793396, "rewards/accuracy_reward_stage2": 0.8381286859512329, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1379 }, { "completion_length": 13.34375, "epoch": 0.24180830558962677, "grad_norm": 32.13695119853367, "kl": 0.1474609375, "learning_rate": 7.583669178202208e-07, "loss": 0.0245, "reward": 1.5661542415618896, "reward_std": 0.20494423806667328, "rewards/accuracy_reward_stage2": 0.7067792415618896, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1380 }, { "completion_length": 13.828125, "epoch": 0.24198352899947434, "grad_norm": 25.714738880686244, "kl": 0.10986328125, "learning_rate": 7.581916944103732e-07, "loss": 0.0437, "reward": 1.4965579509735107, "reward_std": 0.2931256890296936, "rewards/accuracy_reward_stage2": 0.4965580105781555, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1381 }, { "completion_length": 16.625, "epoch": 0.2421587524093219, "grad_norm": 25.706780804110668, "kl": 0.1826171875, "learning_rate": 7.580164710005257e-07, "loss": 0.073, "reward": 1.5389642715454102, "reward_std": 0.20363156497478485, "rewards/accuracy_reward_stage2": 0.6639642715454102, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1382 }, { "completion_length": 13.140625, "epoch": 0.24233397581916943, "grad_norm": 22.98550952191299, "kl": 0.056640625, "learning_rate": 7.578412475906781e-07, "loss": 0.0226, "reward": 1.11177396774292, "reward_std": 0.140178382396698, "rewards/accuracy_reward_stage2": 0.3617740869522095, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1383 }, { "completion_length": 11.890625, "epoch": 0.242509199229017, "grad_norm": 11.553671486294625, "kl": 0.0208740234375, "learning_rate": 7.576660241808305e-07, "loss": 0.0083, "reward": 1.509393572807312, "reward_std": 0.061819229274988174, "rewards/accuracy_reward_stage2": 0.509393572807312, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1384 }, { "completion_length": 13.65625, "epoch": 0.24268442263886456, "grad_norm": 16.0570012110359, "kl": 0.0830078125, "learning_rate": 7.57490800770983e-07, "loss": -0.012, "reward": 1.3893593549728394, "reward_std": 0.1712827980518341, "rewards/accuracy_reward_stage2": 0.42060935497283936, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1385 }, { "completion_length": 10.484375, "epoch": 0.2428596460487121, "grad_norm": 21.494736968570507, "kl": 0.12158203125, "learning_rate": 7.573155773611354e-07, "loss": 0.0044, "reward": 1.6255983114242554, "reward_std": 0.26709023118019104, "rewards/accuracy_reward_stage2": 0.6412232518196106, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1386 }, { "completion_length": 11.96875, "epoch": 0.24303486945855968, "grad_norm": 24.6418008716554, "kl": 0.07958984375, "learning_rate": 7.571403539512879e-07, "loss": -0.0123, "reward": 1.5523234605789185, "reward_std": 0.3348226249217987, "rewards/accuracy_reward_stage2": 0.5679484009742737, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1387 }, { "completion_length": 10.203125, "epoch": 0.24321009286840722, "grad_norm": 16.023201165440923, "kl": 0.0966796875, "learning_rate": 7.569651305414402e-07, "loss": 0.0004, "reward": 1.6837525367736816, "reward_std": 0.1582503616809845, "rewards/accuracy_reward_stage2": 0.6993776559829712, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1388 }, { "completion_length": 6.203125, "epoch": 0.24338531627825477, "grad_norm": 21.374905539716238, "kl": 0.103515625, "learning_rate": 7.567899071315927e-07, "loss": -0.0381, "reward": 1.5039560794830322, "reward_std": 0.363511860370636, "rewards/accuracy_reward_stage2": 0.535206139087677, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1389 }, { "completion_length": 9.9375, "epoch": 0.24356053968810232, "grad_norm": 20.894175897950706, "kl": 0.072265625, "learning_rate": 7.566146837217452e-07, "loss": -0.0154, "reward": 1.888974666595459, "reward_std": 0.16132690012454987, "rewards/accuracy_reward_stage2": 0.9045996069908142, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1390 }, { "completion_length": 10.515625, "epoch": 0.2437357630979499, "grad_norm": 21.923229610078614, "kl": 0.1572265625, "learning_rate": 7.564394603118976e-07, "loss": 0.0628, "reward": 1.4778380393981934, "reward_std": 0.2118120789527893, "rewards/accuracy_reward_stage2": 0.6028379797935486, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1391 }, { "completion_length": 8.359375, "epoch": 0.24391098650779744, "grad_norm": 25.056572660788657, "kl": 0.11474609375, "learning_rate": 7.562642369020501e-07, "loss": 0.0459, "reward": 1.3547465801239014, "reward_std": 0.26677781343460083, "rewards/accuracy_reward_stage2": 0.3547465205192566, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1392 }, { "completion_length": 10.171875, "epoch": 0.244086209917645, "grad_norm": 19.10810836995206, "kl": 0.166015625, "learning_rate": 7.560890134922026e-07, "loss": 0.0118, "reward": 1.3078572750091553, "reward_std": 0.2675933539867401, "rewards/accuracy_reward_stage2": 0.4641074240207672, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1393 }, { "completion_length": 9.6875, "epoch": 0.24426143332749256, "grad_norm": 20.465538641356297, "kl": 0.0869140625, "learning_rate": 7.55913790082355e-07, "loss": 0.0059, "reward": 1.4665038585662842, "reward_std": 0.20319469273090363, "rewards/accuracy_reward_stage2": 0.6071288585662842, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1394 }, { "completion_length": 7.703125, "epoch": 0.2444366567373401, "grad_norm": 102.19387315630294, "kl": 0.0849609375, "learning_rate": 7.557385666725075e-07, "loss": 0.0228, "reward": 1.081196904182434, "reward_std": 0.2433708757162094, "rewards/accuracy_reward_stage2": 0.3311968743801117, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1395 }, { "completion_length": 7.875, "epoch": 0.24461188014718765, "grad_norm": 20.99816953808676, "kl": 0.056640625, "learning_rate": 7.555633432626598e-07, "loss": 0.0227, "reward": 1.4936261177062988, "reward_std": 0.24948295950889587, "rewards/accuracy_reward_stage2": 0.6186261773109436, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1396 }, { "completion_length": 9.28125, "epoch": 0.24478710355703523, "grad_norm": 20.247613253089263, "kl": 0.041748046875, "learning_rate": 7.553881198528122e-07, "loss": -0.0146, "reward": 1.596388816833496, "reward_std": 0.24301937222480774, "rewards/accuracy_reward_stage2": 0.6120138168334961, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1397 }, { "completion_length": 12.84375, "epoch": 0.24496232696688278, "grad_norm": 18.695765728375, "kl": 0.11767578125, "learning_rate": 7.552128964429647e-07, "loss": 0.0472, "reward": 1.503840684890747, "reward_std": 0.07957211136817932, "rewards/accuracy_reward_stage2": 0.6288406848907471, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1398 }, { "completion_length": 10.828125, "epoch": 0.24513755037673032, "grad_norm": 18.080296158546545, "kl": 0.06396484375, "learning_rate": 7.550376730331171e-07, "loss": 0.0256, "reward": 1.3729963302612305, "reward_std": 0.14897583425045013, "rewards/accuracy_reward_stage2": 0.4979962408542633, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1399 }, { "completion_length": 7.625, "epoch": 0.2453127737865779, "grad_norm": 15.46236736095634, "kl": 0.045654296875, "learning_rate": 7.548624496232696e-07, "loss": 0.0183, "reward": 1.765625, "reward_std": 0.1804211586713791, "rewards/accuracy_reward_stage2": 0.765625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1400 }, { "completion_length": 6.640625, "epoch": 0.24548799719642544, "grad_norm": 24.35220226987739, "kl": 0.083984375, "learning_rate": 7.546872262134221e-07, "loss": -0.0106, "reward": 1.58424711227417, "reward_std": 0.31146925687789917, "rewards/accuracy_reward_stage2": 0.5998721122741699, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1401 }, { "completion_length": 12.609375, "epoch": 0.245663220606273, "grad_norm": 12.630191645005317, "kl": 0.058349609375, "learning_rate": 7.545120028035745e-07, "loss": 0.0234, "reward": 1.3900747299194336, "reward_std": 0.11785703897476196, "rewards/accuracy_reward_stage2": 0.39007464051246643, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1402 }, { "completion_length": 9.984375, "epoch": 0.24583844401612057, "grad_norm": 19.35825390615252, "kl": 0.053955078125, "learning_rate": 7.54336779393727e-07, "loss": -0.0226, "reward": 1.5636982917785645, "reward_std": 0.2708263397216797, "rewards/accuracy_reward_stage2": 0.5793232321739197, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1403 }, { "completion_length": 9.625, "epoch": 0.2460136674259681, "grad_norm": 21.76949510719285, "kl": 0.037353515625, "learning_rate": 7.541615559838794e-07, "loss": 0.0149, "reward": 1.489206075668335, "reward_std": 0.16451683640480042, "rewards/accuracy_reward_stage2": 0.6142061948776245, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1404 }, { "completion_length": 9.046875, "epoch": 0.24618889083581566, "grad_norm": 60.765544867318496, "kl": 0.404296875, "learning_rate": 7.539863325740319e-07, "loss": 0.162, "reward": 1.5769790410995483, "reward_std": 0.16119015216827393, "rewards/accuracy_reward_stage2": 0.8269790410995483, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1405 }, { "completion_length": 19.171875, "epoch": 0.2463641142456632, "grad_norm": 27.58976702655693, "kl": 0.15234375, "learning_rate": 7.538111091641844e-07, "loss": 0.061, "reward": 1.7771577835083008, "reward_std": 0.20885083079338074, "rewards/accuracy_reward_stage2": 0.902157723903656, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1406 }, { "completion_length": 13.96875, "epoch": 0.24653933765551078, "grad_norm": 15.13368734660908, "kl": 0.0654296875, "learning_rate": 7.536358857543368e-07, "loss": -0.0586, "reward": 1.3415193557739258, "reward_std": 0.15027299523353577, "rewards/accuracy_reward_stage2": 0.37276941537857056, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1407 }, { "completion_length": 12.5, "epoch": 0.24671456106535833, "grad_norm": 18.73681435107374, "kl": 0.040771484375, "learning_rate": 7.534606623444892e-07, "loss": 0.0163, "reward": 1.562386155128479, "reward_std": 0.201747328042984, "rewards/accuracy_reward_stage2": 0.562386155128479, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1408 }, { "completion_length": 9.21875, "epoch": 0.24688978447520588, "grad_norm": 22.44108350092471, "kl": 0.044677734375, "learning_rate": 7.532854389346416e-07, "loss": 0.0179, "reward": 1.6416747570037842, "reward_std": 0.19642874598503113, "rewards/accuracy_reward_stage2": 0.7666747570037842, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1409 }, { "completion_length": 16.28125, "epoch": 0.24706500788505345, "grad_norm": 15.592344850737916, "kl": 0.1376953125, "learning_rate": 7.53110215524794e-07, "loss": 0.055, "reward": 1.1909624338150024, "reward_std": 0.13738305866718292, "rewards/accuracy_reward_stage2": 0.44096243381500244, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1410 }, { "completion_length": 17.859375, "epoch": 0.247240231294901, "grad_norm": 29.324014213943627, "kl": 0.244140625, "learning_rate": 7.529349921149465e-07, "loss": 0.0586, "reward": 1.564818024635315, "reward_std": 0.3653239607810974, "rewards/accuracy_reward_stage2": 0.7054431438446045, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1411 }, { "completion_length": 4.921875, "epoch": 0.24741545470474854, "grad_norm": 20.83423318481693, "kl": 0.1259765625, "learning_rate": 7.527597687050989e-07, "loss": 0.0377, "reward": 1.7137820720672607, "reward_std": 0.2640119194984436, "rewards/accuracy_reward_stage2": 0.7294071316719055, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1412 }, { "completion_length": 12.53125, "epoch": 0.24759067811459612, "grad_norm": 18.45961976763546, "kl": 0.06396484375, "learning_rate": 7.525845452952514e-07, "loss": 0.0255, "reward": 1.313084363937378, "reward_std": 0.25324708223342896, "rewards/accuracy_reward_stage2": 0.4380842447280884, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1413 }, { "completion_length": 12.421875, "epoch": 0.24776590152444367, "grad_norm": 36.03340900309115, "kl": 0.049072265625, "learning_rate": 7.524093218854039e-07, "loss": 0.0196, "reward": 1.501349925994873, "reward_std": 0.25983327627182007, "rewards/accuracy_reward_stage2": 0.5013498663902283, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1414 }, { "completion_length": 12.8125, "epoch": 0.2479411249342912, "grad_norm": 32.04826873057793, "kl": 0.2109375, "learning_rate": 7.522340984755563e-07, "loss": 0.0842, "reward": 1.3114793300628662, "reward_std": 0.19367793202400208, "rewards/accuracy_reward_stage2": 0.5614794492721558, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1415 }, { "completion_length": 6.90625, "epoch": 0.2481163483441388, "grad_norm": 17.831093251512257, "kl": 0.03271484375, "learning_rate": 7.520588750657088e-07, "loss": 0.0131, "reward": 1.59375, "reward_std": 0.3255898952484131, "rewards/accuracy_reward_stage2": 0.71875, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1416 }, { "completion_length": 12.8125, "epoch": 0.24829157175398633, "grad_norm": 17.810874882373575, "kl": 0.271484375, "learning_rate": 7.518836516558613e-07, "loss": 0.0648, "reward": 1.532361626625061, "reward_std": 0.21110737323760986, "rewards/accuracy_reward_stage2": 0.6729865670204163, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1417 }, { "completion_length": 9.84375, "epoch": 0.24846679516383388, "grad_norm": 15.057571439227704, "kl": 0.0927734375, "learning_rate": 7.517084282460136e-07, "loss": 0.007, "reward": 1.6341720819473267, "reward_std": 0.13025765120983124, "rewards/accuracy_reward_stage2": 0.6497971415519714, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1418 }, { "completion_length": 11.65625, "epoch": 0.24864201857368146, "grad_norm": 14.375263032052567, "kl": 0.0308837890625, "learning_rate": 7.515332048361661e-07, "loss": 0.0123, "reward": 1.642590880393982, "reward_std": 0.10421305894851685, "rewards/accuracy_reward_stage2": 0.7675908803939819, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1419 }, { "completion_length": 15.3125, "epoch": 0.248817241983529, "grad_norm": 23.254299329378373, "kl": 0.04541015625, "learning_rate": 7.513579814263185e-07, "loss": 0.0182, "reward": 1.7045319080352783, "reward_std": 0.18549993634223938, "rewards/accuracy_reward_stage2": 0.7045319080352783, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1420 }, { "completion_length": 9.3125, "epoch": 0.24899246539337655, "grad_norm": 13.079817489563679, "kl": 0.03515625, "learning_rate": 7.51182758016471e-07, "loss": -0.0184, "reward": 1.5532286167144775, "reward_std": 0.10524085909128189, "rewards/accuracy_reward_stage2": 0.5688536167144775, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1421 }, { "completion_length": 10.25, "epoch": 0.24916768880322412, "grad_norm": 23.29879296353814, "kl": 0.0732421875, "learning_rate": 7.510075346066234e-07, "loss": 0.0014, "reward": 1.4815268516540527, "reward_std": 0.29361575841903687, "rewards/accuracy_reward_stage2": 0.4971519112586975, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1422 }, { "completion_length": 10.40625, "epoch": 0.24934291221307167, "grad_norm": 21.83091816412804, "kl": 0.07763671875, "learning_rate": 7.508323111967758e-07, "loss": 0.0311, "reward": 1.6659326553344727, "reward_std": 0.23042136430740356, "rewards/accuracy_reward_stage2": 0.6659327149391174, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1423 }, { "completion_length": 7.125, "epoch": 0.24951813562291922, "grad_norm": 14.444543879346396, "kl": 0.1572265625, "learning_rate": 7.506570877869283e-07, "loss": 0.0629, "reward": 1.6293728351593018, "reward_std": 0.1331539899110794, "rewards/accuracy_reward_stage2": 0.6293728351593018, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1424 }, { "completion_length": 11.453125, "epoch": 0.24969335903276677, "grad_norm": 18.51638467771239, "kl": 0.03515625, "learning_rate": 7.504818643770808e-07, "loss": -0.0301, "reward": 1.515625, "reward_std": 0.20569033920764923, "rewards/accuracy_reward_stage2": 0.53125, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1425 }, { "completion_length": 8.5, "epoch": 0.24986858244261434, "grad_norm": 11.394857987905583, "kl": 0.07275390625, "learning_rate": 7.503066409672332e-07, "loss": 0.0291, "reward": 1.5371688604354858, "reward_std": 0.10468746721744537, "rewards/accuracy_reward_stage2": 0.5371688008308411, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1426 }, { "completion_length": 11.859375, "epoch": 0.2500438058524619, "grad_norm": 19.613433131991332, "kl": 0.09814453125, "learning_rate": 7.501314175573856e-07, "loss": 0.0393, "reward": 1.4760699272155762, "reward_std": 0.13601933419704437, "rewards/accuracy_reward_stage2": 0.4760698676109314, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1427 }, { "completion_length": 9.953125, "epoch": 0.25021902926230943, "grad_norm": 18.820985905928033, "kl": 0.2041015625, "learning_rate": 7.49956194147538e-07, "loss": 0.0815, "reward": 1.6965408325195312, "reward_std": 0.18144288659095764, "rewards/accuracy_reward_stage2": 0.6965407729148865, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1428 }, { "completion_length": 10.0625, "epoch": 0.250394252672157, "grad_norm": 13.106608164948959, "kl": 0.0242919921875, "learning_rate": 7.497809707376905e-07, "loss": 0.0097, "reward": 1.4696911573410034, "reward_std": 0.07452090829610825, "rewards/accuracy_reward_stage2": 0.4696912169456482, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1429 }, { "completion_length": 12.984375, "epoch": 0.2505694760820046, "grad_norm": 17.266848797712317, "kl": 0.044677734375, "learning_rate": 7.49605747327843e-07, "loss": 0.0179, "reward": 1.7010854482650757, "reward_std": 0.13501934707164764, "rewards/accuracy_reward_stage2": 0.7010855078697205, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1430 }, { "completion_length": 6.875, "epoch": 0.25074469949185213, "grad_norm": 21.6197429247069, "kl": 0.203125, "learning_rate": 7.494305239179954e-07, "loss": -0.0074, "reward": 1.5226540565490723, "reward_std": 0.29842448234558105, "rewards/accuracy_reward_stage2": 0.5539040565490723, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1431 }, { "completion_length": 8.390625, "epoch": 0.2509199229016997, "grad_norm": 28.866050092959014, "kl": 0.042236328125, "learning_rate": 7.492553005081479e-07, "loss": 0.0169, "reward": 1.445624828338623, "reward_std": 0.173716202378273, "rewards/accuracy_reward_stage2": 0.5706248879432678, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1432 }, { "completion_length": 10.46875, "epoch": 0.2510951463115472, "grad_norm": 27.56143086547294, "kl": 0.10107421875, "learning_rate": 7.490800770983004e-07, "loss": 0.0211, "reward": 1.434044361114502, "reward_std": 0.30634164810180664, "rewards/accuracy_reward_stage2": 0.4496694803237915, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1433 }, { "completion_length": 13.015625, "epoch": 0.25127036972139477, "grad_norm": 73.54643410324547, "kl": 0.5546875, "learning_rate": 7.489048536884528e-07, "loss": 0.1939, "reward": 1.631661295890808, "reward_std": 0.22395049035549164, "rewards/accuracy_reward_stage2": 0.7722861766815186, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1434 }, { "completion_length": 5.46875, "epoch": 0.2514455931312423, "grad_norm": 21.097042169226135, "kl": 0.2431640625, "learning_rate": 7.487296302786052e-07, "loss": 0.0198, "reward": 1.3362268209457397, "reward_std": 0.21675795316696167, "rewards/accuracy_reward_stage2": 0.36747682094573975, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1435 }, { "completion_length": 22.015625, "epoch": 0.25162081654108986, "grad_norm": 24.83305083490462, "kl": 0.06689453125, "learning_rate": 7.485544068687576e-07, "loss": 0.0268, "reward": 1.452586054801941, "reward_std": 0.22090375423431396, "rewards/accuracy_reward_stage2": 0.45258599519729614, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1436 }, { "completion_length": 12.609375, "epoch": 0.25179603995093747, "grad_norm": 17.161128543951413, "kl": 0.09912109375, "learning_rate": 7.4837918345891e-07, "loss": -0.0045, "reward": 1.6552212238311768, "reward_std": 0.17116889357566833, "rewards/accuracy_reward_stage2": 0.670846164226532, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1437 }, { "completion_length": 9.234375, "epoch": 0.251971263360785, "grad_norm": 19.6058479034139, "kl": 0.09619140625, "learning_rate": 7.482039600490625e-07, "loss": 0.0384, "reward": 1.6015892028808594, "reward_std": 0.22905901074409485, "rewards/accuracy_reward_stage2": 0.6015892624855042, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1438 }, { "completion_length": 13.65625, "epoch": 0.25214648677063256, "grad_norm": 27.10964415369589, "kl": 0.119140625, "learning_rate": 7.480287366392149e-07, "loss": 0.0297, "reward": 1.3641960620880127, "reward_std": 0.28837230801582336, "rewards/accuracy_reward_stage2": 0.5048211812973022, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1439 }, { "completion_length": 23.796875, "epoch": 0.2523217101804801, "grad_norm": 21.790024565848892, "kl": 0.10498046875, "learning_rate": 7.478535132293674e-07, "loss": -0.0436, "reward": 1.3245376348495483, "reward_std": 0.2398291975259781, "rewards/accuracy_reward_stage2": 0.35578760504722595, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1440 }, { "completion_length": 13.921875, "epoch": 0.25249693359032765, "grad_norm": 25.97179485729417, "kl": 0.25390625, "learning_rate": 7.476782898195199e-07, "loss": 0.0571, "reward": 1.390436053276062, "reward_std": 0.31526440382003784, "rewards/accuracy_reward_stage2": 0.5310611128807068, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1441 }, { "completion_length": 7.203125, "epoch": 0.2526721570001752, "grad_norm": 22.677557170391783, "kl": 0.1640625, "learning_rate": 7.475030664096723e-07, "loss": -0.0452, "reward": 1.5565773248672485, "reward_std": 0.3190111517906189, "rewards/accuracy_reward_stage2": 0.6034523248672485, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1442 }, { "completion_length": 14.625, "epoch": 0.2528473804100228, "grad_norm": 17.705483157782556, "kl": 0.09326171875, "learning_rate": 7.473278429998248e-07, "loss": 0.0374, "reward": 1.362941026687622, "reward_std": 0.1572730839252472, "rewards/accuracy_reward_stage2": 0.36294102668762207, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1443 }, { "completion_length": 10.171875, "epoch": 0.25302260381987035, "grad_norm": 17.715526509646416, "kl": 0.0186767578125, "learning_rate": 7.471526195899772e-07, "loss": 0.0075, "reward": 1.6852679252624512, "reward_std": 0.21711409091949463, "rewards/accuracy_reward_stage2": 0.6852678656578064, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1444 }, { "completion_length": 8.96875, "epoch": 0.2531978272297179, "grad_norm": 21.70342349079483, "kl": 0.046875, "learning_rate": 7.469773961801297e-07, "loss": 0.0188, "reward": 1.4218826293945312, "reward_std": 0.14263302087783813, "rewards/accuracy_reward_stage2": 0.5468826293945312, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1445 }, { "completion_length": 7.765625, "epoch": 0.25337305063956544, "grad_norm": 49.26411936487985, "kl": 0.0189208984375, "learning_rate": 7.468021727702822e-07, "loss": -0.013, "reward": 1.8323465585708618, "reward_std": 0.1732073575258255, "rewards/accuracy_reward_stage2": 0.8479715585708618, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1446 }, { "completion_length": 13.59375, "epoch": 0.253548274049413, "grad_norm": 22.701810094153558, "kl": 0.33984375, "learning_rate": 7.466269493604344e-07, "loss": 0.0922, "reward": 1.13534414768219, "reward_std": 0.25967109203338623, "rewards/accuracy_reward_stage2": 0.4009692072868347, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1447 }, { "completion_length": 14.109375, "epoch": 0.25372349745926054, "grad_norm": 15.881379556902235, "kl": 0.08056640625, "learning_rate": 7.464517259505869e-07, "loss": 0.0104, "reward": 1.5367052555084229, "reward_std": 0.11349460482597351, "rewards/accuracy_reward_stage2": 0.6773301362991333, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1448 }, { "completion_length": 11.265625, "epoch": 0.2538987208691081, "grad_norm": 26.014993849480607, "kl": 0.1591796875, "learning_rate": 7.462765025407393e-07, "loss": 0.0636, "reward": 1.4904820919036865, "reward_std": 0.25210410356521606, "rewards/accuracy_reward_stage2": 0.6154820322990417, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1449 }, { "completion_length": 13.640625, "epoch": 0.2540739442789557, "grad_norm": 21.598784966647813, "kl": 0.09716796875, "learning_rate": 7.461012791308918e-07, "loss": 0.0106, "reward": 1.38657546043396, "reward_std": 0.1828770488500595, "rewards/accuracy_reward_stage2": 0.51157546043396, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1450 }, { "completion_length": 9.828125, "epoch": 0.25424916768880323, "grad_norm": 20.03192383804041, "kl": 0.06787109375, "learning_rate": 7.459260557210443e-07, "loss": 0.0272, "reward": 1.3679254055023193, "reward_std": 0.20850923657417297, "rewards/accuracy_reward_stage2": 0.3679255247116089, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1451 }, { "completion_length": 9.640625, "epoch": 0.2544243910986508, "grad_norm": 19.753770296880482, "kl": 0.0263671875, "learning_rate": 7.457508323111967e-07, "loss": 0.0105, "reward": 1.7602910995483398, "reward_std": 0.14263275265693665, "rewards/accuracy_reward_stage2": 0.7602912187576294, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1452 }, { "completion_length": 10.078125, "epoch": 0.25459961450849833, "grad_norm": 19.34286584256072, "kl": 0.0849609375, "learning_rate": 7.455756089013492e-07, "loss": -0.0017, "reward": 1.4176316261291504, "reward_std": 0.205317884683609, "rewards/accuracy_reward_stage2": 0.5582566261291504, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1453 }, { "completion_length": 9.25, "epoch": 0.2547748379183459, "grad_norm": 19.34879254141021, "kl": 0.0869140625, "learning_rate": 7.454003854915017e-07, "loss": 0.0348, "reward": 1.580161690711975, "reward_std": 0.18217667937278748, "rewards/accuracy_reward_stage2": 0.5801616907119751, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1454 }, { "completion_length": 18.734375, "epoch": 0.2549500613281934, "grad_norm": 39.93095842420503, "kl": 0.19921875, "learning_rate": 7.452251620816541e-07, "loss": 0.0794, "reward": 1.459136724472046, "reward_std": 0.3002258241176605, "rewards/accuracy_reward_stage2": 0.5841366052627563, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1455 }, { "completion_length": 16.375, "epoch": 0.255125284738041, "grad_norm": 22.736610580425168, "kl": 0.0269775390625, "learning_rate": 7.450499386718066e-07, "loss": 0.0011, "reward": 1.651605248451233, "reward_std": 0.13762909173965454, "rewards/accuracy_reward_stage2": 0.6672303080558777, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1456 }, { "completion_length": 11.953125, "epoch": 0.25530050814788857, "grad_norm": 30.042272676858413, "kl": 0.11279296875, "learning_rate": 7.448747152619589e-07, "loss": 0.0009, "reward": 1.6088519096374512, "reward_std": 0.19328728318214417, "rewards/accuracy_reward_stage2": 0.6244767904281616, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1457 }, { "completion_length": 10.046875, "epoch": 0.2554757315577361, "grad_norm": 22.167031177976327, "kl": 0.0537109375, "learning_rate": 7.446994918521114e-07, "loss": 0.0215, "reward": 1.6259219646453857, "reward_std": 0.15675854682922363, "rewards/accuracy_reward_stage2": 0.6259219646453857, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1458 }, { "completion_length": 23.0625, "epoch": 0.25565095496758367, "grad_norm": 21.075811345440805, "kl": 0.038330078125, "learning_rate": 7.445242684422639e-07, "loss": 0.0153, "reward": 1.518410325050354, "reward_std": 0.11710774898529053, "rewards/accuracy_reward_stage2": 0.5184102654457092, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1459 }, { "completion_length": 18.875, "epoch": 0.2558261783774312, "grad_norm": 14.518518964574008, "kl": 0.0791015625, "learning_rate": 7.443490450324162e-07, "loss": -0.0087, "reward": 1.4062702655792236, "reward_std": 0.11130297183990479, "rewards/accuracy_reward_stage2": 0.42189526557922363, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1460 }, { "completion_length": 12.015625, "epoch": 0.25600140178727876, "grad_norm": 28.656267793463243, "kl": 0.0771484375, "learning_rate": 7.441738216225687e-07, "loss": -0.0133, "reward": 1.7806577682495117, "reward_std": 0.26290401816368103, "rewards/accuracy_reward_stage2": 0.7962826490402222, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1461 }, { "completion_length": 7.796875, "epoch": 0.25617662519712636, "grad_norm": 11.394494329527667, "kl": 0.0169677734375, "learning_rate": 7.439985982127212e-07, "loss": -0.0262, "reward": 1.609375, "reward_std": 0.15981829166412354, "rewards/accuracy_reward_stage2": 0.625, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1462 }, { "completion_length": 10.828125, "epoch": 0.2563518486069739, "grad_norm": 18.163585583393655, "kl": 0.0625, "learning_rate": 7.438233748028736e-07, "loss": -0.0481, "reward": 1.711039423942566, "reward_std": 0.25919869542121887, "rewards/accuracy_reward_stage2": 0.7422893643379211, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1463 }, { "completion_length": 5.53125, "epoch": 0.25652707201682146, "grad_norm": 17.824082159016235, "kl": 0.03759765625, "learning_rate": 7.436481513930261e-07, "loss": 0.015, "reward": 1.53125, "reward_std": 0.23356688022613525, "rewards/accuracy_reward_stage2": 0.65625, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1464 }, { "completion_length": 11.59375, "epoch": 0.256702295426669, "grad_norm": 21.450314735509018, "kl": 0.11181640625, "learning_rate": 7.434729279831785e-07, "loss": 0.0034, "reward": 1.5948938131332397, "reward_std": 0.2250853031873703, "rewards/accuracy_reward_stage2": 0.8605188727378845, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1465 }, { "completion_length": 15.875, "epoch": 0.25687751883651655, "grad_norm": 27.628838534330345, "kl": 0.2099609375, "learning_rate": 7.43297704573331e-07, "loss": 0.0837, "reward": 1.5314993858337402, "reward_std": 0.15965047478675842, "rewards/accuracy_reward_stage2": 0.6564993858337402, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1466 }, { "completion_length": 10.0, "epoch": 0.2570527422463641, "grad_norm": 22.88498429016478, "kl": 0.11572265625, "learning_rate": 7.431224811634834e-07, "loss": 0.0461, "reward": 1.4264659881591797, "reward_std": 0.2359931915998459, "rewards/accuracy_reward_stage2": 0.4264659583568573, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1467 }, { "completion_length": 8.625, "epoch": 0.25722796565621164, "grad_norm": 28.53875785233994, "kl": 0.2080078125, "learning_rate": 7.429472577536358e-07, "loss": 0.0521, "reward": 1.5848909616470337, "reward_std": 0.30258065462112427, "rewards/accuracy_reward_stage2": 0.6005159616470337, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1468 }, { "completion_length": 12.375, "epoch": 0.25740318906605925, "grad_norm": 20.73955273991918, "kl": 0.1123046875, "learning_rate": 7.427720343437883e-07, "loss": 0.0448, "reward": 1.2760417461395264, "reward_std": 0.19150808453559875, "rewards/accuracy_reward_stage2": 0.4010416567325592, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1469 }, { "completion_length": 8.703125, "epoch": 0.2575784124759068, "grad_norm": 20.030808607531387, "kl": 0.220703125, "learning_rate": 7.425968109339408e-07, "loss": 0.0609, "reward": 1.2748761177062988, "reward_std": 0.2619929611682892, "rewards/accuracy_reward_stage2": 0.5405011773109436, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1470 }, { "completion_length": 14.65625, "epoch": 0.25775363588575434, "grad_norm": 20.08715223958517, "kl": 0.0625, "learning_rate": 7.424215875240932e-07, "loss": 0.025, "reward": 1.4923529624938965, "reward_std": 0.14947402477264404, "rewards/accuracy_reward_stage2": 0.49235299229621887, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1471 }, { "completion_length": 11.09375, "epoch": 0.2579288592956019, "grad_norm": 25.864038997006453, "kl": 0.12060546875, "learning_rate": 7.422463641142457e-07, "loss": 0.0483, "reward": 1.5058326721191406, "reward_std": 0.22785821557044983, "rewards/accuracy_reward_stage2": 0.6308326721191406, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1472 }, { "completion_length": 14.640625, "epoch": 0.25810408270544943, "grad_norm": 19.42238006236761, "kl": 0.1748046875, "learning_rate": 7.42071140704398e-07, "loss": 0.07, "reward": 1.171360969543457, "reward_std": 0.22907811403274536, "rewards/accuracy_reward_stage2": 0.42136093974113464, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1473 }, { "completion_length": 11.9375, "epoch": 0.258279306115297, "grad_norm": 57.78994671492662, "kl": 0.34375, "learning_rate": 7.418959172945505e-07, "loss": 0.0761, "reward": 1.6343741416931152, "reward_std": 0.21707776188850403, "rewards/accuracy_reward_stage2": 0.79062420129776, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1474 }, { "completion_length": 15.828125, "epoch": 0.2584545295251446, "grad_norm": 21.16854422932251, "kl": 0.359375, "learning_rate": 7.41720693884703e-07, "loss": 0.1228, "reward": 1.4675710201263428, "reward_std": 0.11915571242570877, "rewards/accuracy_reward_stage2": 0.7175710797309875, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1475 }, { "completion_length": 7.265625, "epoch": 0.25862975293499213, "grad_norm": 18.533242185784058, "kl": 0.024169921875, "learning_rate": 7.415454704748554e-07, "loss": 0.0097, "reward": 1.8850057125091553, "reward_std": 0.0957728922367096, "rewards/accuracy_reward_stage2": 0.8850055932998657, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1476 }, { "completion_length": 11.4375, "epoch": 0.2588049763448397, "grad_norm": 21.933109256368923, "kl": 0.11328125, "learning_rate": 7.413702470650078e-07, "loss": -0.0156, "reward": 1.615840196609497, "reward_std": 0.32640892267227173, "rewards/accuracy_reward_stage2": 0.6470901966094971, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1477 }, { "completion_length": 9.671875, "epoch": 0.2589801997546872, "grad_norm": 20.716459022775172, "kl": 0.06591796875, "learning_rate": 7.411950236551603e-07, "loss": -0.0179, "reward": 1.5069777965545654, "reward_std": 0.22965574264526367, "rewards/accuracy_reward_stage2": 0.5226027965545654, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1478 }, { "completion_length": 14.640625, "epoch": 0.25915542316453477, "grad_norm": 21.596170252166306, "kl": 0.11376953125, "learning_rate": 7.410198002453127e-07, "loss": 0.0455, "reward": 1.479250192642212, "reward_std": 0.15525218844413757, "rewards/accuracy_reward_stage2": 0.6042501330375671, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1479 }, { "completion_length": 20.0, "epoch": 0.2593306465743823, "grad_norm": 22.154253712235363, "kl": 0.126953125, "learning_rate": 7.408445768354652e-07, "loss": 0.0071, "reward": 1.4506645202636719, "reward_std": 0.3448774814605713, "rewards/accuracy_reward_stage2": 0.4662895202636719, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1480 }, { "completion_length": 9.828125, "epoch": 0.2595058699842299, "grad_norm": 19.49886095101816, "kl": 0.15234375, "learning_rate": 7.406693534256176e-07, "loss": -0.0273, "reward": 1.681386947631836, "reward_std": 0.26997825503349304, "rewards/accuracy_reward_stage2": 0.7126370072364807, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1481 }, { "completion_length": 9.328125, "epoch": 0.25968109339407747, "grad_norm": 15.037718624074698, "kl": 0.057861328125, "learning_rate": 7.404941300157701e-07, "loss": 0.0231, "reward": 1.8315874338150024, "reward_std": 0.16824200749397278, "rewards/accuracy_reward_stage2": 0.8315874338150024, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1482 }, { "completion_length": 12.15625, "epoch": 0.259856316803925, "grad_norm": 22.13869404334615, "kl": 0.09423828125, "learning_rate": 7.403189066059226e-07, "loss": 0.0377, "reward": 1.4400691986083984, "reward_std": 0.10665388405323029, "rewards/accuracy_reward_stage2": 0.5650691986083984, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1483 }, { "completion_length": 6.875, "epoch": 0.26003154021377256, "grad_norm": 18.15221953566913, "kl": 0.11328125, "learning_rate": 7.40143683196075e-07, "loss": 0.001, "reward": 1.7418863773345947, "reward_std": 0.21698671579360962, "rewards/accuracy_reward_stage2": 0.8825114369392395, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1484 }, { "completion_length": 7.78125, "epoch": 0.2602067636236201, "grad_norm": 6.688679135897942, "kl": 0.052734375, "learning_rate": 7.399684597862275e-07, "loss": -0.0144, "reward": 1.5801225900650024, "reward_std": 0.05681667849421501, "rewards/accuracy_reward_stage2": 0.7207475900650024, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1485 }, { "completion_length": 11.15625, "epoch": 0.26038198703346765, "grad_norm": 22.284164463542524, "kl": 0.05859375, "learning_rate": 7.397932363763799e-07, "loss": 0.0171, "reward": 1.3674495220184326, "reward_std": 0.2036455124616623, "rewards/accuracy_reward_stage2": 0.4924495220184326, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1486 }, { "completion_length": 12.984375, "epoch": 0.2605572104433152, "grad_norm": 12.447484630901936, "kl": 0.0274658203125, "learning_rate": 7.396180129665322e-07, "loss": -0.0179, "reward": 1.6566715240478516, "reward_std": 0.08770053833723068, "rewards/accuracy_reward_stage2": 0.6722966432571411, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1487 }, { "completion_length": 7.859375, "epoch": 0.2607324338531628, "grad_norm": 25.099938792494854, "kl": 0.1435546875, "learning_rate": 7.394427895566847e-07, "loss": 0.0574, "reward": 1.4594025611877441, "reward_std": 0.24416542053222656, "rewards/accuracy_reward_stage2": 0.5844025015830994, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1488 }, { "completion_length": 17.625, "epoch": 0.26090765726301035, "grad_norm": 21.788132997945162, "kl": 0.296875, "learning_rate": 7.392675661468371e-07, "loss": 0.0749, "reward": 1.1694362163543701, "reward_std": 0.19415882229804993, "rewards/accuracy_reward_stage2": 0.4350612759590149, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1489 }, { "completion_length": 12.6875, "epoch": 0.2610828806728579, "grad_norm": 24.197340000226113, "kl": 0.208984375, "learning_rate": 7.390923427369896e-07, "loss": 0.0392, "reward": 1.5895700454711914, "reward_std": 0.33698683977127075, "rewards/accuracy_reward_stage2": 0.605195164680481, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1490 }, { "completion_length": 8.015625, "epoch": 0.26125810408270544, "grad_norm": 23.261036424611014, "kl": 0.109375, "learning_rate": 7.389171193271421e-07, "loss": 0.0437, "reward": 1.7723209857940674, "reward_std": 0.23626047372817993, "rewards/accuracy_reward_stage2": 0.7723209857940674, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1491 }, { "completion_length": 8.5625, "epoch": 0.261433327492553, "grad_norm": 21.332989290106536, "kl": 0.1494140625, "learning_rate": 7.387418959172945e-07, "loss": -0.0579, "reward": 1.338010311126709, "reward_std": 0.3674984872341156, "rewards/accuracy_reward_stage2": 0.384885311126709, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1492 }, { "completion_length": 9.328125, "epoch": 0.26160855090240054, "grad_norm": 25.19229994115063, "kl": 0.16015625, "learning_rate": 7.38566672507447e-07, "loss": 0.0641, "reward": 1.725899338722229, "reward_std": 0.33280277252197266, "rewards/accuracy_reward_stage2": 0.725899338722229, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1493 }, { "completion_length": 7.875, "epoch": 0.26178377431224814, "grad_norm": 19.955869600132683, "kl": 0.0869140625, "learning_rate": 7.383914490975995e-07, "loss": 0.0346, "reward": 1.4154143333435059, "reward_std": 0.22046023607254028, "rewards/accuracy_reward_stage2": 0.41541436314582825, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1494 }, { "completion_length": 6.15625, "epoch": 0.2619589977220957, "grad_norm": 17.110046537530014, "kl": 0.091796875, "learning_rate": 7.382162256877519e-07, "loss": -0.029, "reward": 1.28125, "reward_std": 0.26409146189689636, "rewards/accuracy_reward_stage2": 0.5625, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 1495 }, { "completion_length": 14.171875, "epoch": 0.26213422113194323, "grad_norm": 20.595751893456807, "kl": 0.376953125, "learning_rate": 7.380410022779044e-07, "loss": 0.1507, "reward": 1.3054620027542114, "reward_std": 0.13554486632347107, "rewards/accuracy_reward_stage2": 0.43046194314956665, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1496 }, { "completion_length": 11.078125, "epoch": 0.2623094445417908, "grad_norm": 26.140411911761632, "kl": 0.126953125, "learning_rate": 7.378657788680567e-07, "loss": -0.0612, "reward": 1.340702772140503, "reward_std": 0.2606419324874878, "rewards/accuracy_reward_stage2": 0.38757771253585815, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1497 }, { "completion_length": 8.953125, "epoch": 0.26248466795163833, "grad_norm": 20.82795983912578, "kl": 0.0615234375, "learning_rate": 7.376905554582091e-07, "loss": -0.0043, "reward": 1.5155892372131348, "reward_std": 0.20563608407974243, "rewards/accuracy_reward_stage2": 0.5312142968177795, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1498 }, { "completion_length": 9.796875, "epoch": 0.2626598913614859, "grad_norm": 33.9523855883196, "kl": 0.080078125, "learning_rate": 7.375153320483616e-07, "loss": 0.0319, "reward": 1.6113669872283936, "reward_std": 0.2674233317375183, "rewards/accuracy_reward_stage2": 0.6113669276237488, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1499 }, { "completion_length": 8.234375, "epoch": 0.2628351147713335, "grad_norm": 24.67586190521906, "kl": 0.1005859375, "learning_rate": 7.37340108638514e-07, "loss": 0.0404, "reward": 1.703125, "reward_std": 0.16887325048446655, "rewards/accuracy_reward_stage2": 0.703125, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1500 }, { "completion_length": 9.109375, "epoch": 0.263010338181181, "grad_norm": 24.975522476038243, "kl": 0.0927734375, "learning_rate": 7.371648852286665e-07, "loss": 0.0037, "reward": 1.3885433673858643, "reward_std": 0.36126166582107544, "rewards/accuracy_reward_stage2": 0.40416842699050903, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1501 }, { "completion_length": 8.6875, "epoch": 0.26318556159102857, "grad_norm": 23.351333659034683, "kl": 0.17578125, "learning_rate": 7.36989661818819e-07, "loss": 0.0177, "reward": 1.5084370374679565, "reward_std": 0.2623979449272156, "rewards/accuracy_reward_stage2": 0.5396870374679565, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1502 }, { "completion_length": 8.75, "epoch": 0.2633607850008761, "grad_norm": 22.303221688076096, "kl": 0.125, "learning_rate": 7.368144384089714e-07, "loss": 0.0503, "reward": 1.6632972955703735, "reward_std": 0.3265223205089569, "rewards/accuracy_reward_stage2": 0.6632972955703735, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1503 }, { "completion_length": 10.609375, "epoch": 0.26353600841072367, "grad_norm": 21.468972803101558, "kl": 0.03125, "learning_rate": 7.366392149991239e-07, "loss": 0.0125, "reward": 1.4165986776351929, "reward_std": 0.3401222229003906, "rewards/accuracy_reward_stage2": 0.41659867763519287, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1504 }, { "completion_length": 14.84375, "epoch": 0.2637112318205712, "grad_norm": 20.28889674676962, "kl": 0.041015625, "learning_rate": 7.364639915892763e-07, "loss": 0.0165, "reward": 1.507493495941162, "reward_std": 0.2466462105512619, "rewards/accuracy_reward_stage2": 0.5074934959411621, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1505 }, { "completion_length": 8.8125, "epoch": 0.26388645523041876, "grad_norm": 13.069562216022993, "kl": 0.054443359375, "learning_rate": 7.362887681794288e-07, "loss": -0.0224, "reward": 1.5927482843399048, "reward_std": 0.08447122573852539, "rewards/accuracy_reward_stage2": 0.7333732843399048, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1506 }, { "completion_length": 11.25, "epoch": 0.26406167864026636, "grad_norm": 19.479084888275086, "kl": 0.107421875, "learning_rate": 7.361135447695812e-07, "loss": -0.0244, "reward": 1.7037560939788818, "reward_std": 0.21897412836551666, "rewards/accuracy_reward_stage2": 0.7350061535835266, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1507 }, { "completion_length": 13.3125, "epoch": 0.2642369020501139, "grad_norm": 17.467329132593147, "kl": 0.051513671875, "learning_rate": 7.359383213597336e-07, "loss": 0.0206, "reward": 1.7382456064224243, "reward_std": 0.16379417479038239, "rewards/accuracy_reward_stage2": 0.7382456064224243, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1508 }, { "completion_length": 13.859375, "epoch": 0.26441212545996146, "grad_norm": 33.108877608375465, "kl": 0.2470703125, "learning_rate": 7.357630979498861e-07, "loss": 0.0991, "reward": 1.4780302047729492, "reward_std": 0.03881131857633591, "rewards/accuracy_reward_stage2": 0.6030303239822388, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1509 }, { "completion_length": 13.984375, "epoch": 0.264587348869809, "grad_norm": 27.470318929032366, "kl": 0.1005859375, "learning_rate": 7.355878745400386e-07, "loss": -0.0041, "reward": 1.5446429252624512, "reward_std": 0.3013976216316223, "rewards/accuracy_reward_stage2": 0.6852678060531616, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1510 }, { "completion_length": 13.328125, "epoch": 0.26476257227965655, "grad_norm": 23.887624091153057, "kl": 0.1376953125, "learning_rate": 7.354126511301909e-07, "loss": 0.0552, "reward": 1.4459335803985596, "reward_std": 0.3139682412147522, "rewards/accuracy_reward_stage2": 0.5709335207939148, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1511 }, { "completion_length": 13.859375, "epoch": 0.2649377956895041, "grad_norm": 20.174374690765312, "kl": 0.181640625, "learning_rate": 7.352374277203434e-07, "loss": 0.0727, "reward": 1.328125, "reward_std": 0.2109457552433014, "rewards/accuracy_reward_stage2": 0.453125, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1512 }, { "completion_length": 9.640625, "epoch": 0.2651130190993517, "grad_norm": 23.224227516558585, "kl": 0.10888671875, "learning_rate": 7.350622043104958e-07, "loss": 0.0436, "reward": 1.6329894065856934, "reward_std": 0.21760083734989166, "rewards/accuracy_reward_stage2": 0.6329893469810486, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1513 }, { "completion_length": 13.375, "epoch": 0.26528824250919925, "grad_norm": 16.8682280462148, "kl": 0.051025390625, "learning_rate": 7.348869809006483e-07, "loss": 0.0204, "reward": 1.3874523639678955, "reward_std": 0.14845938980579376, "rewards/accuracy_reward_stage2": 0.38745230436325073, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1514 }, { "completion_length": 8.734375, "epoch": 0.2654634659190468, "grad_norm": 24.619000253706965, "kl": 0.12451171875, "learning_rate": 7.347117574908008e-07, "loss": -0.0269, "reward": 1.5498409271240234, "reward_std": 0.2750667929649353, "rewards/accuracy_reward_stage2": 0.7060908079147339, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1515 }, { "completion_length": 10.640625, "epoch": 0.26563868932889434, "grad_norm": 19.46372314296072, "kl": 0.1396484375, "learning_rate": 7.345365340809531e-07, "loss": 0.0196, "reward": 1.3163824081420898, "reward_std": 0.19993111491203308, "rewards/accuracy_reward_stage2": 0.33200740814208984, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1516 }, { "completion_length": 8.59375, "epoch": 0.2658139127387419, "grad_norm": 16.929295225810957, "kl": 0.044921875, "learning_rate": 7.343613106711056e-07, "loss": 0.018, "reward": 1.6382789611816406, "reward_std": 0.19527596235275269, "rewards/accuracy_reward_stage2": 0.6382789611816406, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1517 }, { "completion_length": 18.453125, "epoch": 0.26598913614858943, "grad_norm": 18.964953779951724, "kl": 0.2216796875, "learning_rate": 7.341860872612581e-07, "loss": 0.0446, "reward": 1.358322024345398, "reward_std": 0.1473166048526764, "rewards/accuracy_reward_stage2": 0.623947024345398, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1518 }, { "completion_length": 12.140625, "epoch": 0.266164359558437, "grad_norm": 28.538663763536473, "kl": 0.05078125, "learning_rate": 7.340108638514105e-07, "loss": 0.0203, "reward": 1.677398681640625, "reward_std": 0.1924341320991516, "rewards/accuracy_reward_stage2": 0.677398681640625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1519 }, { "completion_length": 10.21875, "epoch": 0.2663395829682846, "grad_norm": 22.416505790465667, "kl": 0.1357421875, "learning_rate": 7.33835640441563e-07, "loss": -0.0172, "reward": 1.6864490509033203, "reward_std": 0.31137195229530334, "rewards/accuracy_reward_stage2": 0.7176990509033203, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1520 }, { "completion_length": 10.515625, "epoch": 0.26651480637813213, "grad_norm": 31.324550878208168, "kl": 0.007598876953125, "learning_rate": 7.336604170317154e-07, "loss": 0.003, "reward": 1.8011363744735718, "reward_std": 0.07158337533473969, "rewards/accuracy_reward_stage2": 0.8011363744735718, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1521 }, { "completion_length": 9.78125, "epoch": 0.2666900297879797, "grad_norm": 24.600260940307486, "kl": 0.0966796875, "learning_rate": 7.334851936218679e-07, "loss": 0.0387, "reward": 1.659136176109314, "reward_std": 0.14838439226150513, "rewards/accuracy_reward_stage2": 0.659136176109314, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1522 }, { "completion_length": 10.484375, "epoch": 0.2668652531978272, "grad_norm": 43.7767226666006, "kl": 0.2431640625, "learning_rate": 7.333099702120204e-07, "loss": 0.0426, "reward": 1.4813872575759888, "reward_std": 0.34337955713272095, "rewards/accuracy_reward_stage2": 0.637637197971344, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1523 }, { "completion_length": 15.9375, "epoch": 0.26704047660767477, "grad_norm": 19.529746543141552, "kl": 0.138671875, "learning_rate": 7.331347468021727e-07, "loss": 0.0554, "reward": 1.5571314096450806, "reward_std": 0.18025755882263184, "rewards/accuracy_reward_stage2": 0.5571314692497253, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1524 }, { "completion_length": 13.765625, "epoch": 0.2672157000175223, "grad_norm": 22.63237928794489, "kl": 0.0859375, "learning_rate": 7.329595233923252e-07, "loss": -0.0073, "reward": 1.4489909410476685, "reward_std": 0.15715591609477997, "rewards/accuracy_reward_stage2": 0.7146159410476685, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1525 }, { "completion_length": 11.8125, "epoch": 0.2673909234273699, "grad_norm": 24.14759904765817, "kl": 0.1318359375, "learning_rate": 7.327842999824775e-07, "loss": 0.0526, "reward": 1.4918327331542969, "reward_std": 0.2508474290370941, "rewards/accuracy_reward_stage2": 0.4918326735496521, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1526 }, { "completion_length": 6.75, "epoch": 0.26756614683721747, "grad_norm": 20.278598793689493, "kl": 0.07373046875, "learning_rate": 7.3260907657263e-07, "loss": 0.0296, "reward": 1.3896163702011108, "reward_std": 0.3074378967285156, "rewards/accuracy_reward_stage2": 0.6396163702011108, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1527 }, { "completion_length": 7.484375, "epoch": 0.267741370247065, "grad_norm": 19.871356492961542, "kl": 0.056640625, "learning_rate": 7.324338531627825e-07, "loss": 0.0226, "reward": 1.7014296054840088, "reward_std": 0.1917523890733719, "rewards/accuracy_reward_stage2": 0.7014296054840088, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1528 }, { "completion_length": 9.015625, "epoch": 0.26791659365691256, "grad_norm": 18.416822579452255, "kl": 0.04052734375, "learning_rate": 7.322586297529349e-07, "loss": 0.0162, "reward": 1.6586830615997314, "reward_std": 0.15771014988422394, "rewards/accuracy_reward_stage2": 0.6586830615997314, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1529 }, { "completion_length": 13.03125, "epoch": 0.2680918170667601, "grad_norm": 157.14169101763926, "kl": 0.91015625, "learning_rate": 7.320834063430874e-07, "loss": 0.3213, "reward": 1.418661117553711, "reward_std": 0.2747938632965088, "rewards/accuracy_reward_stage2": 0.6842861175537109, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1530 }, { "completion_length": 7.8125, "epoch": 0.26826704047660765, "grad_norm": 17.26589900919268, "kl": 0.05908203125, "learning_rate": 7.319081829332399e-07, "loss": 0.0237, "reward": 1.5729167461395264, "reward_std": 0.14022307097911835, "rewards/accuracy_reward_stage2": 0.5729166865348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1531 }, { "completion_length": 21.59375, "epoch": 0.26844226388645526, "grad_norm": 16.03669517849557, "kl": 0.0203857421875, "learning_rate": 7.317329595233923e-07, "loss": 0.0082, "reward": 1.642247200012207, "reward_std": 0.13648752868175507, "rewards/accuracy_reward_stage2": 0.7672471404075623, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1532 }, { "completion_length": 8.609375, "epoch": 0.2686174872963028, "grad_norm": 21.116938570932263, "kl": 0.06396484375, "learning_rate": 7.315577361135448e-07, "loss": -0.0187, "reward": 1.3490604162216187, "reward_std": 0.34063291549682617, "rewards/accuracy_reward_stage2": 0.48968538641929626, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1533 }, { "completion_length": 13.015625, "epoch": 0.26879271070615035, "grad_norm": 21.123971038254172, "kl": 0.08837890625, "learning_rate": 7.313825127036972e-07, "loss": 0.0354, "reward": 1.433570146560669, "reward_std": 0.17886027693748474, "rewards/accuracy_reward_stage2": 0.5585700869560242, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1534 }, { "completion_length": 12.40625, "epoch": 0.2689679341159979, "grad_norm": 19.209208955755024, "kl": 0.083984375, "learning_rate": 7.312072892938497e-07, "loss": -0.0107, "reward": 1.373374104499817, "reward_std": 0.22035284340381622, "rewards/accuracy_reward_stage2": 0.5139991044998169, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1535 }, { "completion_length": 11.046875, "epoch": 0.26914315752584544, "grad_norm": 17.626216892205676, "kl": 0.040283203125, "learning_rate": 7.310320658840022e-07, "loss": 0.0161, "reward": 1.4475104808807373, "reward_std": 0.16717243194580078, "rewards/accuracy_reward_stage2": 0.4475104808807373, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1536 }, { "completion_length": 12.828125, "epoch": 0.269318380935693, "grad_norm": 20.258858135507833, "kl": 0.08984375, "learning_rate": 7.308568424741544e-07, "loss": 0.036, "reward": 1.6602098941802979, "reward_std": 0.1800985336303711, "rewards/accuracy_reward_stage2": 0.6602099537849426, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1537 }, { "completion_length": 11.953125, "epoch": 0.26949360434554054, "grad_norm": 18.407349718764067, "kl": 0.056396484375, "learning_rate": 7.306816190643069e-07, "loss": 0.0225, "reward": 1.3184478282928467, "reward_std": 0.12989208102226257, "rewards/accuracy_reward_stage2": 0.3184478282928467, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1538 }, { "completion_length": 9.390625, "epoch": 0.26966882775538814, "grad_norm": 19.289921141580795, "kl": 0.08203125, "learning_rate": 7.305063956544594e-07, "loss": 0.0328, "reward": 1.4996446371078491, "reward_std": 0.2246101200580597, "rewards/accuracy_reward_stage2": 0.7496446371078491, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1539 }, { "completion_length": 6.484375, "epoch": 0.2698440511652357, "grad_norm": 12.441059142254845, "kl": 0.04541015625, "learning_rate": 7.303311722446118e-07, "loss": 0.0181, "reward": 1.5642765760421753, "reward_std": 0.08475670963525772, "rewards/accuracy_reward_stage2": 0.6892765760421753, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1540 }, { "completion_length": 12.359375, "epoch": 0.27001927457508323, "grad_norm": 24.390854012867567, "kl": 0.333984375, "learning_rate": 7.301559488347643e-07, "loss": 0.0978, "reward": 1.6280450820922852, "reward_std": 0.20801720023155212, "rewards/accuracy_reward_stage2": 0.7686700820922852, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1541 }, { "completion_length": 10.609375, "epoch": 0.2701944979849308, "grad_norm": 19.020692683382507, "kl": 0.01507568359375, "learning_rate": 7.299807254249167e-07, "loss": 0.006, "reward": 1.4479167461395264, "reward_std": 0.1515468955039978, "rewards/accuracy_reward_stage2": 0.4479166865348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1542 }, { "completion_length": 9.546875, "epoch": 0.27036972139477833, "grad_norm": 18.919351767185397, "kl": 0.0703125, "learning_rate": 7.298055020150692e-07, "loss": 0.0282, "reward": 1.7478828430175781, "reward_std": 0.18501171469688416, "rewards/accuracy_reward_stage2": 0.7478827834129333, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1543 }, { "completion_length": 8.015625, "epoch": 0.2705449448046259, "grad_norm": 19.402100954334866, "kl": 0.1044921875, "learning_rate": 7.296302786052217e-07, "loss": 0.0417, "reward": 1.5251411199569702, "reward_std": 0.26569750905036926, "rewards/accuracy_reward_stage2": 0.5251411199569702, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1544 }, { "completion_length": 10.28125, "epoch": 0.2707201682144735, "grad_norm": 15.03881206013984, "kl": 0.02001953125, "learning_rate": 7.294550551953741e-07, "loss": 0.008, "reward": 1.7756855487823486, "reward_std": 0.12909118831157684, "rewards/accuracy_reward_stage2": 0.7756855487823486, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1545 }, { "completion_length": 6.96875, "epoch": 0.270895391624321, "grad_norm": 13.6036257647761, "kl": 0.361328125, "learning_rate": 7.292798317855265e-07, "loss": 0.1449, "reward": 1.4483622312545776, "reward_std": 0.13131847977638245, "rewards/accuracy_reward_stage2": 0.6983622312545776, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1546 }, { "completion_length": 8.296875, "epoch": 0.27107061503416857, "grad_norm": 19.47664920431056, "kl": 0.1357421875, "learning_rate": 7.29104608375679e-07, "loss": 0.0119, "reward": 1.723066806793213, "reward_std": 0.2823743224143982, "rewards/accuracy_reward_stage2": 0.7386916875839233, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1547 }, { "completion_length": 7.84375, "epoch": 0.2712458384440161, "grad_norm": 17.67104325953167, "kl": 0.08935546875, "learning_rate": 7.289293849658314e-07, "loss": 0.0358, "reward": 1.5002610683441162, "reward_std": 0.27437451481819153, "rewards/accuracy_reward_stage2": 0.6252610683441162, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1548 }, { "completion_length": 14.328125, "epoch": 0.27142106185386367, "grad_norm": 21.85574395429999, "kl": 0.0220947265625, "learning_rate": 7.287541615559838e-07, "loss": 0.0088, "reward": 1.491215467453003, "reward_std": 0.20244181156158447, "rewards/accuracy_reward_stage2": 0.4912155270576477, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1549 }, { "completion_length": 12.46875, "epoch": 0.2715962852637112, "grad_norm": 21.99441168996388, "kl": 0.11376953125, "learning_rate": 7.285789381461362e-07, "loss": 0.0454, "reward": 1.4256680011749268, "reward_std": 0.2561571002006531, "rewards/accuracy_reward_stage2": 0.5506680607795715, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1550 }, { "completion_length": 10.84375, "epoch": 0.2717715086735588, "grad_norm": 19.485417665693184, "kl": 0.1865234375, "learning_rate": 7.284037147362887e-07, "loss": -0.0068, "reward": 1.4601422548294067, "reward_std": 0.2415839433670044, "rewards/accuracy_reward_stage2": 0.4913923144340515, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1551 }, { "completion_length": 6.484375, "epoch": 0.27194673208340636, "grad_norm": 19.868362707801886, "kl": 0.1474609375, "learning_rate": 7.282284913264412e-07, "loss": 0.03, "reward": 1.6962921619415283, "reward_std": 0.23688717186450958, "rewards/accuracy_reward_stage2": 0.7119171619415283, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1552 }, { "completion_length": 8.765625, "epoch": 0.2721219554932539, "grad_norm": 21.00231032813797, "kl": 0.0947265625, "learning_rate": 7.280532679165936e-07, "loss": 0.038, "reward": 1.3415729999542236, "reward_std": 0.28956979513168335, "rewards/accuracy_reward_stage2": 0.46657294034957886, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1553 }, { "completion_length": 10.171875, "epoch": 0.27229717890310146, "grad_norm": 16.49101090303168, "kl": 0.048095703125, "learning_rate": 7.278780445067461e-07, "loss": -0.0791, "reward": 1.3982372283935547, "reward_std": 0.2809445261955261, "rewards/accuracy_reward_stage2": 0.4451121687889099, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1554 }, { "completion_length": 9.15625, "epoch": 0.272472402312949, "grad_norm": 20.813752137808752, "kl": 0.10986328125, "learning_rate": 7.277028210968986e-07, "loss": -0.0759, "reward": 1.5947017669677734, "reward_std": 0.34263890981674194, "rewards/accuracy_reward_stage2": 0.6415768265724182, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1555 }, { "completion_length": 14.953125, "epoch": 0.27264762572279655, "grad_norm": 19.569881932631915, "kl": 0.050048828125, "learning_rate": 7.275275976870509e-07, "loss": 0.02, "reward": 1.2932356595993042, "reward_std": 0.14172068238258362, "rewards/accuracy_reward_stage2": 0.4182356595993042, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1556 }, { "completion_length": 10.984375, "epoch": 0.2728228491326441, "grad_norm": 23.275062004848152, "kl": 0.2412109375, "learning_rate": 7.273523742772034e-07, "loss": 0.0523, "reward": 1.5078812837600708, "reward_std": 0.26423394680023193, "rewards/accuracy_reward_stage2": 0.5235062837600708, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1557 }, { "completion_length": 8.265625, "epoch": 0.2729980725424917, "grad_norm": 16.222941710110852, "kl": 0.08251953125, "learning_rate": 7.271771508673558e-07, "loss": 0.0329, "reward": 1.6588354110717773, "reward_std": 0.08715774118900299, "rewards/accuracy_reward_stage2": 0.6588354110717773, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1558 }, { "completion_length": 9.96875, "epoch": 0.27317329595233925, "grad_norm": 15.832014677569221, "kl": 0.061767578125, "learning_rate": 7.270019274575083e-07, "loss": -0.0195, "reward": 1.5660628080368042, "reward_std": 0.17610442638397217, "rewards/accuracy_reward_stage2": 0.5816878080368042, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1559 }, { "completion_length": 8.203125, "epoch": 0.2733485193621868, "grad_norm": 22.731903546764354, "kl": 0.1298828125, "learning_rate": 7.268267040476608e-07, "loss": 0.008, "reward": 1.6920348405838013, "reward_std": 0.24214372038841248, "rewards/accuracy_reward_stage2": 0.7076598405838013, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1560 }, { "completion_length": 9.96875, "epoch": 0.27352374277203434, "grad_norm": 23.863983210931238, "kl": 0.16015625, "learning_rate": 7.266514806378132e-07, "loss": 0.0203, "reward": 1.4968124628067017, "reward_std": 0.13817864656448364, "rewards/accuracy_reward_stage2": 0.6374374628067017, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1561 }, { "completion_length": 10.765625, "epoch": 0.2736989661818819, "grad_norm": 21.012253949955664, "kl": 0.08154296875, "learning_rate": 7.264762572279656e-07, "loss": -0.0115, "reward": 1.4106385707855225, "reward_std": 0.292081743478775, "rewards/accuracy_reward_stage2": 0.42626360058784485, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1562 }, { "completion_length": 10.28125, "epoch": 0.27387418959172943, "grad_norm": 18.270956325101377, "kl": 0.2431640625, "learning_rate": 7.263010338181181e-07, "loss": 0.0971, "reward": 1.5469186305999756, "reward_std": 0.25838232040405273, "rewards/accuracy_reward_stage2": 0.6719185709953308, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1563 }, { "completion_length": 6.90625, "epoch": 0.27404941300157704, "grad_norm": 16.509044988635893, "kl": 0.03125, "learning_rate": 7.261258104082705e-07, "loss": 0.0126, "reward": 1.7471439838409424, "reward_std": 0.11338774114847183, "rewards/accuracy_reward_stage2": 0.7471439838409424, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1564 }, { "completion_length": 13.078125, "epoch": 0.2742246364114246, "grad_norm": 23.28838918712886, "kl": 0.1279296875, "learning_rate": 7.25950586998423e-07, "loss": 0.0512, "reward": 1.424917459487915, "reward_std": 0.12856319546699524, "rewards/accuracy_reward_stage2": 0.674917459487915, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1565 }, { "completion_length": 12.46875, "epoch": 0.27439985982127213, "grad_norm": 23.4382924648424, "kl": 0.12109375, "learning_rate": 7.257753635885753e-07, "loss": 0.0483, "reward": 1.663330316543579, "reward_std": 0.17533919215202332, "rewards/accuracy_reward_stage2": 0.7883303165435791, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1566 }, { "completion_length": 12.0625, "epoch": 0.2745750832311197, "grad_norm": 29.567678011581055, "kl": 0.03271484375, "learning_rate": 7.256001401787278e-07, "loss": -0.0312, "reward": 1.4522767066955566, "reward_std": 0.2672945261001587, "rewards/accuracy_reward_stage2": 0.46790170669555664, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1567 }, { "completion_length": 12.6875, "epoch": 0.2747503066409672, "grad_norm": 29.62746153373393, "kl": 0.01361083984375, "learning_rate": 7.254249167688803e-07, "loss": 0.0054, "reward": 1.375, "reward_std": 0.2709311842918396, "rewards/accuracy_reward_stage2": 0.375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1568 }, { "completion_length": 11.375, "epoch": 0.27492553005081477, "grad_norm": 17.557370589904153, "kl": 0.1123046875, "learning_rate": 7.252496933590327e-07, "loss": -0.0266, "reward": 1.325448989868164, "reward_std": 0.2274552583694458, "rewards/accuracy_reward_stage2": 0.48169901967048645, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1569 }, { "completion_length": 10.890625, "epoch": 0.2751007534606623, "grad_norm": 27.34549604652153, "kl": 0.11328125, "learning_rate": 7.250744699491852e-07, "loss": -0.0535, "reward": 1.5265979766845703, "reward_std": 0.3133997321128845, "rewards/accuracy_reward_stage2": 0.5734728574752808, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1570 }, { "completion_length": 7.421875, "epoch": 0.2752759768705099, "grad_norm": 18.701687375757633, "kl": 0.2158203125, "learning_rate": 7.248992465393377e-07, "loss": 0.0444, "reward": 1.6914236545562744, "reward_std": 0.1895194947719574, "rewards/accuracy_reward_stage2": 0.7070485353469849, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1571 }, { "completion_length": 9.109375, "epoch": 0.27545120028035747, "grad_norm": 14.077681371050371, "kl": 0.06103515625, "learning_rate": 7.247240231294901e-07, "loss": -0.0434, "reward": 1.7748515605926514, "reward_std": 0.1633080542087555, "rewards/accuracy_reward_stage2": 0.8061015009880066, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1572 }, { "completion_length": 10.90625, "epoch": 0.275626423690205, "grad_norm": 23.67382172325355, "kl": 0.2373046875, "learning_rate": 7.245487997196426e-07, "loss": 0.0242, "reward": 1.2528319358825684, "reward_std": 0.2686885595321655, "rewards/accuracy_reward_stage2": 0.40908199548721313, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1573 }, { "completion_length": 8.8125, "epoch": 0.27580164710005256, "grad_norm": 22.988428113639372, "kl": 0.09716796875, "learning_rate": 7.24373576309795e-07, "loss": 0.039, "reward": 1.665795087814331, "reward_std": 0.23197835683822632, "rewards/accuracy_reward_stage2": 0.665795087814331, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1574 }, { "completion_length": 7.859375, "epoch": 0.2759768705099001, "grad_norm": 19.34328584263878, "kl": 0.07275390625, "learning_rate": 7.241983528999474e-07, "loss": 0.029, "reward": 1.4947218894958496, "reward_std": 0.1720176637172699, "rewards/accuracy_reward_stage2": 0.6197218894958496, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1575 }, { "completion_length": 15.5625, "epoch": 0.27615209391974765, "grad_norm": 22.570700861758745, "kl": 0.041748046875, "learning_rate": 7.240231294900998e-07, "loss": 0.0167, "reward": 1.4003034830093384, "reward_std": 0.22432449460029602, "rewards/accuracy_reward_stage2": 0.5253034830093384, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1576 }, { "completion_length": 9.609375, "epoch": 0.27632731732959526, "grad_norm": 19.67214373662556, "kl": 0.11083984375, "learning_rate": 7.238479060802522e-07, "loss": -0.0314, "reward": 1.4500322341918945, "reward_std": 0.21745267510414124, "rewards/accuracy_reward_stage2": 0.48128223419189453, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1577 }, { "completion_length": 23.21875, "epoch": 0.2765025407394428, "grad_norm": 22.79993085876522, "kl": 0.06982421875, "learning_rate": 7.236726826704047e-07, "loss": 0.0139, "reward": 1.3926570415496826, "reward_std": 0.17935630679130554, "rewards/accuracy_reward_stage2": 0.5489070415496826, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1578 }, { "completion_length": 10.1875, "epoch": 0.27667776414929035, "grad_norm": 23.08514350231986, "kl": 0.0888671875, "learning_rate": 7.234974592605572e-07, "loss": -0.0061, "reward": 1.7529489994049072, "reward_std": 0.20225925743579865, "rewards/accuracy_reward_stage2": 0.7685739398002625, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1579 }, { "completion_length": 13.5625, "epoch": 0.2768529875591379, "grad_norm": 21.042427743109787, "kl": 0.04345703125, "learning_rate": 7.233222358507096e-07, "loss": -0.0268, "reward": 1.4279401302337646, "reward_std": 0.16784167289733887, "rewards/accuracy_reward_stage2": 0.44356510043144226, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1580 }, { "completion_length": 10.390625, "epoch": 0.27702821096898544, "grad_norm": 77.07830159883746, "kl": 0.4609375, "learning_rate": 7.231470124408621e-07, "loss": 0.1237, "reward": 1.6272919178009033, "reward_std": 0.2883310914039612, "rewards/accuracy_reward_stage2": 0.6585419178009033, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1581 }, { "completion_length": 11.0, "epoch": 0.277203434378833, "grad_norm": 18.558360183545652, "kl": 0.055908203125, "learning_rate": 7.229717890310145e-07, "loss": -0.0218, "reward": 1.8247655630111694, "reward_std": 0.20700310170650482, "rewards/accuracy_reward_stage2": 0.8403905034065247, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1582 }, { "completion_length": 10.546875, "epoch": 0.2773786577886806, "grad_norm": 14.116427509022715, "kl": 0.1572265625, "learning_rate": 7.22796565621167e-07, "loss": 0.0345, "reward": 0.8919172286987305, "reward_std": 0.153579980134964, "rewards/accuracy_reward_stage2": 0.28254222869873047, "rewards/format_reward_stage1_pointerpad": 0.609375, "scores/accuracy_reward_stage2": 0.609375, "step": 1583 }, { "completion_length": 8.0625, "epoch": 0.27755388119852814, "grad_norm": 20.72078723909815, "kl": 0.1025390625, "learning_rate": 7.226213422113195e-07, "loss": -0.0296, "reward": 1.6172088384628296, "reward_std": 0.2659575641155243, "rewards/accuracy_reward_stage2": 0.6484588384628296, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1584 }, { "completion_length": 11.34375, "epoch": 0.2777291046083757, "grad_norm": 26.16018586016227, "kl": 0.208984375, "learning_rate": 7.224461188014719e-07, "loss": 0.0837, "reward": 1.5841631889343262, "reward_std": 0.2787715196609497, "rewards/accuracy_reward_stage2": 0.5841631889343262, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1585 }, { "completion_length": 10.296875, "epoch": 0.27790432801822323, "grad_norm": 19.981196513188134, "kl": 0.048095703125, "learning_rate": 7.222708953916243e-07, "loss": -0.025, "reward": 1.5554091930389404, "reward_std": 0.24271945655345917, "rewards/accuracy_reward_stage2": 0.6960341930389404, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1586 }, { "completion_length": 9.140625, "epoch": 0.2780795514280708, "grad_norm": 23.940325381236757, "kl": 0.2490234375, "learning_rate": 7.220956719817766e-07, "loss": 0.0579, "reward": 1.2890129089355469, "reward_std": 0.29061371088027954, "rewards/accuracy_reward_stage2": 0.4452629089355469, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1587 }, { "completion_length": 7.84375, "epoch": 0.27825477483791833, "grad_norm": 20.466731457336554, "kl": 0.054443359375, "learning_rate": 7.219204485719291e-07, "loss": 0.0218, "reward": 1.5818238258361816, "reward_std": 0.15346041321754456, "rewards/accuracy_reward_stage2": 0.5818238258361816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1588 }, { "completion_length": 7.6875, "epoch": 0.2784299982477659, "grad_norm": 27.632750470965156, "kl": 0.09228515625, "learning_rate": 7.217452251620816e-07, "loss": 0.0176, "reward": 1.4603047370910645, "reward_std": 0.25379306077957153, "rewards/accuracy_reward_stage2": 0.6009297370910645, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1589 }, { "completion_length": 7.453125, "epoch": 0.2786052216576135, "grad_norm": 19.974365273381522, "kl": 0.06884765625, "learning_rate": 7.21570001752234e-07, "loss": 0.0276, "reward": 1.343886375427246, "reward_std": 0.21338877081871033, "rewards/accuracy_reward_stage2": 0.46888643503189087, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1590 }, { "completion_length": 8.296875, "epoch": 0.278780445067461, "grad_norm": 21.637228824670125, "kl": 0.060302734375, "learning_rate": 7.213947783423865e-07, "loss": -0.0038, "reward": 1.4773318767547607, "reward_std": 0.2645827531814575, "rewards/accuracy_reward_stage2": 0.49295687675476074, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1591 }, { "completion_length": 10.375, "epoch": 0.27895566847730857, "grad_norm": 19.634208559796395, "kl": 0.07177734375, "learning_rate": 7.21219554932539e-07, "loss": -0.0597, "reward": 1.3822365999221802, "reward_std": 0.33031585812568665, "rewards/accuracy_reward_stage2": 0.5384865999221802, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1592 }, { "completion_length": 11.109375, "epoch": 0.2791308918871561, "grad_norm": 36.81881300779984, "kl": 0.384765625, "learning_rate": 7.210443315226914e-07, "loss": 0.1539, "reward": 1.5430142879486084, "reward_std": 0.26516449451446533, "rewards/accuracy_reward_stage2": 0.7930142283439636, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1593 }, { "completion_length": 13.84375, "epoch": 0.27930611529700367, "grad_norm": 21.62546848781223, "kl": 0.02099609375, "learning_rate": 7.208691081128439e-07, "loss": -0.0349, "reward": 1.7699782848358154, "reward_std": 0.22004219889640808, "rewards/accuracy_reward_stage2": 0.7856031656265259, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1594 }, { "completion_length": 12.875, "epoch": 0.2794813387068512, "grad_norm": 18.89892017703249, "kl": 0.0712890625, "learning_rate": 7.206938847029962e-07, "loss": -0.0105, "reward": 1.6201822757720947, "reward_std": 0.22205275297164917, "rewards/accuracy_reward_stage2": 0.6358071565628052, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1595 }, { "completion_length": 10.96875, "epoch": 0.2796565621166988, "grad_norm": 9.189918380732902, "kl": 0.03955078125, "learning_rate": 7.205186612931487e-07, "loss": -0.0726, "reward": 1.7642738819122314, "reward_std": 0.10237517952919006, "rewards/accuracy_reward_stage2": 0.7955238223075867, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1596 }, { "completion_length": 7.4375, "epoch": 0.27983178552654636, "grad_norm": 15.274296589152023, "kl": 0.0693359375, "learning_rate": 7.203434378833012e-07, "loss": -0.0771, "reward": 1.8618519306182861, "reward_std": 0.2493779957294464, "rewards/accuracy_reward_stage2": 0.9087268710136414, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1597 }, { "completion_length": 11.5, "epoch": 0.2800070089363939, "grad_norm": 42.76093281964334, "kl": 0.1259765625, "learning_rate": 7.201682144734536e-07, "loss": 0.006, "reward": 1.4967622756958008, "reward_std": 0.19433635473251343, "rewards/accuracy_reward_stage2": 0.512387216091156, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1598 }, { "completion_length": 17.078125, "epoch": 0.28018223234624146, "grad_norm": 22.19032833031183, "kl": 0.248046875, "learning_rate": 7.199929910636061e-07, "loss": 0.0313, "reward": 1.3876421451568604, "reward_std": 0.24027925729751587, "rewards/accuracy_reward_stage2": 0.6688920259475708, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 1599 }, { "completion_length": 11.890625, "epoch": 0.280357455756089, "grad_norm": 22.30237398012051, "kl": 0.2353515625, "learning_rate": 7.198177676537585e-07, "loss": -0.0071, "reward": 1.7141770124435425, "reward_std": 0.3087601959705353, "rewards/accuracy_reward_stage2": 0.7610519528388977, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1600 }, { "completion_length": 9.21875, "epoch": 0.28053267916593655, "grad_norm": 8.829576239530924, "kl": 0.0703125, "learning_rate": 7.196425442439109e-07, "loss": 0.0281, "reward": 1.7324020862579346, "reward_std": 0.03725196793675423, "rewards/accuracy_reward_stage2": 0.8574021458625793, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1601 }, { "completion_length": 12.5, "epoch": 0.28070790257578415, "grad_norm": 15.991249185161834, "kl": 0.0791015625, "learning_rate": 7.194673208340634e-07, "loss": 0.0317, "reward": 1.5364727973937988, "reward_std": 0.11035994440317154, "rewards/accuracy_reward_stage2": 0.5364727973937988, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1602 }, { "completion_length": 8.03125, "epoch": 0.2808831259856317, "grad_norm": 17.575426900536577, "kl": 0.1298828125, "learning_rate": 7.192920974242158e-07, "loss": -0.0363, "reward": 1.6127972602844238, "reward_std": 0.15401007235050201, "rewards/accuracy_reward_stage2": 0.6440472602844238, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1603 }, { "completion_length": 16.203125, "epoch": 0.28105834939547925, "grad_norm": 17.905045894586475, "kl": 0.11474609375, "learning_rate": 7.191168740143683e-07, "loss": -0.0864, "reward": 1.4100637435913086, "reward_std": 0.2545863091945648, "rewards/accuracy_reward_stage2": 0.4569387435913086, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1604 }, { "completion_length": 8.640625, "epoch": 0.2812335728053268, "grad_norm": 20.18219517559298, "kl": 0.0556640625, "learning_rate": 7.189416506045208e-07, "loss": -0.0219, "reward": 1.4022883176803589, "reward_std": 0.21513070166110992, "rewards/accuracy_reward_stage2": 0.41791337728500366, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1605 }, { "completion_length": 9.34375, "epoch": 0.28140879621517434, "grad_norm": 22.68462923752165, "kl": 0.0947265625, "learning_rate": 7.187664271946731e-07, "loss": -0.0504, "reward": 1.4744114875793457, "reward_std": 0.28335297107696533, "rewards/accuracy_reward_stage2": 0.5056614875793457, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1606 }, { "completion_length": 10.375, "epoch": 0.2815840196250219, "grad_norm": 52.28396048008046, "kl": 0.205078125, "learning_rate": 7.185912037848256e-07, "loss": 0.0466, "reward": 1.6137253046035767, "reward_std": 0.33331912755966187, "rewards/accuracy_reward_stage2": 0.6293503046035767, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1607 }, { "completion_length": 8.875, "epoch": 0.28175924303486943, "grad_norm": 28.125928030587517, "kl": 0.13671875, "learning_rate": 7.184159803749781e-07, "loss": 0.0229, "reward": 1.5231481790542603, "reward_std": 0.31302833557128906, "rewards/accuracy_reward_stage2": 0.6637731790542603, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1608 }, { "completion_length": 14.96875, "epoch": 0.28193446644471704, "grad_norm": 31.768402575173674, "kl": 0.08349609375, "learning_rate": 7.182407569651305e-07, "loss": -0.0108, "reward": 1.2850186824798584, "reward_std": 0.22496335208415985, "rewards/accuracy_reward_stage2": 0.3006437122821808, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1609 }, { "completion_length": 14.546875, "epoch": 0.2821096898545646, "grad_norm": 21.607326402315472, "kl": 0.1416015625, "learning_rate": 7.18065533555283e-07, "loss": 0.022, "reward": 1.6873608827590942, "reward_std": 0.19603906571865082, "rewards/accuracy_reward_stage2": 0.7029858231544495, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1610 }, { "completion_length": 13.75, "epoch": 0.28228491326441213, "grad_norm": 15.443224102301414, "kl": 0.05859375, "learning_rate": 7.178903101454354e-07, "loss": 0.0234, "reward": 1.625, "reward_std": 0.1157275140285492, "rewards/accuracy_reward_stage2": 0.75, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1611 }, { "completion_length": 21.28125, "epoch": 0.2824601366742597, "grad_norm": 27.345953796390596, "kl": 0.23046875, "learning_rate": 7.177150867355879e-07, "loss": -0.0345, "reward": 1.4842654466629028, "reward_std": 0.28159990906715393, "rewards/accuracy_reward_stage2": 0.5311404466629028, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1612 }, { "completion_length": 10.28125, "epoch": 0.2826353600841072, "grad_norm": 10.134441654602675, "kl": 0.091796875, "learning_rate": 7.175398633257403e-07, "loss": -0.0052, "reward": 1.9131689071655273, "reward_std": 0.1398359090089798, "rewards/accuracy_reward_stage2": 0.9287939667701721, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1613 }, { "completion_length": 9.28125, "epoch": 0.28281058349395477, "grad_norm": 21.48645768776718, "kl": 0.138671875, "learning_rate": 7.173646399158927e-07, "loss": 0.0092, "reward": 1.359375, "reward_std": 0.32878512144088745, "rewards/accuracy_reward_stage2": 0.515625, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1614 }, { "completion_length": 15.421875, "epoch": 0.2829858069038024, "grad_norm": 19.737474472499912, "kl": 0.06787109375, "learning_rate": 7.171894165060451e-07, "loss": 0.0272, "reward": 1.5567772388458252, "reward_std": 0.11542443186044693, "rewards/accuracy_reward_stage2": 0.68177729845047, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1615 }, { "completion_length": 13.578125, "epoch": 0.2831610303136499, "grad_norm": 24.78032036079619, "kl": 0.1142578125, "learning_rate": 7.170141930961976e-07, "loss": 0.0457, "reward": 1.5786793231964111, "reward_std": 0.21386365592479706, "rewards/accuracy_reward_stage2": 0.5786793828010559, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1616 }, { "completion_length": 6.640625, "epoch": 0.28333625372349747, "grad_norm": 23.418408766851822, "kl": 0.2265625, "learning_rate": 7.1683896968635e-07, "loss": -0.0405, "reward": 1.5210037231445312, "reward_std": 0.34530240297317505, "rewards/accuracy_reward_stage2": 0.567878782749176, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1617 }, { "completion_length": 9.03125, "epoch": 0.283511477133345, "grad_norm": 18.082312553657825, "kl": 0.11767578125, "learning_rate": 7.166637462765025e-07, "loss": 0.0111, "reward": 1.6185312271118164, "reward_std": 0.1870284378528595, "rewards/accuracy_reward_stage2": 0.6497811079025269, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1618 }, { "completion_length": 9.078125, "epoch": 0.28368670054319256, "grad_norm": 17.717906115355813, "kl": 0.031982421875, "learning_rate": 7.164885228666549e-07, "loss": 0.0128, "reward": 1.6676509380340576, "reward_std": 0.13465890288352966, "rewards/accuracy_reward_stage2": 0.6676508784294128, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1619 }, { "completion_length": 13.53125, "epoch": 0.2838619239530401, "grad_norm": 20.824326637564756, "kl": 0.138671875, "learning_rate": 7.163132994568074e-07, "loss": 0.0557, "reward": 1.265625, "reward_std": 0.19044628739356995, "rewards/accuracy_reward_stage2": 0.390625, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1620 }, { "completion_length": 13.578125, "epoch": 0.28403714736288765, "grad_norm": 22.11145305537472, "kl": 0.1728515625, "learning_rate": 7.161380760469599e-07, "loss": 0.0624, "reward": 1.6914026737213135, "reward_std": 0.1933891624212265, "rewards/accuracy_reward_stage2": 0.8320276737213135, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1621 }, { "completion_length": 15.359375, "epoch": 0.28421237077273526, "grad_norm": 18.671780608441452, "kl": 0.04443359375, "learning_rate": 7.159628526371123e-07, "loss": -0.0599, "reward": 1.3597071170806885, "reward_std": 0.16765311360359192, "rewards/accuracy_reward_stage2": 0.6409571170806885, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 1622 }, { "completion_length": 9.5, "epoch": 0.2843875941825828, "grad_norm": 19.783864825763466, "kl": 0.07080078125, "learning_rate": 7.157876292272648e-07, "loss": -0.0091, "reward": 1.5630043745040894, "reward_std": 0.2638583779335022, "rewards/accuracy_reward_stage2": 0.7036293745040894, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1623 }, { "completion_length": 11.09375, "epoch": 0.28456281759243035, "grad_norm": 18.045229465095055, "kl": 0.0634765625, "learning_rate": 7.156124058174173e-07, "loss": 0.0254, "reward": 1.6278626918792725, "reward_std": 0.16786369681358337, "rewards/accuracy_reward_stage2": 0.627862811088562, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1624 }, { "completion_length": 11.0, "epoch": 0.2847380410022779, "grad_norm": 22.45995158178718, "kl": 0.126953125, "learning_rate": 7.154371824075697e-07, "loss": 0.0505, "reward": 1.616892695426941, "reward_std": 0.14647267758846283, "rewards/accuracy_reward_stage2": 0.6168926954269409, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1625 }, { "completion_length": 9.015625, "epoch": 0.28491326441212544, "grad_norm": 16.218214133076454, "kl": 0.224609375, "learning_rate": 7.15261958997722e-07, "loss": 0.0478, "reward": 1.5607107877731323, "reward_std": 0.15622293949127197, "rewards/accuracy_reward_stage2": 0.7013357281684875, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1626 }, { "completion_length": 10.0, "epoch": 0.285088487821973, "grad_norm": 18.28017136924628, "kl": 0.1630859375, "learning_rate": 7.150867355878744e-07, "loss": 0.0653, "reward": 1.3840994834899902, "reward_std": 0.1972706913948059, "rewards/accuracy_reward_stage2": 0.38409942388534546, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1627 }, { "completion_length": 10.34375, "epoch": 0.2852637112318206, "grad_norm": 18.30002553390304, "kl": 0.0162353515625, "learning_rate": 7.149115121780269e-07, "loss": 0.0065, "reward": 1.625, "reward_std": 0.0883883461356163, "rewards/accuracy_reward_stage2": 0.75, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1628 }, { "completion_length": 6.28125, "epoch": 0.28543893464166814, "grad_norm": 18.105026194272845, "kl": 0.09619140625, "learning_rate": 7.147362887681794e-07, "loss": 0.0385, "reward": 1.465050220489502, "reward_std": 0.14513222873210907, "rewards/accuracy_reward_stage2": 0.46505028009414673, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1629 }, { "completion_length": 10.890625, "epoch": 0.2856141580515157, "grad_norm": 16.56397158084368, "kl": 0.1298828125, "learning_rate": 7.145610653583318e-07, "loss": 0.0077, "reward": 1.2697676420211792, "reward_std": 0.25002607703208923, "rewards/accuracy_reward_stage2": 0.5353926420211792, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1630 }, { "completion_length": 14.4375, "epoch": 0.28578938146136323, "grad_norm": 19.65409355902699, "kl": 0.1279296875, "learning_rate": 7.143858419484843e-07, "loss": 0.0514, "reward": 1.4833333492279053, "reward_std": 0.21255075931549072, "rewards/accuracy_reward_stage2": 0.8583333492279053, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 1631 }, { "completion_length": 6.5, "epoch": 0.2859646048712108, "grad_norm": 25.046007821442476, "kl": 0.076171875, "learning_rate": 7.142106185386368e-07, "loss": 0.0305, "reward": 1.5952610969543457, "reward_std": 0.22790226340293884, "rewards/accuracy_reward_stage2": 0.5952612161636353, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1632 }, { "completion_length": 9.78125, "epoch": 0.28613982828105833, "grad_norm": 22.298551409705386, "kl": 0.078125, "learning_rate": 7.140353951287892e-07, "loss": -0.0022, "reward": 1.5244977474212646, "reward_std": 0.33707937598228455, "rewards/accuracy_reward_stage2": 0.6651226282119751, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1633 }, { "completion_length": 13.3125, "epoch": 0.28631505169090593, "grad_norm": 19.77506952533496, "kl": 0.26171875, "learning_rate": 7.138601717189417e-07, "loss": 0.0391, "reward": 1.274181604385376, "reward_std": 0.18674418330192566, "rewards/accuracy_reward_stage2": 0.4304315447807312, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1634 }, { "completion_length": 7.359375, "epoch": 0.2864902751007535, "grad_norm": 22.0114841950705, "kl": 0.087890625, "learning_rate": 7.13684948309094e-07, "loss": -0.009, "reward": 1.5875247716903687, "reward_std": 0.31310462951660156, "rewards/accuracy_reward_stage2": 0.6031497716903687, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1635 }, { "completion_length": 17.546875, "epoch": 0.286665498510601, "grad_norm": 17.453595183630732, "kl": 0.028564453125, "learning_rate": 7.135097248992465e-07, "loss": -0.0294, "reward": 1.7066229581832886, "reward_std": 0.1509627103805542, "rewards/accuracy_reward_stage2": 0.7222478985786438, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1636 }, { "completion_length": 16.171875, "epoch": 0.28684072192044857, "grad_norm": 25.26047295052692, "kl": 0.345703125, "learning_rate": 7.13334501489399e-07, "loss": 0.1135, "reward": 1.2085933685302734, "reward_std": 0.22050847113132477, "rewards/accuracy_reward_stage2": 0.47421833872795105, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1637 }, { "completion_length": 10.828125, "epoch": 0.2870159453302961, "grad_norm": 29.875900138268193, "kl": 0.07861328125, "learning_rate": 7.131592780795513e-07, "loss": 0.0316, "reward": 1.5491917133331299, "reward_std": 0.2803182899951935, "rewards/accuracy_reward_stage2": 0.5491916537284851, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1638 }, { "completion_length": 25.75, "epoch": 0.28719116874014367, "grad_norm": 20.93227222442535, "kl": 0.1103515625, "learning_rate": 7.129840546697038e-07, "loss": -0.0201, "reward": 1.324310302734375, "reward_std": 0.26539939641952515, "rewards/accuracy_reward_stage2": 0.4805603623390198, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1639 }, { "completion_length": 8.5625, "epoch": 0.2873663921499912, "grad_norm": 14.537766364808704, "kl": 0.1328125, "learning_rate": 7.128088312598563e-07, "loss": -0.0141, "reward": 1.6957449913024902, "reward_std": 0.17225536704063416, "rewards/accuracy_reward_stage2": 0.7269949913024902, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1640 }, { "completion_length": 11.609375, "epoch": 0.2875416155598388, "grad_norm": 20.96258079590175, "kl": 0.056884765625, "learning_rate": 7.126336078500087e-07, "loss": -0.0858, "reward": 1.7480556964874268, "reward_std": 0.3304206430912018, "rewards/accuracy_reward_stage2": 0.7949306964874268, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1641 }, { "completion_length": 7.65625, "epoch": 0.28771683896968636, "grad_norm": 23.292493455122358, "kl": 0.0751953125, "learning_rate": 7.124583844401612e-07, "loss": 0.03, "reward": 1.7483609914779663, "reward_std": 0.19954615831375122, "rewards/accuracy_reward_stage2": 0.7483609914779663, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1642 }, { "completion_length": 26.328125, "epoch": 0.2878920623795339, "grad_norm": 17.8083297445197, "kl": 0.0439453125, "learning_rate": 7.122831610303136e-07, "loss": 0.0176, "reward": 1.6547987461090088, "reward_std": 0.17785227298736572, "rewards/accuracy_reward_stage2": 0.6547987461090088, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1643 }, { "completion_length": 12.703125, "epoch": 0.28806728578938146, "grad_norm": 30.90286646759068, "kl": 0.1396484375, "learning_rate": 7.121079376204661e-07, "loss": 0.0559, "reward": 1.5476398468017578, "reward_std": 0.23930677771568298, "rewards/accuracy_reward_stage2": 0.6726399660110474, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1644 }, { "completion_length": 8.625, "epoch": 0.288242509199229, "grad_norm": 19.56818403132419, "kl": 0.1806640625, "learning_rate": 7.119327142106185e-07, "loss": 0.0346, "reward": 1.5129913091659546, "reward_std": 0.22683054208755493, "rewards/accuracy_reward_stage2": 0.6536163091659546, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1645 }, { "completion_length": 8.5, "epoch": 0.28841773260907655, "grad_norm": 19.170340208463543, "kl": 0.212890625, "learning_rate": 7.117574908007709e-07, "loss": 0.0346, "reward": 1.6334724426269531, "reward_std": 0.17795699834823608, "rewards/accuracy_reward_stage2": 0.7897223830223083, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1646 }, { "completion_length": 9.375, "epoch": 0.28859295601892415, "grad_norm": 15.89994694611484, "kl": 0.12890625, "learning_rate": 7.115822673909234e-07, "loss": 0.0516, "reward": 1.3058521747589111, "reward_std": 0.17969033122062683, "rewards/accuracy_reward_stage2": 0.43085217475891113, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1647 }, { "completion_length": 7.84375, "epoch": 0.2887681794287717, "grad_norm": 23.01958952222859, "kl": 0.1611328125, "learning_rate": 7.114070439810759e-07, "loss": -0.0225, "reward": 1.4997072219848633, "reward_std": 0.208816260099411, "rewards/accuracy_reward_stage2": 0.5309572219848633, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1648 }, { "completion_length": 10.84375, "epoch": 0.28894340283861925, "grad_norm": 17.99364116397282, "kl": 0.0400390625, "learning_rate": 7.112318205712283e-07, "loss": 0.016, "reward": 1.4152864217758179, "reward_std": 0.16304050385951996, "rewards/accuracy_reward_stage2": 0.5402864217758179, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1649 }, { "completion_length": 10.765625, "epoch": 0.2891186262484668, "grad_norm": 22.290912720435063, "kl": 0.032958984375, "learning_rate": 7.110565971613808e-07, "loss": 0.0132, "reward": 1.2604167461395264, "reward_std": 0.20825409889221191, "rewards/accuracy_reward_stage2": 0.2604166865348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1650 }, { "completion_length": 6.515625, "epoch": 0.28929384965831434, "grad_norm": 16.86284479055235, "kl": 0.049072265625, "learning_rate": 7.108813737515331e-07, "loss": -0.0245, "reward": 1.6870609521865845, "reward_std": 0.14026181399822235, "rewards/accuracy_reward_stage2": 0.7026859521865845, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1651 }, { "completion_length": 8.046875, "epoch": 0.2894690730681619, "grad_norm": 23.0356528349917, "kl": 0.055419921875, "learning_rate": 7.107061503416856e-07, "loss": -0.0152, "reward": 1.5876586437225342, "reward_std": 0.2479107826948166, "rewards/accuracy_reward_stage2": 0.6032836437225342, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1652 }, { "completion_length": 11.5625, "epoch": 0.2896442964780095, "grad_norm": 26.79041801998364, "kl": 0.07568359375, "learning_rate": 7.105309269318381e-07, "loss": 0.0014, "reward": 1.5761586427688599, "reward_std": 0.2695969343185425, "rewards/accuracy_reward_stage2": 0.7167835831642151, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1653 }, { "completion_length": 13.171875, "epoch": 0.28981951988785704, "grad_norm": 12.232697936613171, "kl": 0.06640625, "learning_rate": 7.103557035219905e-07, "loss": 0.0266, "reward": 1.479015588760376, "reward_std": 0.10361681878566742, "rewards/accuracy_reward_stage2": 0.4790155291557312, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1654 }, { "completion_length": 10.953125, "epoch": 0.2899947432977046, "grad_norm": 18.931813636993038, "kl": 0.11767578125, "learning_rate": 7.101804801121429e-07, "loss": 0.0156, "reward": 1.6782610416412354, "reward_std": 0.2642056345939636, "rewards/accuracy_reward_stage2": 0.8188860416412354, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1655 }, { "completion_length": 19.78125, "epoch": 0.29016996670755213, "grad_norm": 29.698298899171085, "kl": 0.166015625, "learning_rate": 7.100052567022954e-07, "loss": 0.0663, "reward": 1.455777645111084, "reward_std": 0.2696455121040344, "rewards/accuracy_reward_stage2": 0.45577773451805115, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1656 }, { "completion_length": 9.390625, "epoch": 0.2903451901173997, "grad_norm": 20.362922155821412, "kl": 0.09130859375, "learning_rate": 7.098300332924478e-07, "loss": 0.0364, "reward": 1.796497106552124, "reward_std": 0.16053104400634766, "rewards/accuracy_reward_stage2": 0.7964969873428345, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1657 }, { "completion_length": 8.546875, "epoch": 0.2905204135272472, "grad_norm": 16.360212652099975, "kl": 0.0908203125, "learning_rate": 7.096548098826003e-07, "loss": 0.0363, "reward": 1.8485822677612305, "reward_std": 0.17348849773406982, "rewards/accuracy_reward_stage2": 0.8485823273658752, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1658 }, { "completion_length": 9.859375, "epoch": 0.29069563693709477, "grad_norm": 29.315179459450466, "kl": 0.044921875, "learning_rate": 7.094795864727527e-07, "loss": 0.0179, "reward": 1.810223937034607, "reward_std": 0.21215862035751343, "rewards/accuracy_reward_stage2": 0.8102238774299622, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1659 }, { "completion_length": 7.203125, "epoch": 0.2908708603469424, "grad_norm": 17.962626858687273, "kl": 0.111328125, "learning_rate": 7.093043630629052e-07, "loss": -0.0129, "reward": 1.331225872039795, "reward_std": 0.21827349066734314, "rewards/accuracy_reward_stage2": 0.48747581243515015, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1660 }, { "completion_length": 8.25, "epoch": 0.2910460837567899, "grad_norm": 16.764163271747165, "kl": 0.115234375, "learning_rate": 7.091291396530577e-07, "loss": 0.0182, "reward": 1.455185055732727, "reward_std": 0.1520254909992218, "rewards/accuracy_reward_stage2": 0.595810055732727, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1661 }, { "completion_length": 17.5, "epoch": 0.29122130716663747, "grad_norm": 19.947587748612907, "kl": 0.2578125, "learning_rate": 7.089539162432101e-07, "loss": -0.0365, "reward": 1.4499015808105469, "reward_std": 0.32634487748146057, "rewards/accuracy_reward_stage2": 0.7624015808105469, "rewards/format_reward_stage1_pointerpad": 0.6875, "scores/accuracy_reward_stage2": 0.6875, "step": 1662 }, { "completion_length": 12.78125, "epoch": 0.291396530576485, "grad_norm": 24.9274809883867, "kl": 0.40234375, "learning_rate": 7.087786928333626e-07, "loss": 0.1166, "reward": 1.428887963294983, "reward_std": 0.12139017879962921, "rewards/accuracy_reward_stage2": 0.6945129632949829, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1663 }, { "completion_length": 10.203125, "epoch": 0.29157175398633256, "grad_norm": 18.81552695245245, "kl": 0.1279296875, "learning_rate": 7.086034694235148e-07, "loss": 0.051, "reward": 1.5833933353424072, "reward_std": 0.2008472979068756, "rewards/accuracy_reward_stage2": 0.7083932161331177, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1664 }, { "completion_length": 16.46875, "epoch": 0.2917469773961801, "grad_norm": 19.36111770410725, "kl": 0.314453125, "learning_rate": 7.084282460136673e-07, "loss": 0.1258, "reward": 1.2535412311553955, "reward_std": 0.12364614009857178, "rewards/accuracy_reward_stage2": 0.5035412311553955, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1665 }, { "completion_length": 9.9375, "epoch": 0.2919222008060277, "grad_norm": 21.804938787404357, "kl": 0.140625, "learning_rate": 7.082530226038198e-07, "loss": 0.0118, "reward": 1.6458333730697632, "reward_std": 0.1695856750011444, "rewards/accuracy_reward_stage2": 0.6614583730697632, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1666 }, { "completion_length": 8.9375, "epoch": 0.29209742421587526, "grad_norm": 15.49891530709726, "kl": 0.08544921875, "learning_rate": 7.080777991939722e-07, "loss": -0.0101, "reward": 1.6766133308410645, "reward_std": 0.19475360214710236, "rewards/accuracy_reward_stage2": 0.8172383904457092, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1667 }, { "completion_length": 15.125, "epoch": 0.2922726476257228, "grad_norm": 16.128066127735444, "kl": 0.0113525390625, "learning_rate": 7.079025757841247e-07, "loss": 0.0046, "reward": 1.5276246070861816, "reward_std": 0.09870946407318115, "rewards/accuracy_reward_stage2": 0.5276245474815369, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1668 }, { "completion_length": 7.46875, "epoch": 0.29244787103557035, "grad_norm": 23.841239880152038, "kl": 0.08447265625, "learning_rate": 7.077273523742772e-07, "loss": -0.0079, "reward": 1.5087754726409912, "reward_std": 0.15740589797496796, "rewards/accuracy_reward_stage2": 0.5244004726409912, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1669 }, { "completion_length": 10.21875, "epoch": 0.2926230944454179, "grad_norm": 15.687482528874513, "kl": 0.10986328125, "learning_rate": 7.075521289644296e-07, "loss": 0.0439, "reward": 1.417892575263977, "reward_std": 0.06578870862722397, "rewards/accuracy_reward_stage2": 0.667892575263977, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1670 }, { "completion_length": 12.421875, "epoch": 0.29279831785526544, "grad_norm": 18.25270654941957, "kl": 0.12255859375, "learning_rate": 7.073769055545821e-07, "loss": 0.0489, "reward": 1.4978927373886108, "reward_std": 0.14950445294380188, "rewards/accuracy_reward_stage2": 0.6228927373886108, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1671 }, { "completion_length": 20.21875, "epoch": 0.29297354126511305, "grad_norm": 96.1894213436714, "kl": 0.5625, "learning_rate": 7.072016821447345e-07, "loss": 0.1609, "reward": 1.3571423292160034, "reward_std": 0.20367828011512756, "rewards/accuracy_reward_stage2": 0.5133923292160034, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1672 }, { "completion_length": 6.625, "epoch": 0.2931487646749606, "grad_norm": 20.343287663887203, "kl": 0.06005859375, "learning_rate": 7.07026458734887e-07, "loss": 0.0241, "reward": 1.7696726322174072, "reward_std": 0.2620747983455658, "rewards/accuracy_reward_stage2": 0.7696726322174072, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1673 }, { "completion_length": 17.421875, "epoch": 0.29332398808480814, "grad_norm": 30.667524608470384, "kl": 0.1796875, "learning_rate": 7.068512353250395e-07, "loss": 0.0493, "reward": 1.5008697509765625, "reward_std": 0.11915256083011627, "rewards/accuracy_reward_stage2": 0.6258696913719177, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1674 }, { "completion_length": 9.671875, "epoch": 0.2934992114946557, "grad_norm": 21.4828702837734, "kl": 0.07373046875, "learning_rate": 7.066760119151918e-07, "loss": 0.0293, "reward": 1.3638755083084106, "reward_std": 0.1987563669681549, "rewards/accuracy_reward_stage2": 0.36387550830841064, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1675 }, { "completion_length": 11.5625, "epoch": 0.29367443490450323, "grad_norm": 21.590194337992045, "kl": 0.13671875, "learning_rate": 7.065007885053443e-07, "loss": 0.0924, "reward": 1.173264741897583, "reward_std": 0.16812384128570557, "rewards/accuracy_reward_stage2": 0.29826462268829346, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1676 }, { "completion_length": 14.65625, "epoch": 0.2938496583143508, "grad_norm": 23.505825274296527, "kl": 0.10791015625, "learning_rate": 7.063255650954967e-07, "loss": 0.0431, "reward": 1.5011882781982422, "reward_std": 0.2176738977432251, "rewards/accuracy_reward_stage2": 0.6261882185935974, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1677 }, { "completion_length": 4.390625, "epoch": 0.29402488172419833, "grad_norm": 8.510244036546899, "kl": 0.0693359375, "learning_rate": 7.061503416856491e-07, "loss": -0.0165, "reward": 1.8720643520355225, "reward_std": 0.09669148921966553, "rewards/accuracy_reward_stage2": 0.8876894116401672, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1678 }, { "completion_length": 9.265625, "epoch": 0.29420010513404593, "grad_norm": 25.477170919588477, "kl": 0.1328125, "learning_rate": 7.059751182758016e-07, "loss": -0.0351, "reward": 1.716698169708252, "reward_std": 0.3310700058937073, "rewards/accuracy_reward_stage2": 0.7479482293128967, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1679 }, { "completion_length": 9.46875, "epoch": 0.2943753285438935, "grad_norm": 24.03928992881974, "kl": 0.0615234375, "learning_rate": 7.05799894865954e-07, "loss": 0.0247, "reward": 1.6673147678375244, "reward_std": 0.166721373796463, "rewards/accuracy_reward_stage2": 0.6673146486282349, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1680 }, { "completion_length": 7.6875, "epoch": 0.294550551953741, "grad_norm": 16.872918992455006, "kl": 0.09423828125, "learning_rate": 7.056246714561065e-07, "loss": -0.0064, "reward": 1.4720426797866821, "reward_std": 0.2335229367017746, "rewards/accuracy_reward_stage2": 0.48766764998435974, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1681 }, { "completion_length": 11.65625, "epoch": 0.29472577536358857, "grad_norm": 21.510050629437227, "kl": 0.10107421875, "learning_rate": 7.05449448046259e-07, "loss": -0.014, "reward": 1.7428269386291504, "reward_std": 0.2706993818283081, "rewards/accuracy_reward_stage2": 0.7740768194198608, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1682 }, { "completion_length": 9.0, "epoch": 0.2949009987734361, "grad_norm": 18.79166684396821, "kl": 0.060546875, "learning_rate": 7.052742246364114e-07, "loss": -0.0199, "reward": 1.8116947412490845, "reward_std": 0.17610837519168854, "rewards/accuracy_reward_stage2": 0.8273198008537292, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1683 }, { "completion_length": 23.0625, "epoch": 0.29507622218328367, "grad_norm": 16.363578535268406, "kl": 0.1708984375, "learning_rate": 7.050990012265639e-07, "loss": 0.0684, "reward": 1.6377060413360596, "reward_std": 0.08054365962743759, "rewards/accuracy_reward_stage2": 0.7627060413360596, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1684 }, { "completion_length": 15.140625, "epoch": 0.29525144559313127, "grad_norm": 15.03309143820295, "kl": 0.06396484375, "learning_rate": 7.049237778167163e-07, "loss": 0.0256, "reward": 1.1309211254119873, "reward_std": 0.11928550899028778, "rewards/accuracy_reward_stage2": 0.2559211850166321, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1685 }, { "completion_length": 10.71875, "epoch": 0.2954266690029788, "grad_norm": 12.656645795151226, "kl": 0.0308837890625, "learning_rate": 7.047485544068687e-07, "loss": 0.0123, "reward": 1.7976956367492676, "reward_std": 0.04943205416202545, "rewards/accuracy_reward_stage2": 0.7976956367492676, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1686 }, { "completion_length": 11.734375, "epoch": 0.29560189241282636, "grad_norm": 24.12905263102765, "kl": 0.12109375, "learning_rate": 7.045733309970212e-07, "loss": -0.0167, "reward": 1.6340982913970947, "reward_std": 0.2528322637081146, "rewards/accuracy_reward_stage2": 0.6653482913970947, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1687 }, { "completion_length": 8.953125, "epoch": 0.2957771158226739, "grad_norm": 19.304379581875406, "kl": 0.1611328125, "learning_rate": 7.043981075871736e-07, "loss": -0.013, "reward": 1.532942295074463, "reward_std": 0.21373668313026428, "rewards/accuracy_reward_stage2": 0.6891922950744629, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1688 }, { "completion_length": 25.125, "epoch": 0.29595233923252146, "grad_norm": 20.032788690640114, "kl": 0.06689453125, "learning_rate": 7.04222884177326e-07, "loss": 0.0268, "reward": 1.6118648052215576, "reward_std": 0.14255890250205994, "rewards/accuracy_reward_stage2": 0.6118648052215576, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1689 }, { "completion_length": 9.515625, "epoch": 0.296127562642369, "grad_norm": 23.07272201883303, "kl": 0.1396484375, "learning_rate": 7.040476607674785e-07, "loss": 0.0285, "reward": 1.4711434841156006, "reward_std": 0.2035903036594391, "rewards/accuracy_reward_stage2": 0.4867684245109558, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1690 }, { "completion_length": 22.375, "epoch": 0.29630278605221655, "grad_norm": 91.94085271560532, "kl": 0.84765625, "learning_rate": 7.038724373576309e-07, "loss": 0.3547, "reward": 1.5097134113311768, "reward_std": 0.231087327003479, "rewards/accuracy_reward_stage2": 0.7597134709358215, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1691 }, { "completion_length": 7.796875, "epoch": 0.29647800946206415, "grad_norm": 19.91389494985646, "kl": 0.1533203125, "learning_rate": 7.036972139477834e-07, "loss": -0.0047, "reward": 1.438263177871704, "reward_std": 0.2388438731431961, "rewards/accuracy_reward_stage2": 0.5945132374763489, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1692 }, { "completion_length": 12.734375, "epoch": 0.2966532328719117, "grad_norm": 18.86150756027041, "kl": 0.064453125, "learning_rate": 7.035219905379359e-07, "loss": 0.0259, "reward": 1.7015407085418701, "reward_std": 0.21713702380657196, "rewards/accuracy_reward_stage2": 0.8265406489372253, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1693 }, { "completion_length": 12.46875, "epoch": 0.29682845628175925, "grad_norm": 8.4688183375784, "kl": 0.2431640625, "learning_rate": 7.033467671280882e-07, "loss": 0.053, "reward": 1.265625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward_stage2": 0.40625, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1694 }, { "completion_length": 17.296875, "epoch": 0.2970036796916068, "grad_norm": 22.47149168439553, "kl": 0.34375, "learning_rate": 7.031715437182407e-07, "loss": 0.1372, "reward": 1.2965278625488281, "reward_std": 0.11467509716749191, "rewards/accuracy_reward_stage2": 0.5465278625488281, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1695 }, { "completion_length": 7.0625, "epoch": 0.29717890310145434, "grad_norm": 21.669890151890225, "kl": 0.13671875, "learning_rate": 7.029963203083931e-07, "loss": 0.0549, "reward": 1.6818768978118896, "reward_std": 0.14322030544281006, "rewards/accuracy_reward_stage2": 0.6818768382072449, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1696 }, { "completion_length": 8.3125, "epoch": 0.2973541265113019, "grad_norm": 14.261060175777834, "kl": 0.0279541015625, "learning_rate": 7.028210968985456e-07, "loss": -0.0217, "reward": 1.7881548404693604, "reward_std": 0.16872760653495789, "rewards/accuracy_reward_stage2": 0.8037798404693604, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1697 }, { "completion_length": 9.59375, "epoch": 0.2975293499211495, "grad_norm": 17.725080767100014, "kl": 0.072265625, "learning_rate": 7.026458734886981e-07, "loss": -0.0152, "reward": 1.4075841903686523, "reward_std": 0.1767999678850174, "rewards/accuracy_reward_stage2": 0.5482091903686523, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1698 }, { "completion_length": 12.6875, "epoch": 0.29770457333099704, "grad_norm": 23.378929824865686, "kl": 0.26953125, "learning_rate": 7.024706500788505e-07, "loss": -0.0099, "reward": 1.4340919256210327, "reward_std": 0.3488747477531433, "rewards/accuracy_reward_stage2": 0.6059669256210327, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1699 }, { "completion_length": 12.5, "epoch": 0.2978797967408446, "grad_norm": 24.027023608537906, "kl": 0.126953125, "learning_rate": 7.02295426669003e-07, "loss": -0.0246, "reward": 1.2346971035003662, "reward_std": 0.4185022711753845, "rewards/accuracy_reward_stage2": 0.390947163105011, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1700 }, { "completion_length": 12.75, "epoch": 0.29805502015069213, "grad_norm": 21.65595446347219, "kl": 0.125, "learning_rate": 7.021202032591555e-07, "loss": -0.0053, "reward": 1.5966134071350098, "reward_std": 0.3082408905029297, "rewards/accuracy_reward_stage2": 0.6278634667396545, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1701 }, { "completion_length": 11.078125, "epoch": 0.2982302435605397, "grad_norm": 23.075324456091533, "kl": 0.125, "learning_rate": 7.019449798493078e-07, "loss": 0.05, "reward": 1.47487473487854, "reward_std": 0.21936720609664917, "rewards/accuracy_reward_stage2": 0.5998746752738953, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1702 }, { "completion_length": 12.8125, "epoch": 0.2984054669703872, "grad_norm": 23.632321001409643, "kl": 0.291015625, "learning_rate": 7.017697564394603e-07, "loss": 0.1002, "reward": 1.6295795440673828, "reward_std": 0.34378618001937866, "rewards/accuracy_reward_stage2": 0.7545795440673828, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1703 }, { "completion_length": 12.359375, "epoch": 0.2985806903802348, "grad_norm": 26.175977719061464, "kl": 0.369140625, "learning_rate": 7.015945330296126e-07, "loss": 0.126, "reward": 1.4780054092407227, "reward_std": 0.2588563561439514, "rewards/accuracy_reward_stage2": 0.6186305284500122, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1704 }, { "completion_length": 10.3125, "epoch": 0.2987559137900824, "grad_norm": 22.18689402119106, "kl": 0.15234375, "learning_rate": 7.014193096197651e-07, "loss": -0.0272, "reward": 1.6308624744415283, "reward_std": 0.24226665496826172, "rewards/accuracy_reward_stage2": 0.6621125340461731, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1705 }, { "completion_length": 8.71875, "epoch": 0.2989311371999299, "grad_norm": 12.44901374803875, "kl": 0.0634765625, "learning_rate": 7.012440862099176e-07, "loss": -0.0629, "reward": 1.558675765991211, "reward_std": 0.15706884860992432, "rewards/accuracy_reward_stage2": 0.7149257063865662, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1706 }, { "completion_length": 8.9375, "epoch": 0.29910636060977747, "grad_norm": 18.19321123021985, "kl": 0.14453125, "learning_rate": 7.0106886280007e-07, "loss": 0.0263, "reward": 1.467024326324463, "reward_std": 0.23365125060081482, "rewards/accuracy_reward_stage2": 0.48264938592910767, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1707 }, { "completion_length": 9.96875, "epoch": 0.299281584019625, "grad_norm": 23.959214635034463, "kl": 0.138671875, "learning_rate": 7.008936393902225e-07, "loss": 0.0553, "reward": 1.6607258319854736, "reward_std": 0.3005909323692322, "rewards/accuracy_reward_stage2": 0.6607259511947632, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1708 }, { "completion_length": 12.984375, "epoch": 0.29945680742947256, "grad_norm": 23.925526061688014, "kl": 0.07958984375, "learning_rate": 7.00718415980375e-07, "loss": 0.0318, "reward": 1.6628730297088623, "reward_std": 0.16425105929374695, "rewards/accuracy_reward_stage2": 0.6628729701042175, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1709 }, { "completion_length": 29.578125, "epoch": 0.2996320308393201, "grad_norm": 25.19467130350549, "kl": 0.19140625, "learning_rate": 7.005431925705274e-07, "loss": 0.0827, "reward": 1.296067714691162, "reward_std": 0.17308276891708374, "rewards/accuracy_reward_stage2": 0.4366927146911621, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1710 }, { "completion_length": 7.5, "epoch": 0.2998072542491677, "grad_norm": 19.493516475996053, "kl": 0.1123046875, "learning_rate": 7.003679691606799e-07, "loss": 0.0007, "reward": 1.6939597129821777, "reward_std": 0.2620150148868561, "rewards/accuracy_reward_stage2": 0.8345847129821777, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1711 }, { "completion_length": 9.25, "epoch": 0.29998247765901526, "grad_norm": 19.677259863326938, "kl": 0.04052734375, "learning_rate": 7.001927457508323e-07, "loss": -0.028, "reward": 1.6832020282745361, "reward_std": 0.19173334538936615, "rewards/accuracy_reward_stage2": 0.6988270282745361, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1712 }, { "completion_length": 10.46875, "epoch": 0.3001577010688628, "grad_norm": 21.43673387663666, "kl": 0.1669921875, "learning_rate": 7.000175223409848e-07, "loss": -0.0295, "reward": 1.6994589567184448, "reward_std": 0.2502681314945221, "rewards/accuracy_reward_stage2": 0.7463339567184448, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1713 }, { "completion_length": 10.484375, "epoch": 0.30033292447871035, "grad_norm": 19.202934091736996, "kl": 0.1337890625, "learning_rate": 6.998422989311373e-07, "loss": 0.0535, "reward": 1.3423008918762207, "reward_std": 0.27192068099975586, "rewards/accuracy_reward_stage2": 0.3423008322715759, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1714 }, { "completion_length": 18.484375, "epoch": 0.3005081478885579, "grad_norm": 21.518890464068733, "kl": 0.06298828125, "learning_rate": 6.996670755212895e-07, "loss": 0.0253, "reward": 1.4944093227386475, "reward_std": 0.17470549046993256, "rewards/accuracy_reward_stage2": 0.49440932273864746, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1715 }, { "completion_length": 17.796875, "epoch": 0.30068337129840544, "grad_norm": 48.596822529684296, "kl": 0.482421875, "learning_rate": 6.99491852111442e-07, "loss": 0.0966, "reward": 1.3591495752334595, "reward_std": 0.17734628915786743, "rewards/accuracy_reward_stage2": 0.5310245752334595, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1716 }, { "completion_length": 16.953125, "epoch": 0.30085859470825305, "grad_norm": 23.366423040036004, "kl": 0.040283203125, "learning_rate": 6.993166287015945e-07, "loss": -0.007, "reward": 1.3081918954849243, "reward_std": 0.19678080081939697, "rewards/accuracy_reward_stage2": 0.3238169252872467, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1717 }, { "completion_length": 15.875, "epoch": 0.3010338181181006, "grad_norm": 15.664885734367475, "kl": 0.057861328125, "learning_rate": 6.991414052917469e-07, "loss": -0.0651, "reward": 1.781704306602478, "reward_std": 0.19567933678627014, "rewards/accuracy_reward_stage2": 0.8129542469978333, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1718 }, { "completion_length": 7.90625, "epoch": 0.30120904152794814, "grad_norm": 18.809252110481758, "kl": 0.0830078125, "learning_rate": 6.989661818818994e-07, "loss": 0.012, "reward": 1.4171596765518188, "reward_std": 0.21570885181427002, "rewards/accuracy_reward_stage2": 0.6827847957611084, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1719 }, { "completion_length": 9.4375, "epoch": 0.3013842649377957, "grad_norm": 22.51829370190563, "kl": 0.052978515625, "learning_rate": 6.987909584720518e-07, "loss": 0.0212, "reward": 1.703669548034668, "reward_std": 0.2621934711933136, "rewards/accuracy_reward_stage2": 0.7036696076393127, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1720 }, { "completion_length": 8.71875, "epoch": 0.30155948834764323, "grad_norm": 21.091726299660664, "kl": 0.052734375, "learning_rate": 6.986157350622043e-07, "loss": 0.0211, "reward": 1.749462604522705, "reward_std": 0.1858448088169098, "rewards/accuracy_reward_stage2": 0.7494626045227051, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1721 }, { "completion_length": 8.734375, "epoch": 0.3017347117574908, "grad_norm": 25.56784738107091, "kl": 0.1962890625, "learning_rate": 6.984405116523568e-07, "loss": -0.065, "reward": 1.4051895141601562, "reward_std": 0.35617536306381226, "rewards/accuracy_reward_stage2": 0.4676896333694458, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 1722 }, { "completion_length": 22.90625, "epoch": 0.3019099351673384, "grad_norm": 24.982174793589298, "kl": 0.12255859375, "learning_rate": 6.982652882425092e-07, "loss": -0.0369, "reward": 1.7698495388031006, "reward_std": 0.23711535334587097, "rewards/accuracy_reward_stage2": 0.8010995984077454, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1723 }, { "completion_length": 9.96875, "epoch": 0.30208515857718593, "grad_norm": 21.534866823854347, "kl": 0.0673828125, "learning_rate": 6.980900648326617e-07, "loss": -0.0121, "reward": 1.7227420806884766, "reward_std": 0.24864692986011505, "rewards/accuracy_reward_stage2": 0.8633670806884766, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1724 }, { "completion_length": 9.0625, "epoch": 0.3022603819870335, "grad_norm": 15.481663429370426, "kl": 0.169921875, "learning_rate": 6.979148414228141e-07, "loss": -0.0576, "reward": 1.8736064434051514, "reward_std": 0.20491771399974823, "rewards/accuracy_reward_stage2": 0.9204814434051514, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1725 }, { "completion_length": 12.21875, "epoch": 0.302435605396881, "grad_norm": 22.575733117566205, "kl": 0.1474609375, "learning_rate": 6.977396180129665e-07, "loss": -0.0086, "reward": 1.4143965244293213, "reward_std": 0.29561150074005127, "rewards/accuracy_reward_stage2": 0.44564658403396606, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1726 }, { "completion_length": 7.46875, "epoch": 0.30261082880672857, "grad_norm": 17.265000582793025, "kl": 0.068359375, "learning_rate": 6.97564394603119e-07, "loss": -0.017, "reward": 1.6568676233291626, "reward_std": 0.2403629571199417, "rewards/accuracy_reward_stage2": 0.6724926233291626, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1727 }, { "completion_length": 10.421875, "epoch": 0.3027860522165761, "grad_norm": 13.370376075249549, "kl": 0.0966796875, "learning_rate": 6.973891711932713e-07, "loss": 0.0387, "reward": 1.4338374137878418, "reward_std": 0.12409239262342453, "rewards/accuracy_reward_stage2": 0.5588374733924866, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1728 }, { "completion_length": 10.03125, "epoch": 0.30296127562642367, "grad_norm": 16.232930281897875, "kl": 0.1044921875, "learning_rate": 6.972139477834238e-07, "loss": -0.0017, "reward": 1.6806886196136475, "reward_std": 0.19015967845916748, "rewards/accuracy_reward_stage2": 0.6963136792182922, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1729 }, { "completion_length": 8.046875, "epoch": 0.30313649903627127, "grad_norm": 17.384646746654873, "kl": 0.146484375, "learning_rate": 6.970387243735763e-07, "loss": 0.0146, "reward": 1.5590732097625732, "reward_std": 0.2061610072851181, "rewards/accuracy_reward_stage2": 0.5746980905532837, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1730 }, { "completion_length": 11.1875, "epoch": 0.3033117224461188, "grad_norm": 25.681642519445006, "kl": 0.07275390625, "learning_rate": 6.968635009637287e-07, "loss": -0.0114, "reward": 1.6075778007507324, "reward_std": 0.30036935210227966, "rewards/accuracy_reward_stage2": 0.6232027411460876, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1731 }, { "completion_length": 12.234375, "epoch": 0.30348694585596636, "grad_norm": 15.472091349424671, "kl": 0.11669921875, "learning_rate": 6.966882775538812e-07, "loss": 0.0465, "reward": 1.7527433633804321, "reward_std": 0.10957083106040955, "rewards/accuracy_reward_stage2": 0.8777433633804321, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1732 }, { "completion_length": 7.1875, "epoch": 0.3036621692658139, "grad_norm": 12.604748146353854, "kl": 0.08056640625, "learning_rate": 6.965130541440337e-07, "loss": 0.0047, "reward": 1.7152559757232666, "reward_std": 0.05833900347352028, "rewards/accuracy_reward_stage2": 0.7308810949325562, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1733 }, { "completion_length": 9.78125, "epoch": 0.30383739267566146, "grad_norm": 23.64900274193855, "kl": 0.1513671875, "learning_rate": 6.96337830734186e-07, "loss": 0.0301, "reward": 1.58614182472229, "reward_std": 0.31377267837524414, "rewards/accuracy_reward_stage2": 0.60176682472229, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1734 }, { "completion_length": 11.09375, "epoch": 0.304012616085509, "grad_norm": 14.200156242246816, "kl": 0.07470703125, "learning_rate": 6.961626073243385e-07, "loss": -0.0142, "reward": 1.3241374492645264, "reward_std": 0.0708392933011055, "rewards/accuracy_reward_stage2": 0.46476244926452637, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1735 }, { "completion_length": 8.46875, "epoch": 0.3041878394953566, "grad_norm": 18.37673948165755, "kl": 0.0927734375, "learning_rate": 6.959873839144909e-07, "loss": 0.0371, "reward": 1.5182172060012817, "reward_std": 0.2147284299135208, "rewards/accuracy_reward_stage2": 0.5182172060012817, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1736 }, { "completion_length": 8.9375, "epoch": 0.30436306290520415, "grad_norm": 53.08502332734311, "kl": 0.357421875, "learning_rate": 6.958121605046434e-07, "loss": 0.119, "reward": 1.6498416662216187, "reward_std": 0.2479400783777237, "rewards/accuracy_reward_stage2": 0.7904666066169739, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1737 }, { "completion_length": 9.625, "epoch": 0.3045382863150517, "grad_norm": 21.390723516615594, "kl": 0.15625, "learning_rate": 6.956369370947959e-07, "loss": 0.0043, "reward": 1.3170366287231445, "reward_std": 0.2997969686985016, "rewards/accuracy_reward_stage2": 0.4732866883277893, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1738 }, { "completion_length": 8.359375, "epoch": 0.30471350972489925, "grad_norm": 20.05931961782236, "kl": 0.1337890625, "learning_rate": 6.954617136849483e-07, "loss": 0.0094, "reward": 1.5125038623809814, "reward_std": 0.16261497139930725, "rewards/accuracy_reward_stage2": 0.6531288623809814, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1739 }, { "completion_length": 19.984375, "epoch": 0.3048887331347468, "grad_norm": 20.79173114096201, "kl": 0.287109375, "learning_rate": 6.952864902751007e-07, "loss": 0.0703, "reward": 1.5382153987884521, "reward_std": 0.21911008656024933, "rewards/accuracy_reward_stage2": 0.6788403391838074, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1740 }, { "completion_length": 10.828125, "epoch": 0.30506395654459434, "grad_norm": 17.715087000545765, "kl": 0.1396484375, "learning_rate": 6.951112668652531e-07, "loss": 0.0119, "reward": 1.2132325172424316, "reward_std": 0.2066960632801056, "rewards/accuracy_reward_stage2": 0.35385745763778687, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1741 }, { "completion_length": 13.984375, "epoch": 0.3052391799544419, "grad_norm": 30.009838675491842, "kl": 0.1513671875, "learning_rate": 6.949360434554056e-07, "loss": -0.0027, "reward": 1.3477928638458252, "reward_std": 0.23385745286941528, "rewards/accuracy_reward_stage2": 0.3790428638458252, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1742 }, { "completion_length": 9.71875, "epoch": 0.3054144033642895, "grad_norm": 22.287655232885186, "kl": 0.1650390625, "learning_rate": 6.947608200455581e-07, "loss": -0.0358, "reward": 1.2880135774612427, "reward_std": 0.39539164304733276, "rewards/accuracy_reward_stage2": 0.33488860726356506, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1743 }, { "completion_length": 9.953125, "epoch": 0.30558962677413704, "grad_norm": 20.939609903793155, "kl": 0.08251953125, "learning_rate": 6.945855966357104e-07, "loss": 0.0329, "reward": 1.4582154750823975, "reward_std": 0.24663110077381134, "rewards/accuracy_reward_stage2": 0.45821553468704224, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1744 }, { "completion_length": 14.140625, "epoch": 0.3057648501839846, "grad_norm": 30.250789103938825, "kl": 0.5390625, "learning_rate": 6.944103732258629e-07, "loss": 0.1747, "reward": 1.0708760023117065, "reward_std": 0.27258533239364624, "rewards/accuracy_reward_stage2": 0.5865009427070618, "rewards/format_reward_stage1_pointerpad": 0.484375, "scores/accuracy_reward_stage2": 0.484375, "step": 1745 }, { "completion_length": 9.21875, "epoch": 0.30594007359383213, "grad_norm": 21.60245725560257, "kl": 0.0966796875, "learning_rate": 6.942351498160154e-07, "loss": 0.0066, "reward": 1.5730595588684082, "reward_std": 0.2877153158187866, "rewards/accuracy_reward_stage2": 0.5886844992637634, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1746 }, { "completion_length": 12.8125, "epoch": 0.3061152970036797, "grad_norm": 23.293060975908556, "kl": 0.103515625, "learning_rate": 6.940599264061678e-07, "loss": 0.0415, "reward": 1.6763148307800293, "reward_std": 0.24937281012535095, "rewards/accuracy_reward_stage2": 0.6763148307800293, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1747 }, { "completion_length": 9.625, "epoch": 0.3062905204135272, "grad_norm": 20.31014330344514, "kl": 0.06787109375, "learning_rate": 6.938847029963203e-07, "loss": -0.0171, "reward": 1.2781528234481812, "reward_std": 0.18712200224399567, "rewards/accuracy_reward_stage2": 0.41877782344818115, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1748 }, { "completion_length": 12.0625, "epoch": 0.3064657438233748, "grad_norm": 21.356448724215582, "kl": 0.07470703125, "learning_rate": 6.937094795864727e-07, "loss": 0.0298, "reward": 1.7344744205474854, "reward_std": 0.22010810673236847, "rewards/accuracy_reward_stage2": 0.7344744801521301, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1749 }, { "completion_length": 6.34375, "epoch": 0.3066409672332224, "grad_norm": 16.5876889581935, "kl": 0.0654296875, "learning_rate": 6.935342561766252e-07, "loss": -0.0179, "reward": 1.6628472805023193, "reward_std": 0.19510656595230103, "rewards/accuracy_reward_stage2": 0.6784722805023193, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1750 }, { "completion_length": 11.359375, "epoch": 0.3068161906430699, "grad_norm": 19.02986286446497, "kl": 0.0191650390625, "learning_rate": 6.933590327667777e-07, "loss": 0.0076, "reward": 1.5158599615097046, "reward_std": 0.11624392867088318, "rewards/accuracy_reward_stage2": 0.5158599615097046, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1751 }, { "completion_length": 7.796875, "epoch": 0.30699141405291747, "grad_norm": 19.201117488136774, "kl": 0.1806640625, "learning_rate": 6.931838093569301e-07, "loss": 0.0173, "reward": 1.5672911405563354, "reward_std": 0.26074376702308655, "rewards/accuracy_reward_stage2": 0.5985411405563354, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1752 }, { "completion_length": 11.828125, "epoch": 0.307166637462765, "grad_norm": 27.713417077776867, "kl": 0.06787109375, "learning_rate": 6.930085859470825e-07, "loss": -0.017, "reward": 1.5677083730697632, "reward_std": 0.3722786009311676, "rewards/accuracy_reward_stage2": 0.5833333730697632, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1753 }, { "completion_length": 9.328125, "epoch": 0.30734186087261256, "grad_norm": 17.40100673490896, "kl": 0.05126953125, "learning_rate": 6.928333625372349e-07, "loss": 0.0205, "reward": 1.8788065910339355, "reward_std": 0.13032446801662445, "rewards/accuracy_reward_stage2": 0.8788067102432251, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1754 }, { "completion_length": 10.015625, "epoch": 0.30751708428246016, "grad_norm": 19.086859227578596, "kl": 0.06005859375, "learning_rate": 6.926581391273873e-07, "loss": 0.0241, "reward": 1.5123202800750732, "reward_std": 0.11857327073812485, "rewards/accuracy_reward_stage2": 0.6373202800750732, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1755 }, { "completion_length": 8.90625, "epoch": 0.3076923076923077, "grad_norm": 17.686528099809827, "kl": 0.0240478515625, "learning_rate": 6.924829157175398e-07, "loss": 0.0096, "reward": 1.5551719665527344, "reward_std": 0.18451553583145142, "rewards/accuracy_reward_stage2": 0.5551718473434448, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1756 }, { "completion_length": 11.96875, "epoch": 0.30786753110215526, "grad_norm": 19.352114644863942, "kl": 0.0966796875, "learning_rate": 6.923076923076922e-07, "loss": 0.0385, "reward": 1.438650131225586, "reward_std": 0.24392160773277283, "rewards/accuracy_reward_stage2": 0.5636501312255859, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1757 }, { "completion_length": 8.859375, "epoch": 0.3080427545120028, "grad_norm": 27.21748250589423, "kl": 0.1591796875, "learning_rate": 6.921324688978447e-07, "loss": 0.0637, "reward": 1.498934030532837, "reward_std": 0.22403478622436523, "rewards/accuracy_reward_stage2": 0.49893397092819214, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1758 }, { "completion_length": 9.140625, "epoch": 0.30821797792185035, "grad_norm": 16.378049138983172, "kl": 0.01336669921875, "learning_rate": 6.919572454879972e-07, "loss": 0.0053, "reward": 1.7135417461395264, "reward_std": 0.16204531490802765, "rewards/accuracy_reward_stage2": 0.7135416865348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1759 }, { "completion_length": 8.734375, "epoch": 0.3083932013316979, "grad_norm": 24.21610767057917, "kl": 0.2578125, "learning_rate": 6.917820220781496e-07, "loss": 0.0634, "reward": 1.5016014575958252, "reward_std": 0.2916114926338196, "rewards/accuracy_reward_stage2": 0.5172264575958252, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1760 }, { "completion_length": 15.46875, "epoch": 0.30856842474154544, "grad_norm": 17.971966916719687, "kl": 0.05029296875, "learning_rate": 6.916067986683021e-07, "loss": 0.0202, "reward": 1.7312650680541992, "reward_std": 0.16479554772377014, "rewards/accuracy_reward_stage2": 0.731265127658844, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1761 }, { "completion_length": 12.296875, "epoch": 0.30874364815139305, "grad_norm": 13.864875853871668, "kl": 0.03564453125, "learning_rate": 6.914315752584546e-07, "loss": 0.0142, "reward": 1.4874417781829834, "reward_std": 0.09884752333164215, "rewards/accuracy_reward_stage2": 0.4874417185783386, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1762 }, { "completion_length": 14.15625, "epoch": 0.3089188715612406, "grad_norm": 45.83784041709486, "kl": 0.302734375, "learning_rate": 6.91256351848607e-07, "loss": 0.1213, "reward": 1.5485448837280273, "reward_std": 0.26276490092277527, "rewards/accuracy_reward_stage2": 0.6735448837280273, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1763 }, { "completion_length": 10.3125, "epoch": 0.30909409497108814, "grad_norm": 14.837251793084045, "kl": 0.04443359375, "learning_rate": 6.910811284387594e-07, "loss": 0.0178, "reward": 1.5204423666000366, "reward_std": 0.1353163868188858, "rewards/accuracy_reward_stage2": 0.5204423666000366, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1764 }, { "completion_length": 15.171875, "epoch": 0.3092693183809357, "grad_norm": 11.342377410991487, "kl": 0.019287109375, "learning_rate": 6.909059050289118e-07, "loss": 0.0077, "reward": 1.2142300605773926, "reward_std": 0.031139397993683815, "rewards/accuracy_reward_stage2": 0.5892300009727478, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 1765 }, { "completion_length": 9.84375, "epoch": 0.30944454179078323, "grad_norm": 16.764132909546117, "kl": 0.09765625, "learning_rate": 6.907306816190642e-07, "loss": 0.0391, "reward": 1.5185444355010986, "reward_std": 0.09002818167209625, "rewards/accuracy_reward_stage2": 0.6435444355010986, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1766 }, { "completion_length": 11.25, "epoch": 0.3096197652006308, "grad_norm": 23.46573423877756, "kl": 0.04052734375, "learning_rate": 6.905554582092167e-07, "loss": 0.0162, "reward": 1.546875, "reward_std": 0.20872823894023895, "rewards/accuracy_reward_stage2": 0.546875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1767 }, { "completion_length": 9.734375, "epoch": 0.3097949886104784, "grad_norm": 22.381252833486272, "kl": 0.08251953125, "learning_rate": 6.903802347993691e-07, "loss": 0.033, "reward": 1.6614583730697632, "reward_std": 0.30925020575523376, "rewards/accuracy_reward_stage2": 0.6614583134651184, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1768 }, { "completion_length": 11.6875, "epoch": 0.30997021202032593, "grad_norm": 20.636808608361164, "kl": 0.126953125, "learning_rate": 6.902050113895216e-07, "loss": 0.0063, "reward": 1.4054598808288574, "reward_std": 0.2945351302623749, "rewards/accuracy_reward_stage2": 0.4210848808288574, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1769 }, { "completion_length": 7.5, "epoch": 0.3101454354301735, "grad_norm": 15.579460620975455, "kl": 0.039306640625, "learning_rate": 6.900297879796741e-07, "loss": 0.0157, "reward": 1.7460291385650635, "reward_std": 0.13293343782424927, "rewards/accuracy_reward_stage2": 0.7460291385650635, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1770 }, { "completion_length": 7.578125, "epoch": 0.310320658840021, "grad_norm": 17.058832854839068, "kl": 0.044677734375, "learning_rate": 6.898545645698265e-07, "loss": 0.0178, "reward": 1.8352556228637695, "reward_std": 0.16354835033416748, "rewards/accuracy_reward_stage2": 0.83525550365448, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1771 }, { "completion_length": 14.390625, "epoch": 0.31049588224986857, "grad_norm": 17.598162532968843, "kl": 0.08203125, "learning_rate": 6.89679341159979e-07, "loss": -0.0556, "reward": 1.3854167461395264, "reward_std": 0.16781337559223175, "rewards/accuracy_reward_stage2": 0.4166666865348816, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1772 }, { "completion_length": 23.28125, "epoch": 0.3106711056597161, "grad_norm": 24.96201886620127, "kl": 0.482421875, "learning_rate": 6.895041177501314e-07, "loss": 0.1934, "reward": 1.4418706893920898, "reward_std": 0.10315918177366257, "rewards/accuracy_reward_stage2": 0.6918706297874451, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1773 }, { "completion_length": 13.234375, "epoch": 0.3108463290695637, "grad_norm": 26.336116245880884, "kl": 0.146484375, "learning_rate": 6.893288943402838e-07, "loss": 0.0587, "reward": 1.6287994384765625, "reward_std": 0.2064562737941742, "rewards/accuracy_reward_stage2": 0.6287994384765625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1774 }, { "completion_length": 14.375, "epoch": 0.31102155247941127, "grad_norm": 17.15853837132984, "kl": 0.037109375, "learning_rate": 6.891536709304363e-07, "loss": -0.0293, "reward": 1.4953597784042358, "reward_std": 0.26825428009033203, "rewards/accuracy_reward_stage2": 0.5109847784042358, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1775 }, { "completion_length": 11.6875, "epoch": 0.3111967758892588, "grad_norm": 19.782067804864628, "kl": 0.1259765625, "learning_rate": 6.889784475205887e-07, "loss": -0.032, "reward": 1.452484130859375, "reward_std": 0.21816937625408173, "rewards/accuracy_reward_stage2": 0.608734130859375, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1776 }, { "completion_length": 14.125, "epoch": 0.31137199929910636, "grad_norm": 28.4017838573517, "kl": 0.134765625, "learning_rate": 6.888032241107412e-07, "loss": 0.0537, "reward": 1.4646635055541992, "reward_std": 0.1783100664615631, "rewards/accuracy_reward_stage2": 0.7146634459495544, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1777 }, { "completion_length": 17.140625, "epoch": 0.3115472227089539, "grad_norm": 22.533784478535114, "kl": 0.12255859375, "learning_rate": 6.886280007008937e-07, "loss": 0.0175, "reward": 1.6235495805740356, "reward_std": 0.32233843207359314, "rewards/accuracy_reward_stage2": 0.6391745805740356, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1778 }, { "completion_length": 11.015625, "epoch": 0.31172244611880146, "grad_norm": 15.745599310481818, "kl": 0.0169677734375, "learning_rate": 6.88452777291046e-07, "loss": 0.0068, "reward": 1.8678183555603027, "reward_std": 0.1711842119693756, "rewards/accuracy_reward_stage2": 0.867818295955658, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1779 }, { "completion_length": 22.296875, "epoch": 0.311897669528649, "grad_norm": 20.326031505125602, "kl": 0.0654296875, "learning_rate": 6.882775538811985e-07, "loss": 0.0261, "reward": 1.4402329921722412, "reward_std": 0.2539198100566864, "rewards/accuracy_reward_stage2": 0.440233051776886, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1780 }, { "completion_length": 42.359375, "epoch": 0.3120728929384966, "grad_norm": 21.174740109176046, "kl": 0.052978515625, "learning_rate": 6.881023304713509e-07, "loss": 0.0213, "reward": 1.5869065523147583, "reward_std": 0.10317087918519974, "rewards/accuracy_reward_stage2": 0.5869066119194031, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1781 }, { "completion_length": 11.90625, "epoch": 0.31224811634834415, "grad_norm": 18.904937260082143, "kl": 0.04345703125, "learning_rate": 6.879271070615034e-07, "loss": -0.0257, "reward": 1.6459707021713257, "reward_std": 0.18680721521377563, "rewards/accuracy_reward_stage2": 0.6615957021713257, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1782 }, { "completion_length": 8.046875, "epoch": 0.3124233397581917, "grad_norm": 25.21570094160528, "kl": 0.1044921875, "learning_rate": 6.877518836516559e-07, "loss": 0.0203, "reward": 1.656537413597107, "reward_std": 0.3188740313053131, "rewards/accuracy_reward_stage2": 0.6721623539924622, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1783 }, { "completion_length": 18.328125, "epoch": 0.31259856316803925, "grad_norm": 12.622433851895309, "kl": 0.010498046875, "learning_rate": 6.875766602418082e-07, "loss": 0.0042, "reward": 1.5705180168151855, "reward_std": 0.11025116592645645, "rewards/accuracy_reward_stage2": 0.570517897605896, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1784 }, { "completion_length": 11.21875, "epoch": 0.3127737865778868, "grad_norm": 19.055355689325225, "kl": 0.053955078125, "learning_rate": 6.874014368319607e-07, "loss": 0.0215, "reward": 1.5969265699386597, "reward_std": 0.23442596197128296, "rewards/accuracy_reward_stage2": 0.7219265699386597, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1785 }, { "completion_length": 15.578125, "epoch": 0.31294900998773434, "grad_norm": 19.587873470697733, "kl": 0.08349609375, "learning_rate": 6.872262134221132e-07, "loss": 0.0335, "reward": 1.5746169090270996, "reward_std": 0.141147643327713, "rewards/accuracy_reward_stage2": 0.5746169090270996, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1786 }, { "completion_length": 9.34375, "epoch": 0.31312423339758194, "grad_norm": 14.958775392170649, "kl": 0.087890625, "learning_rate": 6.870509900122656e-07, "loss": 0.0351, "reward": 1.5703849792480469, "reward_std": 0.11295461654663086, "rewards/accuracy_reward_stage2": 0.5703849792480469, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1787 }, { "completion_length": 13.03125, "epoch": 0.3132994568074295, "grad_norm": 22.341246226878546, "kl": 0.099609375, "learning_rate": 6.868757666024181e-07, "loss": 0.0527, "reward": 1.4932540655136108, "reward_std": 0.2555721402168274, "rewards/accuracy_reward_stage2": 0.6182540655136108, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1788 }, { "completion_length": 15.65625, "epoch": 0.31347468021727704, "grad_norm": 17.995892472886116, "kl": 0.181640625, "learning_rate": 6.867005431925705e-07, "loss": 0.0725, "reward": 1.3227570056915283, "reward_std": 0.10920379310846329, "rewards/accuracy_reward_stage2": 0.4477570056915283, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1789 }, { "completion_length": 12.96875, "epoch": 0.3136499036271246, "grad_norm": 24.844965959313072, "kl": 0.055908203125, "learning_rate": 6.86525319782723e-07, "loss": -0.0218, "reward": 1.794481635093689, "reward_std": 0.24829509854316711, "rewards/accuracy_reward_stage2": 0.8101067543029785, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1790 }, { "completion_length": 12.953125, "epoch": 0.31382512703697213, "grad_norm": 16.769842743412386, "kl": 0.056884765625, "learning_rate": 6.863500963728754e-07, "loss": -0.0214, "reward": 1.5633602142333984, "reward_std": 0.16764867305755615, "rewards/accuracy_reward_stage2": 0.5789852142333984, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1791 }, { "completion_length": 5.625, "epoch": 0.3140003504468197, "grad_norm": 17.682650240172475, "kl": 0.0908203125, "learning_rate": 6.861748729630278e-07, "loss": 0.001, "reward": 1.6108324527740479, "reward_std": 0.20219948887825012, "rewards/accuracy_reward_stage2": 0.6264575719833374, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1792 }, { "completion_length": 12.359375, "epoch": 0.3141755738566673, "grad_norm": 18.742977868192597, "kl": 0.0986328125, "learning_rate": 6.859996495531802e-07, "loss": -0.0046, "reward": 1.536928415298462, "reward_std": 0.2393271028995514, "rewards/accuracy_reward_stage2": 0.5525534152984619, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1793 }, { "completion_length": 17.125, "epoch": 0.3143507972665148, "grad_norm": 2587.4131228674446, "kl": 9.4375, "learning_rate": 6.858244261433327e-07, "loss": 3.7916, "reward": 1.308718204498291, "reward_std": 0.2237774282693863, "rewards/accuracy_reward_stage2": 0.433718204498291, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1794 }, { "completion_length": 9.21875, "epoch": 0.3145260206763624, "grad_norm": 20.517464844163342, "kl": 0.1181640625, "learning_rate": 6.856492027334851e-07, "loss": 0.0474, "reward": 1.5761194229125977, "reward_std": 0.17638246715068817, "rewards/accuracy_reward_stage2": 0.5761193633079529, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1795 }, { "completion_length": 12.65625, "epoch": 0.3147012440862099, "grad_norm": 15.253128507606846, "kl": 0.052734375, "learning_rate": 6.854739793236376e-07, "loss": -0.0121, "reward": 1.2813804149627686, "reward_std": 0.16135841608047485, "rewards/accuracy_reward_stage2": 0.4220053553581238, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1796 }, { "completion_length": 17.984375, "epoch": 0.31487646749605747, "grad_norm": 18.547588567577, "kl": 0.130859375, "learning_rate": 6.8529875591379e-07, "loss": -0.0184, "reward": 1.5960500240325928, "reward_std": 0.2250652313232422, "rewards/accuracy_reward_stage2": 0.6273000836372375, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1797 }, { "completion_length": 8.578125, "epoch": 0.315051690905905, "grad_norm": 16.926765708714097, "kl": 0.10595703125, "learning_rate": 6.851235325039425e-07, "loss": 0.0425, "reward": 1.3718523979187012, "reward_std": 0.14817330241203308, "rewards/accuracy_reward_stage2": 0.4968523681163788, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1798 }, { "completion_length": 8.15625, "epoch": 0.31522691431575256, "grad_norm": 18.962333571474712, "kl": 0.208984375, "learning_rate": 6.84948309094095e-07, "loss": 0.0397, "reward": 1.7007365226745605, "reward_std": 0.24715575575828552, "rewards/accuracy_reward_stage2": 0.7163615226745605, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1799 }, { "completion_length": 9.1875, "epoch": 0.31540213772560016, "grad_norm": 12.197948562184031, "kl": 0.0703125, "learning_rate": 6.847730856842474e-07, "loss": 0.0282, "reward": 1.671875, "reward_std": 0.14489679038524628, "rewards/accuracy_reward_stage2": 0.671875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1800 }, { "completion_length": 7.28125, "epoch": 0.3155773611354477, "grad_norm": 14.937120670245951, "kl": 0.0703125, "learning_rate": 6.845978622743999e-07, "loss": -0.0059, "reward": 1.7304699420928955, "reward_std": 0.19275791943073273, "rewards/accuracy_reward_stage2": 0.7460950016975403, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1801 }, { "completion_length": 10.078125, "epoch": 0.31575258454529526, "grad_norm": 18.702303439522616, "kl": 0.07421875, "learning_rate": 6.844226388645524e-07, "loss": 0.0296, "reward": 1.6846437454223633, "reward_std": 0.1779603809118271, "rewards/accuracy_reward_stage2": 0.6846436858177185, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1802 }, { "completion_length": 10.28125, "epoch": 0.3159278079551428, "grad_norm": 24.060191532518267, "kl": 0.028564453125, "learning_rate": 6.842474154547048e-07, "loss": -0.0053, "reward": 1.7212055921554565, "reward_std": 0.16443204879760742, "rewards/accuracy_reward_stage2": 0.7368306517601013, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1803 }, { "completion_length": 13.234375, "epoch": 0.31610303136499035, "grad_norm": 15.439669746278195, "kl": 0.0625, "learning_rate": 6.840721920448571e-07, "loss": 0.025, "reward": 1.3805962800979614, "reward_std": 0.16425207257270813, "rewards/accuracy_reward_stage2": 0.5055962800979614, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1804 }, { "completion_length": 15.484375, "epoch": 0.3162782547748379, "grad_norm": 22.17714169358847, "kl": 0.1318359375, "learning_rate": 6.838969686350095e-07, "loss": 0.0723, "reward": 1.2215710878372192, "reward_std": 0.161886066198349, "rewards/accuracy_reward_stage2": 0.34657102823257446, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1805 }, { "completion_length": 8.3125, "epoch": 0.3164534781846855, "grad_norm": 18.199861306414956, "kl": 0.05126953125, "learning_rate": 6.83721745225162e-07, "loss": -0.0144, "reward": 1.4537413120269775, "reward_std": 0.2938691973686218, "rewards/accuracy_reward_stage2": 0.46936625242233276, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1806 }, { "completion_length": 9.3125, "epoch": 0.31662870159453305, "grad_norm": 16.691593883218008, "kl": 0.22265625, "learning_rate": 6.835465218153145e-07, "loss": 0.0604, "reward": 1.3344494104385376, "reward_std": 0.16383177042007446, "rewards/accuracy_reward_stage2": 0.5844494104385376, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1807 }, { "completion_length": 8.921875, "epoch": 0.3168039250043806, "grad_norm": 21.852852343190598, "kl": 0.11181640625, "learning_rate": 6.833712984054669e-07, "loss": 0.0204, "reward": 1.7614967823028564, "reward_std": 0.23907579481601715, "rewards/accuracy_reward_stage2": 0.7771217823028564, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1808 }, { "completion_length": 11.09375, "epoch": 0.31697914841422814, "grad_norm": 21.97589200147923, "kl": 0.2470703125, "learning_rate": 6.831960749956194e-07, "loss": 0.0891, "reward": 1.1384527683258057, "reward_std": 0.20597995817661285, "rewards/accuracy_reward_stage2": 0.40407782793045044, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1809 }, { "completion_length": 16.96875, "epoch": 0.3171543718240757, "grad_norm": 27.96568751730448, "kl": 0.265625, "learning_rate": 6.830208515857718e-07, "loss": 0.1061, "reward": 1.6964399814605713, "reward_std": 0.17561517655849457, "rewards/accuracy_reward_stage2": 0.8214400410652161, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1810 }, { "completion_length": 10.5625, "epoch": 0.31732959523392323, "grad_norm": 21.330821947226656, "kl": 0.1865234375, "learning_rate": 6.828456281759243e-07, "loss": 0.0744, "reward": 1.559272050857544, "reward_std": 0.21070796251296997, "rewards/accuracy_reward_stage2": 0.8092721104621887, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1811 }, { "completion_length": 6.90625, "epoch": 0.3175048186437708, "grad_norm": 16.990363875822908, "kl": 0.05517578125, "learning_rate": 6.826704047660768e-07, "loss": 0.0221, "reward": 1.7029221057891846, "reward_std": 0.2239094078540802, "rewards/accuracy_reward_stage2": 0.7029222249984741, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1812 }, { "completion_length": 9.875, "epoch": 0.3176800420536184, "grad_norm": 22.556586253233327, "kl": 0.0419921875, "learning_rate": 6.824951813562291e-07, "loss": 0.0169, "reward": 1.5033842325210571, "reward_std": 0.17745548486709595, "rewards/accuracy_reward_stage2": 0.5033841729164124, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1813 }, { "completion_length": 13.59375, "epoch": 0.31785526546346593, "grad_norm": 16.64785646842737, "kl": 0.0673828125, "learning_rate": 6.823199579463816e-07, "loss": 0.0269, "reward": 1.3829011917114258, "reward_std": 0.1356073021888733, "rewards/accuracy_reward_stage2": 0.3829011917114258, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1814 }, { "completion_length": 9.0625, "epoch": 0.3180304888733135, "grad_norm": 20.360701507122354, "kl": 0.06982421875, "learning_rate": 6.821447345365341e-07, "loss": 0.028, "reward": 1.6769695281982422, "reward_std": 0.16446499526500702, "rewards/accuracy_reward_stage2": 0.6769695281982422, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1815 }, { "completion_length": 13.28125, "epoch": 0.318205712283161, "grad_norm": 26.064078900170728, "kl": 0.11572265625, "learning_rate": 6.819695111266865e-07, "loss": 0.0463, "reward": 1.6434980630874634, "reward_std": 0.2011084407567978, "rewards/accuracy_reward_stage2": 0.7684980630874634, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1816 }, { "completion_length": 13.765625, "epoch": 0.31838093569300857, "grad_norm": 80.66486305198394, "kl": 0.32421875, "learning_rate": 6.817942877168389e-07, "loss": 0.1009, "reward": 1.4909992218017578, "reward_std": 0.2135014533996582, "rewards/accuracy_reward_stage2": 0.6316242218017578, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1817 }, { "completion_length": 9.09375, "epoch": 0.3185561591028561, "grad_norm": 20.126958301707184, "kl": 0.08154296875, "learning_rate": 6.816190643069913e-07, "loss": 0.0326, "reward": 1.7232518196105957, "reward_std": 0.17581014335155487, "rewards/accuracy_reward_stage2": 0.7232518196105957, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1818 }, { "completion_length": 9.03125, "epoch": 0.3187313825127037, "grad_norm": 30.36696329002241, "kl": 0.1337890625, "learning_rate": 6.814438408971438e-07, "loss": 0.0093, "reward": 1.4545722007751465, "reward_std": 0.3125390410423279, "rewards/accuracy_reward_stage2": 0.4701971113681793, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1819 }, { "completion_length": 10.140625, "epoch": 0.31890660592255127, "grad_norm": 25.695729695133302, "kl": 0.150390625, "learning_rate": 6.812686174872963e-07, "loss": 0.0031, "reward": 1.7126063108444214, "reward_std": 0.21297934651374817, "rewards/accuracy_reward_stage2": 0.7438563108444214, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1820 }, { "completion_length": 5.9375, "epoch": 0.3190818293323988, "grad_norm": 10.358628847465996, "kl": 0.0250244140625, "learning_rate": 6.810933940774487e-07, "loss": 0.01, "reward": 1.6214256286621094, "reward_std": 0.0782785713672638, "rewards/accuracy_reward_stage2": 0.6214256286621094, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1821 }, { "completion_length": 5.5625, "epoch": 0.31925705274224636, "grad_norm": 13.398588020960577, "kl": 0.016357421875, "learning_rate": 6.809181706676012e-07, "loss": 0.0065, "reward": 1.6663763523101807, "reward_std": 0.0802459791302681, "rewards/accuracy_reward_stage2": 0.6663764119148254, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1822 }, { "completion_length": 9.453125, "epoch": 0.3194322761520939, "grad_norm": 10.403578788426847, "kl": 0.06982421875, "learning_rate": 6.807429472577537e-07, "loss": -0.006, "reward": 1.1771003007888794, "reward_std": 0.07623656839132309, "rewards/accuracy_reward_stage2": 0.192725270986557, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1823 }, { "completion_length": 11.140625, "epoch": 0.31960749956194145, "grad_norm": 17.66257215236959, "kl": 0.09326171875, "learning_rate": 6.80567723847906e-07, "loss": 0.0374, "reward": 1.367735743522644, "reward_std": 0.20256279408931732, "rewards/accuracy_reward_stage2": 0.49273571372032166, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1824 }, { "completion_length": 10.84375, "epoch": 0.31978272297178906, "grad_norm": 21.69167316332861, "kl": 0.130859375, "learning_rate": 6.803925004380585e-07, "loss": -0.121, "reward": 1.7133700847625732, "reward_std": 0.3308318555355072, "rewards/accuracy_reward_stage2": 0.7758700847625732, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 1825 }, { "completion_length": 17.40625, "epoch": 0.3199579463816366, "grad_norm": 187.1766272653558, "kl": 1.421875, "learning_rate": 6.802172770282109e-07, "loss": 0.5675, "reward": 1.5635792016983032, "reward_std": 0.08996030688285828, "rewards/accuracy_reward_stage2": 0.6885791420936584, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1826 }, { "completion_length": 9.796875, "epoch": 0.32013316979148415, "grad_norm": 21.355418912809615, "kl": 0.10205078125, "learning_rate": 6.800420536183634e-07, "loss": -0.0034, "reward": 1.9030907154083252, "reward_std": 0.2621467709541321, "rewards/accuracy_reward_stage2": 0.9187155961990356, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1827 }, { "completion_length": 11.875, "epoch": 0.3203083932013317, "grad_norm": 359.33581112946854, "kl": 0.8203125, "learning_rate": 6.798668302085159e-07, "loss": 0.3285, "reward": 1.2450122833251953, "reward_std": 0.19816677272319794, "rewards/accuracy_reward_stage2": 0.37001216411590576, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1828 }, { "completion_length": 18.609375, "epoch": 0.32048361661117925, "grad_norm": 48.93425362715092, "kl": 0.4140625, "learning_rate": 6.796916067986683e-07, "loss": 0.1657, "reward": 1.510254144668579, "reward_std": 0.24771133065223694, "rewards/accuracy_reward_stage2": 0.6352540850639343, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1829 }, { "completion_length": 12.90625, "epoch": 0.3206588400210268, "grad_norm": 20.2390037277772, "kl": 0.10498046875, "learning_rate": 6.795163833888207e-07, "loss": 0.013, "reward": 1.727494478225708, "reward_std": 0.18558049201965332, "rewards/accuracy_reward_stage2": 0.7431195378303528, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1830 }, { "completion_length": 9.75, "epoch": 0.32083406343087434, "grad_norm": 25.06970663743917, "kl": 0.2421875, "learning_rate": 6.793411599789732e-07, "loss": 0.0968, "reward": 1.4367856979370117, "reward_std": 0.2500126361846924, "rewards/accuracy_reward_stage2": 0.5617856979370117, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1831 }, { "completion_length": 11.5625, "epoch": 0.32100928684072194, "grad_norm": 19.475838526195844, "kl": 0.01373291015625, "learning_rate": 6.791659365691256e-07, "loss": 0.0055, "reward": 1.711681604385376, "reward_std": 0.1387220323085785, "rewards/accuracy_reward_stage2": 0.711681604385376, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1832 }, { "completion_length": 9.3125, "epoch": 0.3211845102505695, "grad_norm": 15.38598416242657, "kl": 0.08349609375, "learning_rate": 6.78990713159278e-07, "loss": 0.0001, "reward": 1.4822373390197754, "reward_std": 0.24437138438224792, "rewards/accuracy_reward_stage2": 0.49786239862442017, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1833 }, { "completion_length": 16.46875, "epoch": 0.32135973366041704, "grad_norm": 26.15721137803418, "kl": 0.265625, "learning_rate": 6.788154897494304e-07, "loss": 0.0644, "reward": 1.4538758993148804, "reward_std": 0.28248101472854614, "rewards/accuracy_reward_stage2": 0.5945007801055908, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1834 }, { "completion_length": 16.109375, "epoch": 0.3215349570702646, "grad_norm": 22.7187542979085, "kl": 0.375, "learning_rate": 6.786402663395829e-07, "loss": 0.1497, "reward": 1.4750638008117676, "reward_std": 0.2322225123643875, "rewards/accuracy_reward_stage2": 0.725063681602478, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1835 }, { "completion_length": 20.046875, "epoch": 0.32171018048011213, "grad_norm": 19.070056528167537, "kl": 0.484375, "learning_rate": 6.784650429297354e-07, "loss": 0.2005, "reward": 1.3314766883850098, "reward_std": 0.22183364629745483, "rewards/accuracy_reward_stage2": 0.5814766883850098, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1836 }, { "completion_length": 8.125, "epoch": 0.3218854038899597, "grad_norm": 14.357050005656363, "kl": 0.07080078125, "learning_rate": 6.782898195198878e-07, "loss": 0.0283, "reward": 1.2363032102584839, "reward_std": 0.13740503787994385, "rewards/accuracy_reward_stage2": 0.3613032400608063, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1837 }, { "completion_length": 16.015625, "epoch": 0.3220606272998073, "grad_norm": 18.629287627422684, "kl": 0.06396484375, "learning_rate": 6.781145961100403e-07, "loss": 0.0255, "reward": 1.4411256313323975, "reward_std": 0.14058303833007812, "rewards/accuracy_reward_stage2": 0.5661256313323975, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1838 }, { "completion_length": 10.0, "epoch": 0.3222358507096548, "grad_norm": 19.43622528331183, "kl": 0.06396484375, "learning_rate": 6.779393727001928e-07, "loss": -0.0035, "reward": 1.5037338733673096, "reward_std": 0.2288183569908142, "rewards/accuracy_reward_stage2": 0.5193589925765991, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1839 }, { "completion_length": 13.40625, "epoch": 0.32241107411950237, "grad_norm": 21.843282347266314, "kl": 0.1728515625, "learning_rate": 6.777641492903452e-07, "loss": 0.0693, "reward": 1.3334496021270752, "reward_std": 0.11595918238162994, "rewards/accuracy_reward_stage2": 0.45844966173171997, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1840 }, { "completion_length": 10.6875, "epoch": 0.3225862975293499, "grad_norm": 16.798007544070753, "kl": 0.05859375, "learning_rate": 6.775889258804977e-07, "loss": -0.0713, "reward": 1.5361828804016113, "reward_std": 0.19550885260105133, "rewards/accuracy_reward_stage2": 0.5830577611923218, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1841 }, { "completion_length": 10.125, "epoch": 0.32276152093919747, "grad_norm": 14.468662236027644, "kl": 0.0189208984375, "learning_rate": 6.7741370247065e-07, "loss": 0.0076, "reward": 1.5052210092544556, "reward_std": 0.10745733976364136, "rewards/accuracy_reward_stage2": 0.5052210092544556, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1842 }, { "completion_length": 31.875, "epoch": 0.322936744349045, "grad_norm": 19.206921000127394, "kl": 0.053466796875, "learning_rate": 6.772384790608024e-07, "loss": 0.0213, "reward": 1.525323510169983, "reward_std": 0.15850859880447388, "rewards/accuracy_reward_stage2": 0.5253235101699829, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1843 }, { "completion_length": 18.328125, "epoch": 0.3231119677588926, "grad_norm": 15.084336274366542, "kl": 0.234375, "learning_rate": 6.770632556509549e-07, "loss": 0.0647, "reward": 1.266427993774414, "reward_std": 0.14675506949424744, "rewards/accuracy_reward_stage2": 0.4070529043674469, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1844 }, { "completion_length": 8.65625, "epoch": 0.32328719116874016, "grad_norm": 14.951960898593283, "kl": 0.0400390625, "learning_rate": 6.768880322411073e-07, "loss": 0.0161, "reward": 1.4418463706970215, "reward_std": 0.08619339764118195, "rewards/accuracy_reward_stage2": 0.6918463110923767, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1845 }, { "completion_length": 10.921875, "epoch": 0.3234624145785877, "grad_norm": 34.66320880041889, "kl": 0.11328125, "learning_rate": 6.767128088312598e-07, "loss": 0.0453, "reward": 1.4650702476501465, "reward_std": 0.17414847016334534, "rewards/accuracy_reward_stage2": 0.5900702476501465, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1846 }, { "completion_length": 8.46875, "epoch": 0.32363763798843526, "grad_norm": 20.237955441663978, "kl": 0.2080078125, "learning_rate": 6.765375854214123e-07, "loss": -0.0051, "reward": 1.6920424699783325, "reward_std": 0.34711211919784546, "rewards/accuracy_reward_stage2": 0.8482924699783325, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1847 }, { "completion_length": 11.859375, "epoch": 0.3238128613982828, "grad_norm": 24.731047006662198, "kl": 0.1640625, "learning_rate": 6.763623620115647e-07, "loss": -0.0182, "reward": 1.280834674835205, "reward_std": 0.28956809639930725, "rewards/accuracy_reward_stage2": 0.4370846748352051, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1848 }, { "completion_length": 10.8125, "epoch": 0.32398808480813035, "grad_norm": 26.723550080573006, "kl": 0.279296875, "learning_rate": 6.761871386017172e-07, "loss": 0.1113, "reward": 1.5930095911026, "reward_std": 0.22630318999290466, "rewards/accuracy_reward_stage2": 0.7180095911026001, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1849 }, { "completion_length": 13.390625, "epoch": 0.3241633082179779, "grad_norm": 22.345652650672584, "kl": 0.115234375, "learning_rate": 6.760119151918696e-07, "loss": 0.046, "reward": 1.3461437225341797, "reward_std": 0.28940150141716003, "rewards/accuracy_reward_stage2": 0.4711437523365021, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1850 }, { "completion_length": 11.828125, "epoch": 0.3243385316278255, "grad_norm": 24.827188975257773, "kl": 0.01324462890625, "learning_rate": 6.758366917820221e-07, "loss": 0.0053, "reward": 1.484375, "reward_std": 0.25217998027801514, "rewards/accuracy_reward_stage2": 0.484375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1851 }, { "completion_length": 11.953125, "epoch": 0.32451375503767305, "grad_norm": 17.497428306046615, "kl": 0.13671875, "learning_rate": 6.756614683721746e-07, "loss": -0.0133, "reward": 1.5360865592956543, "reward_std": 0.23445501923561096, "rewards/accuracy_reward_stage2": 0.5673364996910095, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1852 }, { "completion_length": 10.5625, "epoch": 0.3246889784475206, "grad_norm": 22.98841053109054, "kl": 0.052734375, "learning_rate": 6.754862449623269e-07, "loss": 0.0211, "reward": 1.4700126647949219, "reward_std": 0.3278173804283142, "rewards/accuracy_reward_stage2": 0.4700126349925995, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1853 }, { "completion_length": 10.109375, "epoch": 0.32486420185736814, "grad_norm": 23.18911960312045, "kl": 0.14453125, "learning_rate": 6.753110215524794e-07, "loss": 0.0169, "reward": 1.4488449096679688, "reward_std": 0.17043456435203552, "rewards/accuracy_reward_stage2": 0.4644698202610016, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1854 }, { "completion_length": 7.875, "epoch": 0.3250394252672157, "grad_norm": 19.61804996855809, "kl": 0.10791015625, "learning_rate": 6.751357981426318e-07, "loss": 0.0431, "reward": 1.716990351676941, "reward_std": 0.22781193256378174, "rewards/accuracy_reward_stage2": 0.7169903516769409, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1855 }, { "completion_length": 9.65625, "epoch": 0.32521464867706323, "grad_norm": 14.670859725635287, "kl": 0.283203125, "learning_rate": 6.749605747327842e-07, "loss": 0.025, "reward": 1.4233312606811523, "reward_std": 0.11870677769184113, "rewards/accuracy_reward_stage2": 0.5795813202857971, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1856 }, { "completion_length": 4.6875, "epoch": 0.32538987208691084, "grad_norm": 14.784253008631724, "kl": 0.0859375, "learning_rate": 6.747853513229367e-07, "loss": -0.0098, "reward": 1.6528429985046387, "reward_std": 0.19694890081882477, "rewards/accuracy_reward_stage2": 0.6684680581092834, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1857 }, { "completion_length": 11.484375, "epoch": 0.3255650954967584, "grad_norm": 17.366655675185815, "kl": 0.09716796875, "learning_rate": 6.746101279130891e-07, "loss": 0.0047, "reward": 1.3349708318710327, "reward_std": 0.18092112243175507, "rewards/accuracy_reward_stage2": 0.3505958318710327, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1858 }, { "completion_length": 10.015625, "epoch": 0.32574031890660593, "grad_norm": 20.455291845170937, "kl": 0.1025390625, "learning_rate": 6.744349045032416e-07, "loss": -0.0135, "reward": 1.5648746490478516, "reward_std": 0.3064984083175659, "rewards/accuracy_reward_stage2": 0.7054996490478516, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1859 }, { "completion_length": 9.1875, "epoch": 0.3259155423164535, "grad_norm": 21.613500327855913, "kl": 0.10009765625, "learning_rate": 6.742596810933941e-07, "loss": -0.0377, "reward": 1.8078569173812866, "reward_std": 0.2591524124145508, "rewards/accuracy_reward_stage2": 0.8391069769859314, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1860 }, { "completion_length": 9.578125, "epoch": 0.326090765726301, "grad_norm": 26.2316907230029, "kl": 0.17578125, "learning_rate": 6.740844576835465e-07, "loss": 0.0702, "reward": 1.3697917461395264, "reward_std": 0.3007485270500183, "rewards/accuracy_reward_stage2": 0.6197916269302368, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1861 }, { "completion_length": 14.3125, "epoch": 0.32626598913614857, "grad_norm": 245.91376195572363, "kl": 1.3046875, "learning_rate": 6.73909234273699e-07, "loss": 0.4788, "reward": 1.1469907760620117, "reward_std": 0.25098198652267456, "rewards/accuracy_reward_stage2": 0.5376157164573669, "rewards/format_reward_stage1_pointerpad": 0.609375, "scores/accuracy_reward_stage2": 0.609375, "step": 1862 }, { "completion_length": 6.859375, "epoch": 0.3264412125459961, "grad_norm": 46.635079623868656, "kl": 0.3984375, "learning_rate": 6.737340108638514e-07, "loss": 0.0704, "reward": 1.6059027910232544, "reward_std": 0.29173195362091064, "rewards/accuracy_reward_stage2": 0.6371527910232544, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1863 }, { "completion_length": 7.90625, "epoch": 0.3266164359558437, "grad_norm": 13.745160872355433, "kl": 0.0751953125, "learning_rate": 6.735587874540038e-07, "loss": 0.03, "reward": 1.6368508338928223, "reward_std": 0.0824931263923645, "rewards/accuracy_reward_stage2": 0.6368508338928223, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1864 }, { "completion_length": 18.859375, "epoch": 0.32679165936569127, "grad_norm": 43.0947117605456, "kl": 0.326171875, "learning_rate": 6.733835640441563e-07, "loss": 0.0918, "reward": 1.6280841827392578, "reward_std": 0.30339083075523376, "rewards/accuracy_reward_stage2": 0.768709123134613, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1865 }, { "completion_length": 11.203125, "epoch": 0.3269668827755388, "grad_norm": 40.6742173575688, "kl": 0.349609375, "learning_rate": 6.732083406343087e-07, "loss": 0.1224, "reward": 1.433029294013977, "reward_std": 0.21674448251724243, "rewards/accuracy_reward_stage2": 0.683029294013977, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1866 }, { "completion_length": 13.203125, "epoch": 0.32714210618538636, "grad_norm": 16.46817567207021, "kl": 0.046142578125, "learning_rate": 6.730331172244612e-07, "loss": -0.0105, "reward": 1.75, "reward_std": 0.16675157845020294, "rewards/accuracy_reward_stage2": 0.765625, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1867 }, { "completion_length": 16.546875, "epoch": 0.3273173295952339, "grad_norm": 22.167670621631846, "kl": 0.287109375, "learning_rate": 6.728578938146136e-07, "loss": 0.1147, "reward": 1.1454861164093018, "reward_std": 0.17306698858737946, "rewards/accuracy_reward_stage2": 0.39548611640930176, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1868 }, { "completion_length": 10.6875, "epoch": 0.32749255300508145, "grad_norm": 19.296797288283493, "kl": 0.09033203125, "learning_rate": 6.72682670404766e-07, "loss": 0.0362, "reward": 1.3370780944824219, "reward_std": 0.15424805879592896, "rewards/accuracy_reward_stage2": 0.33707812428474426, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1869 }, { "completion_length": 9.53125, "epoch": 0.32766777641492906, "grad_norm": 13.949661990547954, "kl": 0.0218505859375, "learning_rate": 6.725074469949185e-07, "loss": 0.0088, "reward": 1.613518476486206, "reward_std": 0.09896919131278992, "rewards/accuracy_reward_stage2": 0.613518476486206, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1870 }, { "completion_length": 10.03125, "epoch": 0.3278429998247766, "grad_norm": 19.47370807396806, "kl": 0.1328125, "learning_rate": 6.72332223585071e-07, "loss": 0.009, "reward": 1.728609561920166, "reward_std": 0.16377633810043335, "rewards/accuracy_reward_stage2": 0.744234561920166, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1871 }, { "completion_length": 11.234375, "epoch": 0.32801822323462415, "grad_norm": 21.55942535287268, "kl": 0.1005859375, "learning_rate": 6.721570001752234e-07, "loss": 0.0403, "reward": 1.6306660175323486, "reward_std": 0.21363964676856995, "rewards/accuracy_reward_stage2": 0.6306659579277039, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1872 }, { "completion_length": 18.453125, "epoch": 0.3281934466444717, "grad_norm": 16.540119657055413, "kl": 0.166015625, "learning_rate": 6.719817767653758e-07, "loss": 0.0664, "reward": 1.2170138359069824, "reward_std": 0.1963312327861786, "rewards/accuracy_reward_stage2": 0.3420138955116272, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1873 }, { "completion_length": 9.828125, "epoch": 0.32836867005431924, "grad_norm": 18.35374074903453, "kl": 0.09326171875, "learning_rate": 6.718065533555282e-07, "loss": 0.0374, "reward": 1.5539817810058594, "reward_std": 0.16566669940948486, "rewards/accuracy_reward_stage2": 0.5539816617965698, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1874 }, { "completion_length": 6.484375, "epoch": 0.3285438934641668, "grad_norm": 14.844442858283866, "kl": 0.025634765625, "learning_rate": 6.716313299456807e-07, "loss": 0.0102, "reward": 1.744091510772705, "reward_std": 0.09293541312217712, "rewards/accuracy_reward_stage2": 0.7440915107727051, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1875 }, { "completion_length": 9.125, "epoch": 0.3287191168740144, "grad_norm": 25.254916645541858, "kl": 0.1181640625, "learning_rate": 6.714561065358332e-07, "loss": 0.0377, "reward": 1.3898807764053345, "reward_std": 0.2201821655035019, "rewards/accuracy_reward_stage2": 0.5305057764053345, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1876 }, { "completion_length": 9.609375, "epoch": 0.32889434028386194, "grad_norm": 25.69400947159281, "kl": 0.126953125, "learning_rate": 6.712808831259856e-07, "loss": -0.0167, "reward": 1.633712649345398, "reward_std": 0.3901214599609375, "rewards/accuracy_reward_stage2": 0.6649625301361084, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1877 }, { "completion_length": 11.625, "epoch": 0.3290695636937095, "grad_norm": 16.977959285591556, "kl": 0.2265625, "learning_rate": 6.711056597161381e-07, "loss": -0.0361, "reward": 1.560457706451416, "reward_std": 0.28971582651138306, "rewards/accuracy_reward_stage2": 0.732332706451416, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1878 }, { "completion_length": 12.390625, "epoch": 0.32924478710355704, "grad_norm": 18.18831099517817, "kl": 0.0615234375, "learning_rate": 6.709304363062906e-07, "loss": 0.0246, "reward": 1.5500532388687134, "reward_std": 0.13439956307411194, "rewards/accuracy_reward_stage2": 0.5500532388687134, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1879 }, { "completion_length": 14.375, "epoch": 0.3294200105134046, "grad_norm": 18.965881019041337, "kl": 0.44921875, "learning_rate": 6.707552128964429e-07, "loss": 0.1797, "reward": 1.4239583015441895, "reward_std": 0.23425593972206116, "rewards/accuracy_reward_stage2": 0.6739581823348999, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1880 }, { "completion_length": 14.625, "epoch": 0.32959523392325213, "grad_norm": 19.55683458826524, "kl": 0.103515625, "learning_rate": 6.705799894865954e-07, "loss": 0.0413, "reward": 1.6188348531723022, "reward_std": 0.22359533607959747, "rewards/accuracy_reward_stage2": 0.6188348531723022, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1881 }, { "completion_length": 12.59375, "epoch": 0.3297704573330997, "grad_norm": 18.088377876534306, "kl": 0.040283203125, "learning_rate": 6.704047660767477e-07, "loss": 0.0161, "reward": 1.8368244171142578, "reward_std": 0.14354784786701202, "rewards/accuracy_reward_stage2": 0.8368244171142578, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1882 }, { "completion_length": 33.171875, "epoch": 0.3299456807429473, "grad_norm": 16.8912919381152, "kl": 0.12060546875, "learning_rate": 6.702295426669002e-07, "loss": 0.0044, "reward": 1.1973037719726562, "reward_std": 0.18049047887325287, "rewards/accuracy_reward_stage2": 0.33792880177497864, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1883 }, { "completion_length": 10.1875, "epoch": 0.3301209041527948, "grad_norm": 18.89381295924757, "kl": 0.038818359375, "learning_rate": 6.700543192570527e-07, "loss": 0.0155, "reward": 1.4166667461395264, "reward_std": 0.21836219727993011, "rewards/accuracy_reward_stage2": 0.6666666269302368, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1884 }, { "completion_length": 7.984375, "epoch": 0.33029612756264237, "grad_norm": 20.448251291802112, "kl": 0.049560546875, "learning_rate": 6.698790958472051e-07, "loss": 0.0198, "reward": 1.7605003118515015, "reward_std": 0.3169490694999695, "rewards/accuracy_reward_stage2": 0.7605003118515015, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1885 }, { "completion_length": 10.890625, "epoch": 0.3304713509724899, "grad_norm": 22.309793657882366, "kl": 0.201171875, "learning_rate": 6.697038724373576e-07, "loss": 0.0363, "reward": 1.2537977695465088, "reward_std": 0.2588205635547638, "rewards/accuracy_reward_stage2": 0.3944226801395416, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1886 }, { "completion_length": 12.78125, "epoch": 0.33064657438233747, "grad_norm": 18.848369498488996, "kl": 0.10791015625, "learning_rate": 6.6952864902751e-07, "loss": -0.0452, "reward": 1.6480014324188232, "reward_std": 0.2525930404663086, "rewards/accuracy_reward_stage2": 0.6792514324188232, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1887 }, { "completion_length": 12.8125, "epoch": 0.330821797792185, "grad_norm": 18.613152918954547, "kl": 0.028564453125, "learning_rate": 6.693534256176625e-07, "loss": 0.0114, "reward": 1.9396920204162598, "reward_std": 0.07260610163211823, "rewards/accuracy_reward_stage2": 0.9396920204162598, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1888 }, { "completion_length": 7.15625, "epoch": 0.3309970212020326, "grad_norm": 14.881380789241637, "kl": 0.08447265625, "learning_rate": 6.69178202207815e-07, "loss": -0.048, "reward": 1.6165692806243896, "reward_std": 0.22685596346855164, "rewards/accuracy_reward_stage2": 0.6478191614151001, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1889 }, { "completion_length": 21.734375, "epoch": 0.33117224461188016, "grad_norm": 19.699195856672503, "kl": 0.12255859375, "learning_rate": 6.690029787979674e-07, "loss": 0.0249, "reward": 1.533193588256836, "reward_std": 0.2367192953824997, "rewards/accuracy_reward_stage2": 0.5488186478614807, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1890 }, { "completion_length": 11.078125, "epoch": 0.3313474680217277, "grad_norm": 21.620235405578864, "kl": 0.099609375, "learning_rate": 6.688277553881199e-07, "loss": -0.0043, "reward": 1.7323949337005615, "reward_std": 0.19700127840042114, "rewards/accuracy_reward_stage2": 0.7480199337005615, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1891 }, { "completion_length": 15.875, "epoch": 0.33152269143157526, "grad_norm": 69.12943130337275, "kl": 0.79296875, "learning_rate": 6.686525319782724e-07, "loss": 0.2731, "reward": 1.488948106765747, "reward_std": 0.253986656665802, "rewards/accuracy_reward_stage2": 0.6295732259750366, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1892 }, { "completion_length": 18.109375, "epoch": 0.3316979148414228, "grad_norm": 17.98704485559557, "kl": 0.01385498046875, "learning_rate": 6.684773085684246e-07, "loss": 0.0055, "reward": 1.6691596508026123, "reward_std": 0.14532330632209778, "rewards/accuracy_reward_stage2": 0.6691597104072571, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1893 }, { "completion_length": 12.921875, "epoch": 0.33187313825127035, "grad_norm": 53.84006846321401, "kl": 0.310546875, "learning_rate": 6.683020851585771e-07, "loss": -0.0064, "reward": 1.3225611448287964, "reward_std": 0.3454180955886841, "rewards/accuracy_reward_stage2": 0.3850611746311188, "rewards/format_reward_stage1_pointerpad": 0.9375, "scores/accuracy_reward_stage2": 0.9375, "step": 1894 }, { "completion_length": 13.15625, "epoch": 0.33204836166111795, "grad_norm": 22.44899572206889, "kl": 0.51171875, "learning_rate": 6.681268617487295e-07, "loss": 0.16, "reward": 1.7277836799621582, "reward_std": 0.1523258537054062, "rewards/accuracy_reward_stage2": 0.8684086799621582, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1895 }, { "completion_length": 13.125, "epoch": 0.3322235850709655, "grad_norm": 17.53173188046345, "kl": 0.1181640625, "learning_rate": 6.67951638338882e-07, "loss": 0.0257, "reward": 1.5292876958847046, "reward_std": 0.13309209048748016, "rewards/accuracy_reward_stage2": 0.5449126958847046, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1896 }, { "completion_length": 8.453125, "epoch": 0.33239880848081305, "grad_norm": 20.38947336089091, "kl": 0.1484375, "learning_rate": 6.677764149290345e-07, "loss": 0.0592, "reward": 1.761195421218872, "reward_std": 0.23012542724609375, "rewards/accuracy_reward_stage2": 0.7611954212188721, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1897 }, { "completion_length": 6.8125, "epoch": 0.3325740318906606, "grad_norm": 20.896132445671658, "kl": 0.1962890625, "learning_rate": 6.676011915191869e-07, "loss": 0.0785, "reward": 1.5751183032989502, "reward_std": 0.16306456923484802, "rewards/accuracy_reward_stage2": 0.700118362903595, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1898 }, { "completion_length": 16.625, "epoch": 0.33274925530050814, "grad_norm": 19.194750271365315, "kl": 0.095703125, "learning_rate": 6.674259681093394e-07, "loss": 0.0095, "reward": 1.3477399349212646, "reward_std": 0.16524702310562134, "rewards/accuracy_reward_stage2": 0.4883649945259094, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1899 }, { "completion_length": 11.015625, "epoch": 0.3329244787103557, "grad_norm": 22.31802873960122, "kl": 0.1669921875, "learning_rate": 6.672507446994919e-07, "loss": 0.044, "reward": 1.4311261177062988, "reward_std": 0.2616199254989624, "rewards/accuracy_reward_stage2": 0.4623761773109436, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1900 }, { "completion_length": 10.5625, "epoch": 0.33309970212020323, "grad_norm": 18.039528111074194, "kl": 0.07666015625, "learning_rate": 6.670755212896443e-07, "loss": 0.0306, "reward": 1.7664124965667725, "reward_std": 0.18786652386188507, "rewards/accuracy_reward_stage2": 0.7664124369621277, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1901 }, { "completion_length": 17.609375, "epoch": 0.33327492553005084, "grad_norm": 19.913840997681163, "kl": 0.1298828125, "learning_rate": 6.669002978797968e-07, "loss": -0.0146, "reward": 1.6314010620117188, "reward_std": 0.21882987022399902, "rewards/accuracy_reward_stage2": 0.6626511216163635, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1902 }, { "completion_length": 14.453125, "epoch": 0.3334501489398984, "grad_norm": 23.040483980646762, "kl": 0.08056640625, "learning_rate": 6.667250744699491e-07, "loss": 0.0004, "reward": 1.421668529510498, "reward_std": 0.29481422901153564, "rewards/accuracy_reward_stage2": 0.43729349970817566, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1903 }, { "completion_length": 8.484375, "epoch": 0.33362537234974593, "grad_norm": 15.772263098762213, "kl": 0.107421875, "learning_rate": 6.665498510601016e-07, "loss": -0.0453, "reward": 1.7050449848175049, "reward_std": 0.2197214812040329, "rewards/accuracy_reward_stage2": 0.7362948656082153, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1904 }, { "completion_length": 11.421875, "epoch": 0.3338005957595935, "grad_norm": 16.85209853349615, "kl": 0.006591796875, "learning_rate": 6.663746276502541e-07, "loss": 0.0026, "reward": 1.4322917461395264, "reward_std": 0.16098348796367645, "rewards/accuracy_reward_stage2": 0.5572916269302368, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1905 }, { "completion_length": 22.5625, "epoch": 0.333975819169441, "grad_norm": 19.93099087602353, "kl": 0.2890625, "learning_rate": 6.661994042404064e-07, "loss": 0.1219, "reward": 1.402681827545166, "reward_std": 0.13913270831108093, "rewards/accuracy_reward_stage2": 0.6526818871498108, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1906 }, { "completion_length": 10.109375, "epoch": 0.33415104257928857, "grad_norm": 21.492901823484114, "kl": 0.054443359375, "learning_rate": 6.660241808305589e-07, "loss": 0.0218, "reward": 1.5858439207077026, "reward_std": 0.23922909796237946, "rewards/accuracy_reward_stage2": 0.5858439207077026, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1907 }, { "completion_length": 8.34375, "epoch": 0.3343262659891362, "grad_norm": 18.817009949809382, "kl": 0.076171875, "learning_rate": 6.658489574207114e-07, "loss": 0.0304, "reward": 1.353365421295166, "reward_std": 0.15156733989715576, "rewards/accuracy_reward_stage2": 0.35336539149284363, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1908 }, { "completion_length": 10.25, "epoch": 0.3345014893989837, "grad_norm": 20.460697523371316, "kl": 0.52734375, "learning_rate": 6.656737340108638e-07, "loss": 0.1231, "reward": 1.758762240409851, "reward_std": 0.20783132314682007, "rewards/accuracy_reward_stage2": 0.8993872404098511, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1909 }, { "completion_length": 14.6875, "epoch": 0.33467671280883127, "grad_norm": 23.673039520549196, "kl": 0.04638671875, "learning_rate": 6.654985106010163e-07, "loss": -0.0256, "reward": 1.6718825101852417, "reward_std": 0.23551476001739502, "rewards/accuracy_reward_stage2": 0.6875075697898865, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1910 }, { "completion_length": 15.40625, "epoch": 0.3348519362186788, "grad_norm": 23.44966109378244, "kl": 0.3984375, "learning_rate": 6.653232871911687e-07, "loss": 0.1594, "reward": 1.3732054233551025, "reward_std": 0.16537074744701385, "rewards/accuracy_reward_stage2": 0.6232053637504578, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1911 }, { "completion_length": 11.5, "epoch": 0.33502715962852636, "grad_norm": 25.39244699869401, "kl": 0.21875, "learning_rate": 6.651480637813211e-07, "loss": 0.0779, "reward": 1.6530308723449707, "reward_std": 0.18545380234718323, "rewards/accuracy_reward_stage2": 0.7936557531356812, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1912 }, { "completion_length": 32.09375, "epoch": 0.3352023830383739, "grad_norm": 20.433934641372062, "kl": 0.19921875, "learning_rate": 6.649728403714736e-07, "loss": 0.0457, "reward": 1.4659879207611084, "reward_std": 0.24502934515476227, "rewards/accuracy_reward_stage2": 0.6066129207611084, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1913 }, { "completion_length": 12.484375, "epoch": 0.33537760644822145, "grad_norm": 17.940106585468374, "kl": 0.08935546875, "learning_rate": 6.64797616961626e-07, "loss": 0.0024, "reward": 1.429773211479187, "reward_std": 0.20611415803432465, "rewards/accuracy_reward_stage2": 0.4453982710838318, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1914 }, { "completion_length": 11.734375, "epoch": 0.33555282985806906, "grad_norm": 21.91538716583864, "kl": 0.11181640625, "learning_rate": 6.646223935517785e-07, "loss": 0.0554, "reward": 1.651969075202942, "reward_std": 0.1828499734401703, "rewards/accuracy_reward_stage2": 0.7769691348075867, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1915 }, { "completion_length": 10.984375, "epoch": 0.3357280532679166, "grad_norm": 18.154773046052462, "kl": 0.12109375, "learning_rate": 6.64447170141931e-07, "loss": 0.0041, "reward": 1.3040469884872437, "reward_std": 0.19486962258815765, "rewards/accuracy_reward_stage2": 0.44467195868492126, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1916 }, { "completion_length": 20.984375, "epoch": 0.33590327667776415, "grad_norm": 23.685376062521655, "kl": 0.12353515625, "learning_rate": 6.642719467320834e-07, "loss": 0.0495, "reward": 1.4644455909729004, "reward_std": 0.17394208908081055, "rewards/accuracy_reward_stage2": 0.5894454717636108, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1917 }, { "completion_length": 14.09375, "epoch": 0.3360785000876117, "grad_norm": 20.132935090916234, "kl": 0.046875, "learning_rate": 6.640967233222359e-07, "loss": 0.0187, "reward": 1.7096850872039795, "reward_std": 0.11627823114395142, "rewards/accuracy_reward_stage2": 0.7096851468086243, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1918 }, { "completion_length": 11.171875, "epoch": 0.33625372349745924, "grad_norm": 21.71008176430557, "kl": 0.11962890625, "learning_rate": 6.639214999123882e-07, "loss": 0.0478, "reward": 1.7054026126861572, "reward_std": 0.14449666440486908, "rewards/accuracy_reward_stage2": 0.705402672290802, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1919 }, { "completion_length": 12.3125, "epoch": 0.3364289469073068, "grad_norm": 23.566831941745605, "kl": 0.1005859375, "learning_rate": 6.637462765025407e-07, "loss": 0.0089, "reward": 1.4168455600738525, "reward_std": 0.25963038206100464, "rewards/accuracy_reward_stage2": 0.432470440864563, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1920 }, { "completion_length": 30.28125, "epoch": 0.3366041703171544, "grad_norm": 21.944029208656, "kl": 0.455078125, "learning_rate": 6.635710530926932e-07, "loss": 0.1379, "reward": 1.127016305923462, "reward_std": 0.2312091737985611, "rewards/accuracy_reward_stage2": 0.3926413357257843, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1921 }, { "completion_length": 5.828125, "epoch": 0.33677939372700194, "grad_norm": 17.257118272901188, "kl": 0.09375, "learning_rate": 6.633958296828455e-07, "loss": 0.0374, "reward": 1.875, "reward_std": 0.2130674123764038, "rewards/accuracy_reward_stage2": 0.875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1922 }, { "completion_length": 8.171875, "epoch": 0.3369546171368495, "grad_norm": 23.521572641848383, "kl": 0.2421875, "learning_rate": 6.63220606272998e-07, "loss": 0.0362, "reward": 1.216859221458435, "reward_std": 0.3034880757331848, "rewards/accuracy_reward_stage2": 0.37310922145843506, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1923 }, { "completion_length": 10.203125, "epoch": 0.33712984054669703, "grad_norm": 20.521336853814198, "kl": 0.09912109375, "learning_rate": 6.630453828631505e-07, "loss": 0.0398, "reward": 1.4524263143539429, "reward_std": 0.15699222683906555, "rewards/accuracy_reward_stage2": 0.45242631435394287, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1924 }, { "completion_length": 6.6875, "epoch": 0.3373050639565446, "grad_norm": 19.291231715911866, "kl": 0.09619140625, "learning_rate": 6.628701594533029e-07, "loss": 0.0383, "reward": 1.8933196067810059, "reward_std": 0.15603157877922058, "rewards/accuracy_reward_stage2": 0.8933195471763611, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1925 }, { "completion_length": 10.890625, "epoch": 0.33748028736639213, "grad_norm": 19.784772606319674, "kl": 0.09033203125, "learning_rate": 6.626949360434554e-07, "loss": 0.0362, "reward": 1.6539945602416992, "reward_std": 0.1568455845117569, "rewards/accuracy_reward_stage2": 0.6539945602416992, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1926 }, { "completion_length": 9.9375, "epoch": 0.33765551077623973, "grad_norm": 24.480945312649215, "kl": 0.07275390625, "learning_rate": 6.625197126336078e-07, "loss": 0.0075, "reward": 1.6549501419067383, "reward_std": 0.1914709359407425, "rewards/accuracy_reward_stage2": 0.6705750823020935, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1927 }, { "completion_length": 9.234375, "epoch": 0.3378307341860873, "grad_norm": 24.791875823236783, "kl": 0.1142578125, "learning_rate": 6.623444892237603e-07, "loss": 0.0457, "reward": 1.515584945678711, "reward_std": 0.3138820230960846, "rewards/accuracy_reward_stage2": 0.6405848264694214, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1928 }, { "completion_length": 9.28125, "epoch": 0.3380059575959348, "grad_norm": 18.311535553437952, "kl": 0.1259765625, "learning_rate": 6.621692658139128e-07, "loss": 0.0506, "reward": 1.6065380573272705, "reward_std": 0.2451055496931076, "rewards/accuracy_reward_stage2": 0.606537938117981, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1929 }, { "completion_length": 9.21875, "epoch": 0.33818118100578237, "grad_norm": 21.444373945493457, "kl": 0.140625, "learning_rate": 6.619940424040652e-07, "loss": 0.012, "reward": 1.619673728942871, "reward_std": 0.22012469172477722, "rewards/accuracy_reward_stage2": 0.6352988481521606, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1930 }, { "completion_length": 12.78125, "epoch": 0.3383564044156299, "grad_norm": 25.47223293939637, "kl": 0.466796875, "learning_rate": 6.618188189942176e-07, "loss": 0.1302, "reward": 1.3783800601959229, "reward_std": 0.23717038333415985, "rewards/accuracy_reward_stage2": 0.5346300005912781, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1931 }, { "completion_length": 13.234375, "epoch": 0.33853162782547747, "grad_norm": 23.782077737361416, "kl": 0.061767578125, "learning_rate": 6.6164359558437e-07, "loss": -0.0195, "reward": 1.6181929111480713, "reward_std": 0.202871173620224, "rewards/accuracy_reward_stage2": 0.6338179111480713, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1932 }, { "completion_length": 8.15625, "epoch": 0.338706851235325, "grad_norm": 16.914909837285762, "kl": 0.09326171875, "learning_rate": 6.614683721745224e-07, "loss": 0.0372, "reward": 1.4998173713684082, "reward_std": 0.11330730468034744, "rewards/accuracy_reward_stage2": 0.749817430973053, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 1933 }, { "completion_length": 11.15625, "epoch": 0.3388820746451726, "grad_norm": 18.585863567630085, "kl": 0.046630859375, "learning_rate": 6.612931487646749e-07, "loss": 0.0186, "reward": 1.6778485774993896, "reward_std": 0.18502506613731384, "rewards/accuracy_reward_stage2": 0.6778485774993896, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1934 }, { "completion_length": 9.90625, "epoch": 0.33905729805502016, "grad_norm": 19.867172195955273, "kl": 0.14453125, "learning_rate": 6.611179253548273e-07, "loss": -0.0022, "reward": 1.5232343673706055, "reward_std": 0.33503860235214233, "rewards/accuracy_reward_stage2": 0.554484486579895, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1935 }, { "completion_length": 9.46875, "epoch": 0.3392325214648677, "grad_norm": 20.729060743505748, "kl": 0.134765625, "learning_rate": 6.609427019449798e-07, "loss": 0.0323, "reward": 1.7128856182098389, "reward_std": 0.24648074805736542, "rewards/accuracy_reward_stage2": 0.7285105586051941, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1936 }, { "completion_length": 9.734375, "epoch": 0.33940774487471526, "grad_norm": 16.46205351952286, "kl": 0.08203125, "learning_rate": 6.607674785351323e-07, "loss": 0.0328, "reward": 1.7479907274246216, "reward_std": 0.10182836651802063, "rewards/accuracy_reward_stage2": 0.7479907274246216, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1937 }, { "completion_length": 8.078125, "epoch": 0.3395829682845628, "grad_norm": 12.568074846483034, "kl": 0.0625, "learning_rate": 6.605922551252847e-07, "loss": -0.0192, "reward": 1.6138134002685547, "reward_std": 0.1896343231201172, "rewards/accuracy_reward_stage2": 0.6294383406639099, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1938 }, { "completion_length": 6.703125, "epoch": 0.33975819169441035, "grad_norm": 25.873183430795024, "kl": 0.0966796875, "learning_rate": 6.604170317154372e-07, "loss": 0.0387, "reward": 1.7632300853729248, "reward_std": 0.2887365520000458, "rewards/accuracy_reward_stage2": 0.7632301449775696, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1939 }, { "completion_length": 9.296875, "epoch": 0.33993341510425795, "grad_norm": 15.34207546952785, "kl": 0.056396484375, "learning_rate": 6.602418083055897e-07, "loss": -0.0064, "reward": 1.7274775505065918, "reward_std": 0.12433339655399323, "rewards/accuracy_reward_stage2": 0.7431026101112366, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1940 }, { "completion_length": 10.890625, "epoch": 0.3401086385141055, "grad_norm": 24.915169950161932, "kl": 0.1796875, "learning_rate": 6.600665848957421e-07, "loss": 0.0279, "reward": 1.5926790237426758, "reward_std": 0.31510692834854126, "rewards/accuracy_reward_stage2": 0.733303964138031, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1941 }, { "completion_length": 12.625, "epoch": 0.34028386192395305, "grad_norm": 19.149575995205236, "kl": 0.11474609375, "learning_rate": 6.598913614858945e-07, "loss": 0.0459, "reward": 1.5549436807632446, "reward_std": 0.2327888011932373, "rewards/accuracy_reward_stage2": 0.5549436211585999, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1942 }, { "completion_length": 9.453125, "epoch": 0.3404590853338006, "grad_norm": 22.51574542315056, "kl": 0.1376953125, "learning_rate": 6.597161380760469e-07, "loss": 0.0108, "reward": 1.5218536853790283, "reward_std": 0.2579251527786255, "rewards/accuracy_reward_stage2": 0.5374786853790283, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1943 }, { "completion_length": 12.578125, "epoch": 0.34063430874364814, "grad_norm": 27.877815424585087, "kl": 0.10400390625, "learning_rate": 6.595409146661993e-07, "loss": -0.0122, "reward": 1.3764312267303467, "reward_std": 0.36347371339797974, "rewards/accuracy_reward_stage2": 0.5326813459396362, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1944 }, { "completion_length": 9.84375, "epoch": 0.3408095321534957, "grad_norm": 18.795007676878427, "kl": 0.09033203125, "learning_rate": 6.593656912563518e-07, "loss": -0.0838, "reward": 1.686873197555542, "reward_std": 0.23738035559654236, "rewards/accuracy_reward_stage2": 0.733748197555542, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1945 }, { "completion_length": 16.25, "epoch": 0.3409847555633433, "grad_norm": 30.824319726042305, "kl": 0.1259765625, "learning_rate": 6.591904678465042e-07, "loss": 0.0203, "reward": 1.3881264925003052, "reward_std": 0.24407415091991425, "rewards/accuracy_reward_stage2": 0.40375152230262756, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1946 }, { "completion_length": 9.609375, "epoch": 0.34115997897319084, "grad_norm": 19.57452275526612, "kl": 0.05859375, "learning_rate": 6.590152444366567e-07, "loss": 0.0019, "reward": 1.3171931505203247, "reward_std": 0.26795125007629395, "rewards/accuracy_reward_stage2": 0.3328181207180023, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1947 }, { "completion_length": 10.3125, "epoch": 0.3413352023830384, "grad_norm": 16.885076206570435, "kl": 0.09326171875, "learning_rate": 6.588400210268091e-07, "loss": -0.0069, "reward": 1.3556816577911377, "reward_std": 0.22130826115608215, "rewards/accuracy_reward_stage2": 0.6213066577911377, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1948 }, { "completion_length": 9.765625, "epoch": 0.34151042579288593, "grad_norm": 17.591580716772768, "kl": 0.1484375, "learning_rate": 6.586647976169616e-07, "loss": 0.015, "reward": 1.7992362976074219, "reward_std": 0.22967034578323364, "rewards/accuracy_reward_stage2": 0.8148613572120667, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1949 }, { "completion_length": 11.25, "epoch": 0.3416856492027335, "grad_norm": 22.07411498807725, "kl": 0.05859375, "learning_rate": 6.584895742071141e-07, "loss": -0.0145, "reward": 1.4841227531433105, "reward_std": 0.2506176233291626, "rewards/accuracy_reward_stage2": 0.6247477531433105, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1950 }, { "completion_length": 13.671875, "epoch": 0.341860872612581, "grad_norm": 19.24929849784772, "kl": 0.036376953125, "learning_rate": 6.583143507972665e-07, "loss": 0.0146, "reward": 1.4395618438720703, "reward_std": 0.21534651517868042, "rewards/accuracy_reward_stage2": 0.4395618140697479, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1951 }, { "completion_length": 10.953125, "epoch": 0.34203609602242857, "grad_norm": 20.779179875174233, "kl": 0.234375, "learning_rate": 6.581391273874189e-07, "loss": 0.024, "reward": 1.5351721048355103, "reward_std": 0.23412840068340302, "rewards/accuracy_reward_stage2": 0.7070470452308655, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1952 }, { "completion_length": 12.8125, "epoch": 0.3422113194322762, "grad_norm": 22.998124530730706, "kl": 0.203125, "learning_rate": 6.579639039775714e-07, "loss": -0.0068, "reward": 1.3460776805877686, "reward_std": 0.3017346262931824, "rewards/accuracy_reward_stage2": 0.5023276805877686, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1953 }, { "completion_length": 7.359375, "epoch": 0.3423865428421237, "grad_norm": 15.53066341719061, "kl": 0.1328125, "learning_rate": 6.577886805677238e-07, "loss": -0.0584, "reward": 1.7026607990264893, "reward_std": 0.23679913580417633, "rewards/accuracy_reward_stage2": 0.7495357990264893, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 1954 }, { "completion_length": 9.453125, "epoch": 0.34256176625197127, "grad_norm": 18.46816749799684, "kl": 0.08544921875, "learning_rate": 6.576134571578763e-07, "loss": 0.034, "reward": 1.403747320175171, "reward_std": 0.22259891033172607, "rewards/accuracy_reward_stage2": 0.5287472605705261, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1955 }, { "completion_length": 18.25, "epoch": 0.3427369896618188, "grad_norm": 18.38930419158117, "kl": 0.068359375, "learning_rate": 6.574382337480288e-07, "loss": -0.0167, "reward": 1.649796962738037, "reward_std": 0.19626963138580322, "rewards/accuracy_reward_stage2": 0.6654220223426819, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1956 }, { "completion_length": 11.015625, "epoch": 0.34291221307166636, "grad_norm": 18.69463465641868, "kl": 0.16015625, "learning_rate": 6.572630103381811e-07, "loss": -0.0242, "reward": 1.4900152683258057, "reward_std": 0.23229768872261047, "rewards/accuracy_reward_stage2": 0.5212653279304504, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1957 }, { "completion_length": 8.96875, "epoch": 0.3430874364815139, "grad_norm": 16.994028473834337, "kl": 0.1416015625, "learning_rate": 6.570877869283336e-07, "loss": 0.0123, "reward": 1.7630869150161743, "reward_std": 0.23732198774814606, "rewards/accuracy_reward_stage2": 0.7787119150161743, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1958 }, { "completion_length": 12.921875, "epoch": 0.3432626598913615, "grad_norm": 48.14582209676718, "kl": 0.19140625, "learning_rate": 6.56912563518486e-07, "loss": -0.0354, "reward": 1.400420904159546, "reward_std": 0.29775795340538025, "rewards/accuracy_reward_stage2": 0.5722959041595459, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 1959 }, { "completion_length": 9.828125, "epoch": 0.34343788330120906, "grad_norm": 21.817718975240545, "kl": 0.1328125, "learning_rate": 6.567373401086385e-07, "loss": 0.0532, "reward": 1.6833243370056152, "reward_std": 0.22717389464378357, "rewards/accuracy_reward_stage2": 0.6833243370056152, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1960 }, { "completion_length": 8.609375, "epoch": 0.3436131067110566, "grad_norm": 10.814326403229101, "kl": 0.0162353515625, "learning_rate": 6.56562116698791e-07, "loss": 0.0065, "reward": 1.532088279724121, "reward_std": 0.08120846003293991, "rewards/accuracy_reward_stage2": 0.6570882797241211, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1961 }, { "completion_length": 14.84375, "epoch": 0.34378833012090415, "grad_norm": 36.49634156631595, "kl": 0.09716796875, "learning_rate": 6.563868932889433e-07, "loss": 0.001, "reward": 1.3071386814117432, "reward_std": 0.2545177638530731, "rewards/accuracy_reward_stage2": 0.32276368141174316, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1962 }, { "completion_length": 13.1875, "epoch": 0.3439635535307517, "grad_norm": 16.061314507901063, "kl": 0.076171875, "learning_rate": 6.562116698790958e-07, "loss": 0.0062, "reward": 1.3792309761047363, "reward_std": 0.1264325976371765, "rewards/accuracy_reward_stage2": 0.39485591650009155, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1963 }, { "completion_length": 12.078125, "epoch": 0.34413877694059924, "grad_norm": 23.750220020402494, "kl": 0.1357421875, "learning_rate": 6.560364464692482e-07, "loss": 0.01, "reward": 1.652053713798523, "reward_std": 0.29191863536834717, "rewards/accuracy_reward_stage2": 0.667678713798523, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1964 }, { "completion_length": 11.96875, "epoch": 0.34431400035044685, "grad_norm": 33.41616899169439, "kl": 0.12890625, "learning_rate": 6.558612230594007e-07, "loss": 0.0241, "reward": 1.4442522525787354, "reward_std": 0.23655974864959717, "rewards/accuracy_reward_stage2": 0.45987722277641296, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1965 }, { "completion_length": 6.90625, "epoch": 0.3444892237602944, "grad_norm": 21.254282550862037, "kl": 0.07470703125, "learning_rate": 6.556859996495532e-07, "loss": 0.0009, "reward": 1.7194864749908447, "reward_std": 0.23831090331077576, "rewards/accuracy_reward_stage2": 0.7351114749908447, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1966 }, { "completion_length": 9.4375, "epoch": 0.34466444717014194, "grad_norm": 15.179379706752174, "kl": 0.1240234375, "learning_rate": 6.555107762397056e-07, "loss": -0.0343, "reward": 1.696099877357483, "reward_std": 0.13108253479003906, "rewards/accuracy_reward_stage2": 0.8523498773574829, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1967 }, { "completion_length": 9.265625, "epoch": 0.3448396705799895, "grad_norm": 30.543324244072327, "kl": 0.181640625, "learning_rate": 6.553355528298581e-07, "loss": 0.0285, "reward": 1.5463353395462036, "reward_std": 0.2797982692718506, "rewards/accuracy_reward_stage2": 0.5619603395462036, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1968 }, { "completion_length": 20.390625, "epoch": 0.34501489398983703, "grad_norm": 16.82444448290589, "kl": 0.130859375, "learning_rate": 6.551603294200106e-07, "loss": 0.011, "reward": 1.4820425510406494, "reward_std": 0.1993536353111267, "rewards/accuracy_reward_stage2": 0.7476676106452942, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1969 }, { "completion_length": 12.984375, "epoch": 0.3451901173996846, "grad_norm": 11.0144221558119, "kl": 0.0498046875, "learning_rate": 6.549851060101629e-07, "loss": 0.0198, "reward": 1.7429325580596924, "reward_std": 0.07267377525568008, "rewards/accuracy_reward_stage2": 0.7429325580596924, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1970 }, { "completion_length": 14.0625, "epoch": 0.34536534080953213, "grad_norm": 17.55120295527536, "kl": 0.08447265625, "learning_rate": 6.548098826003154e-07, "loss": -0.0103, "reward": 1.5645519495010376, "reward_std": 0.2282322496175766, "rewards/accuracy_reward_stage2": 0.5801768898963928, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1971 }, { "completion_length": 13.8125, "epoch": 0.34554056421937973, "grad_norm": 16.79040991776876, "kl": 0.05859375, "learning_rate": 6.546346591904677e-07, "loss": -0.0208, "reward": 1.3323101997375488, "reward_std": 0.2069927155971527, "rewards/accuracy_reward_stage2": 0.47293511033058167, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1972 }, { "completion_length": 13.140625, "epoch": 0.3457157876292273, "grad_norm": 19.771904263657262, "kl": 0.2431640625, "learning_rate": 6.544594357806202e-07, "loss": 0.0972, "reward": 1.4394879341125488, "reward_std": 0.153305783867836, "rewards/accuracy_reward_stage2": 0.564487874507904, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1973 }, { "completion_length": 9.703125, "epoch": 0.3458910110390748, "grad_norm": 25.667897310449245, "kl": 0.09130859375, "learning_rate": 6.542842123707727e-07, "loss": -0.0014, "reward": 1.5841963291168213, "reward_std": 0.2587115168571472, "rewards/accuracy_reward_stage2": 0.7248212695121765, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1974 }, { "completion_length": 7.484375, "epoch": 0.34606623444892237, "grad_norm": 22.24575147622979, "kl": 0.25390625, "learning_rate": 6.541089889609251e-07, "loss": 0.0632, "reward": 1.5621519088745117, "reward_std": 0.335077166557312, "rewards/accuracy_reward_stage2": 0.7027768492698669, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1975 }, { "completion_length": 10.453125, "epoch": 0.3462414578587699, "grad_norm": 24.44092705807922, "kl": 0.04736328125, "learning_rate": 6.539337655510776e-07, "loss": -0.014, "reward": 1.655135154724121, "reward_std": 0.24762673676013947, "rewards/accuracy_reward_stage2": 0.6707600951194763, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1976 }, { "completion_length": 11.25, "epoch": 0.34641668126861747, "grad_norm": 23.469688804357215, "kl": 0.25, "learning_rate": 6.537585421412301e-07, "loss": 0.0116, "reward": 1.7019398212432861, "reward_std": 0.2970326542854309, "rewards/accuracy_reward_stage2": 0.8581898808479309, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 1977 }, { "completion_length": 9.21875, "epoch": 0.34659190467846507, "grad_norm": 18.475356845854026, "kl": 0.0595703125, "learning_rate": 6.535833187313825e-07, "loss": 0.0239, "reward": 1.7104763984680176, "reward_std": 0.14170876145362854, "rewards/accuracy_reward_stage2": 0.7104763984680176, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1978 }, { "completion_length": 9.484375, "epoch": 0.3467671280883126, "grad_norm": 12.945405309371132, "kl": 0.08935546875, "learning_rate": 6.53408095321535e-07, "loss": 0.0357, "reward": 1.639056921005249, "reward_std": 0.06409046053886414, "rewards/accuracy_reward_stage2": 0.6390569806098938, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1979 }, { "completion_length": 14.03125, "epoch": 0.34694235149816016, "grad_norm": 18.91283009725862, "kl": 0.232421875, "learning_rate": 6.532328719116874e-07, "loss": 0.0489, "reward": 1.1489617824554443, "reward_std": 0.1828662008047104, "rewards/accuracy_reward_stage2": 0.4145868420600891, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 1980 }, { "completion_length": 11.875, "epoch": 0.3471175749080077, "grad_norm": 25.379163351883836, "kl": 0.130859375, "learning_rate": 6.530576485018399e-07, "loss": 0.0522, "reward": 1.517045259475708, "reward_std": 0.17672033607959747, "rewards/accuracy_reward_stage2": 0.517045259475708, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1981 }, { "completion_length": 10.5625, "epoch": 0.34729279831785526, "grad_norm": 24.248118802036352, "kl": 0.2412109375, "learning_rate": 6.528824250919922e-07, "loss": -0.0996, "reward": 1.5262937545776367, "reward_std": 0.3614034652709961, "rewards/accuracy_reward_stage2": 0.6200437545776367, "rewards/format_reward_stage1_pointerpad": 0.90625, "scores/accuracy_reward_stage2": 0.90625, "step": 1982 }, { "completion_length": 14.109375, "epoch": 0.3474680217277028, "grad_norm": 22.904190961498777, "kl": 0.10400390625, "learning_rate": 6.527072016821446e-07, "loss": 0.0126, "reward": 1.594224214553833, "reward_std": 0.2510741651058197, "rewards/accuracy_reward_stage2": 0.6098491549491882, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1983 }, { "completion_length": 9.6875, "epoch": 0.34764324513755035, "grad_norm": 21.44313358526625, "kl": 0.1826171875, "learning_rate": 6.525319782722971e-07, "loss": 0.0729, "reward": 1.6885744333267212, "reward_std": 0.2376718968153, "rewards/accuracy_reward_stage2": 0.6885744333267212, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1984 }, { "completion_length": 8.34375, "epoch": 0.34781846854739795, "grad_norm": 22.402962231988234, "kl": 0.09228515625, "learning_rate": 6.523567548624496e-07, "loss": 0.0054, "reward": 1.8162912130355835, "reward_std": 0.28567641973495483, "rewards/accuracy_reward_stage2": 0.8319162130355835, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1985 }, { "completion_length": 34.921875, "epoch": 0.3479936919572455, "grad_norm": 153.3247594937022, "kl": 1.28125, "learning_rate": 6.52181531452602e-07, "loss": 0.5154, "reward": 1.6300783157348633, "reward_std": 0.06343966722488403, "rewards/accuracy_reward_stage2": 0.7550783753395081, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 1986 }, { "completion_length": 9.296875, "epoch": 0.34816891536709305, "grad_norm": 23.518916407176633, "kl": 0.11474609375, "learning_rate": 6.520063080427545e-07, "loss": -0.0186, "reward": 1.42463219165802, "reward_std": 0.3672451972961426, "rewards/accuracy_reward_stage2": 0.4558822214603424, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1987 }, { "completion_length": 8.921875, "epoch": 0.3483441387769406, "grad_norm": 19.948904765481284, "kl": 0.1728515625, "learning_rate": 6.518310846329069e-07, "loss": -0.0049, "reward": 1.6624736785888672, "reward_std": 0.17071276903152466, "rewards/accuracy_reward_stage2": 0.6937235593795776, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1988 }, { "completion_length": 8.796875, "epoch": 0.34851936218678814, "grad_norm": 26.28195016419193, "kl": 0.0908203125, "learning_rate": 6.516558612230594e-07, "loss": 0.0362, "reward": 1.5121607780456543, "reward_std": 0.19263972342014313, "rewards/accuracy_reward_stage2": 0.5121607184410095, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1989 }, { "completion_length": 8.796875, "epoch": 0.3486945855966357, "grad_norm": 20.032310841077322, "kl": 0.0908203125, "learning_rate": 6.514806378132119e-07, "loss": -0.0245, "reward": 1.4932280778884888, "reward_std": 0.24793201684951782, "rewards/accuracy_reward_stage2": 0.5244780778884888, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 1990 }, { "completion_length": 10.40625, "epoch": 0.3488698090064833, "grad_norm": 14.738256721826753, "kl": 0.050537109375, "learning_rate": 6.513054144033643e-07, "loss": -0.0239, "reward": 1.6377990245819092, "reward_std": 0.10622736066579819, "rewards/accuracy_reward_stage2": 0.6534240245819092, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1991 }, { "completion_length": 17.265625, "epoch": 0.34904503241633084, "grad_norm": 20.695338367248944, "kl": 0.0791015625, "learning_rate": 6.511301909935167e-07, "loss": -0.0121, "reward": 1.6008673906326294, "reward_std": 0.2019246220588684, "rewards/accuracy_reward_stage2": 0.6164922714233398, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1992 }, { "completion_length": 9.921875, "epoch": 0.3492202558261784, "grad_norm": 17.42855634410852, "kl": 0.099609375, "learning_rate": 6.509549675836692e-07, "loss": -0.0043, "reward": 1.5164021253585815, "reward_std": 0.18167875707149506, "rewards/accuracy_reward_stage2": 0.6570271253585815, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1993 }, { "completion_length": 11.921875, "epoch": 0.34939547923602593, "grad_norm": 16.76881427087333, "kl": 0.061767578125, "learning_rate": 6.507797441738216e-07, "loss": 0.0008, "reward": 1.6663645505905151, "reward_std": 0.11451227962970734, "rewards/accuracy_reward_stage2": 0.6819895505905151, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1994 }, { "completion_length": 9.796875, "epoch": 0.3495707026458735, "grad_norm": 20.48561906353898, "kl": 0.154296875, "learning_rate": 6.50604520763974e-07, "loss": 0.0176, "reward": 1.487541913986206, "reward_std": 0.21980035305023193, "rewards/accuracy_reward_stage2": 0.5031670331954956, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1995 }, { "completion_length": 7.8125, "epoch": 0.349745926055721, "grad_norm": 16.811939673960403, "kl": 0.1767578125, "learning_rate": 6.504292973541264e-07, "loss": -0.1069, "reward": 1.3587661981582642, "reward_std": 0.2528875470161438, "rewards/accuracy_reward_stage2": 0.43689125776290894, "rewards/format_reward_stage1_pointerpad": 0.921875, "scores/accuracy_reward_stage2": 0.921875, "step": 1996 }, { "completion_length": 10.5625, "epoch": 0.3499211494655686, "grad_norm": 19.56031623481252, "kl": 0.04931640625, "learning_rate": 6.502540739442789e-07, "loss": 0.0197, "reward": 1.7224457263946533, "reward_std": 0.22622840106487274, "rewards/accuracy_reward_stage2": 0.7224457263946533, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 1997 }, { "completion_length": 15.6875, "epoch": 0.3500963728754162, "grad_norm": 10.344808632978406, "kl": 0.0294189453125, "learning_rate": 6.500788505344314e-07, "loss": -0.0323, "reward": 1.6654764413833618, "reward_std": 0.1032496765255928, "rewards/accuracy_reward_stage2": 0.6811015009880066, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 1998 }, { "completion_length": 13.90625, "epoch": 0.3502715962852637, "grad_norm": 16.99481523909834, "kl": 0.24609375, "learning_rate": 6.499036271245838e-07, "loss": 0.0583, "reward": 1.5105576515197754, "reward_std": 0.16287124156951904, "rewards/accuracy_reward_stage2": 0.6511826515197754, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 1999 }, { "completion_length": 8.515625, "epoch": 0.35044681969511127, "grad_norm": 14.072725639459303, "kl": 0.08740234375, "learning_rate": 6.497284037147363e-07, "loss": 0.035, "reward": 1.741347074508667, "reward_std": 0.11879969388246536, "rewards/accuracy_reward_stage2": 0.7413470149040222, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2000 }, { "completion_length": 10.984375, "epoch": 0.3506220431049588, "grad_norm": 21.66948159691174, "kl": 0.11767578125, "learning_rate": 6.495531803048888e-07, "loss": 0.0061, "reward": 1.5786539316177368, "reward_std": 0.2800479829311371, "rewards/accuracy_reward_stage2": 0.5942789316177368, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2001 }, { "completion_length": 11.078125, "epoch": 0.35079726651480636, "grad_norm": 32.400792294857474, "kl": 0.0810546875, "learning_rate": 6.493779568950411e-07, "loss": 0.0325, "reward": 1.5315885543823242, "reward_std": 0.27468162775039673, "rewards/accuracy_reward_stage2": 0.656588613986969, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2002 }, { "completion_length": 10.875, "epoch": 0.3509724899246539, "grad_norm": 23.578523403709887, "kl": 0.25390625, "learning_rate": 6.492027334851936e-07, "loss": 0.0848, "reward": 1.4257640838623047, "reward_std": 0.25517600774765015, "rewards/accuracy_reward_stage2": 0.5663890838623047, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2003 }, { "completion_length": 19.703125, "epoch": 0.3511477133345015, "grad_norm": 20.596326020016303, "kl": 0.083984375, "learning_rate": 6.49027510075346e-07, "loss": 0.0336, "reward": 1.6645708084106445, "reward_std": 0.2337852567434311, "rewards/accuracy_reward_stage2": 0.6645709276199341, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2004 }, { "completion_length": 16.765625, "epoch": 0.35132293674434906, "grad_norm": 22.441290506296923, "kl": 0.337890625, "learning_rate": 6.488522866654985e-07, "loss": 0.0101, "reward": 1.3200299739837646, "reward_std": 0.30100393295288086, "rewards/accuracy_reward_stage2": 0.4919048845767975, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2005 }, { "completion_length": 15.25, "epoch": 0.3514981601541966, "grad_norm": 27.76775951792323, "kl": 0.051025390625, "learning_rate": 6.48677063255651e-07, "loss": 0.0205, "reward": 1.3646464347839355, "reward_std": 0.25721046328544617, "rewards/accuracy_reward_stage2": 0.6146464943885803, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2006 }, { "completion_length": 10.484375, "epoch": 0.35167338356404415, "grad_norm": 17.07704587908255, "kl": 0.0712890625, "learning_rate": 6.485018398458034e-07, "loss": -0.0116, "reward": 1.518690824508667, "reward_std": 0.1659468710422516, "rewards/accuracy_reward_stage2": 0.534315824508667, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2007 }, { "completion_length": 9.5625, "epoch": 0.3518486069738917, "grad_norm": 14.103071172763505, "kl": 0.07666015625, "learning_rate": 6.483266164359558e-07, "loss": 0.0306, "reward": 1.5295997858047485, "reward_std": 0.07649820297956467, "rewards/accuracy_reward_stage2": 0.5295997262001038, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2008 }, { "completion_length": 9.625, "epoch": 0.35202383038373924, "grad_norm": 18.747318004607546, "kl": 0.0673828125, "learning_rate": 6.481513930261083e-07, "loss": 0.0271, "reward": 1.6228842735290527, "reward_std": 0.16448625922203064, "rewards/accuracy_reward_stage2": 0.622884213924408, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2009 }, { "completion_length": 11.3125, "epoch": 0.35219905379358685, "grad_norm": 17.426671028984867, "kl": 0.1357421875, "learning_rate": 6.479761696162607e-07, "loss": 0.0544, "reward": 1.7684483528137207, "reward_std": 0.1293027698993683, "rewards/accuracy_reward_stage2": 0.7684484720230103, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2010 }, { "completion_length": 8.8125, "epoch": 0.3523742772034344, "grad_norm": 19.074783419869533, "kl": 0.1328125, "learning_rate": 6.478009462064131e-07, "loss": 0.0089, "reward": 1.3788700103759766, "reward_std": 0.24218647181987762, "rewards/accuracy_reward_stage2": 0.39449506998062134, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2011 }, { "completion_length": 17.234375, "epoch": 0.35254950061328194, "grad_norm": 19.175166347014933, "kl": 0.010498046875, "learning_rate": 6.476257227965655e-07, "loss": 0.0042, "reward": 1.7504546642303467, "reward_std": 0.12284188717603683, "rewards/accuracy_reward_stage2": 0.7504546046257019, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2012 }, { "completion_length": 10.0, "epoch": 0.3527247240231295, "grad_norm": 21.749683884591207, "kl": 0.134765625, "learning_rate": 6.47450499386718e-07, "loss": 0.02, "reward": 1.6623674631118774, "reward_std": 0.28688478469848633, "rewards/accuracy_reward_stage2": 0.6779924631118774, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2013 }, { "completion_length": 5.421875, "epoch": 0.35289994743297703, "grad_norm": 44.18443001251126, "kl": 0.52734375, "learning_rate": 6.472752759768705e-07, "loss": 0.122, "reward": 1.183675765991211, "reward_std": 0.20090191066265106, "rewards/accuracy_reward_stage2": 0.48055073618888855, "rewards/format_reward_stage1_pointerpad": 0.703125, "scores/accuracy_reward_stage2": 0.703125, "step": 2014 }, { "completion_length": 6.328125, "epoch": 0.3530751708428246, "grad_norm": 22.254088766217205, "kl": 0.06201171875, "learning_rate": 6.471000525670229e-07, "loss": 0.0248, "reward": 1.7993628978729248, "reward_std": 0.1816439926624298, "rewards/accuracy_reward_stage2": 0.7993630170822144, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2015 }, { "completion_length": 7.828125, "epoch": 0.3532503942526722, "grad_norm": 22.0690082720116, "kl": 0.04296875, "learning_rate": 6.469248291571754e-07, "loss": 0.0172, "reward": 1.7569329738616943, "reward_std": 0.15919101238250732, "rewards/accuracy_reward_stage2": 0.7569329142570496, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2016 }, { "completion_length": 12.5, "epoch": 0.35342561766251973, "grad_norm": 257.73596936766705, "kl": 1.484375, "learning_rate": 6.467496057473279e-07, "loss": 0.5931, "reward": 1.65625, "reward_std": 0.1735912710428238, "rewards/accuracy_reward_stage2": 0.90625, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2017 }, { "completion_length": 16.328125, "epoch": 0.3536008410723673, "grad_norm": 22.55185782932836, "kl": 0.12890625, "learning_rate": 6.465743823374803e-07, "loss": 0.0516, "reward": 1.6566040515899658, "reward_std": 0.2475588023662567, "rewards/accuracy_reward_stage2": 0.656603991985321, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2018 }, { "completion_length": 11.921875, "epoch": 0.3537760644822148, "grad_norm": 18.32946455298761, "kl": 0.11181640625, "learning_rate": 6.463991589276328e-07, "loss": -0.0436, "reward": 1.503852367401123, "reward_std": 0.3397676944732666, "rewards/accuracy_reward_stage2": 0.5351022481918335, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2019 }, { "completion_length": 20.140625, "epoch": 0.35395128789206237, "grad_norm": 23.591232229481246, "kl": 0.18359375, "learning_rate": 6.462239355177852e-07, "loss": 0.0734, "reward": 1.4747546911239624, "reward_std": 0.3044975996017456, "rewards/accuracy_reward_stage2": 0.5997546911239624, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2020 }, { "completion_length": 10.953125, "epoch": 0.3541265113019099, "grad_norm": 49.864613932302575, "kl": 0.1708984375, "learning_rate": 6.460487121079375e-07, "loss": 0.0685, "reward": 1.5495915412902832, "reward_std": 0.35907381772994995, "rewards/accuracy_reward_stage2": 0.549591600894928, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2021 }, { "completion_length": 5.390625, "epoch": 0.35430173471175747, "grad_norm": 24.78394195461497, "kl": 0.037841796875, "learning_rate": 6.4587348869809e-07, "loss": 0.0151, "reward": 1.6354167461395264, "reward_std": 0.303213894367218, "rewards/accuracy_reward_stage2": 0.6354166865348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2022 }, { "completion_length": 17.65625, "epoch": 0.35447695812160507, "grad_norm": 15.952665146989885, "kl": 0.042724609375, "learning_rate": 6.456982652882424e-07, "loss": -0.0504, "reward": 1.4603793621063232, "reward_std": 0.10579296946525574, "rewards/accuracy_reward_stage2": 0.49162930250167847, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2023 }, { "completion_length": 9.015625, "epoch": 0.3546521815314526, "grad_norm": 23.73446125497201, "kl": 0.08642578125, "learning_rate": 6.455230418783949e-07, "loss": -0.0072, "reward": 1.5714523792266846, "reward_std": 0.2881009578704834, "rewards/accuracy_reward_stage2": 0.5870773196220398, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2024 }, { "completion_length": 11.75, "epoch": 0.35482740494130016, "grad_norm": 19.071701378032504, "kl": 0.0228271484375, "learning_rate": 6.453478184685473e-07, "loss": 0.0091, "reward": 1.512235164642334, "reward_std": 0.1992071270942688, "rewards/accuracy_reward_stage2": 0.5122351050376892, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2025 }, { "completion_length": 5.6875, "epoch": 0.3550026283511477, "grad_norm": 13.827528845383378, "kl": 0.1357421875, "learning_rate": 6.451725950586998e-07, "loss": 0.0541, "reward": 1.8923611640930176, "reward_std": 0.1601068675518036, "rewards/accuracy_reward_stage2": 0.8923611640930176, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2026 }, { "completion_length": 8.21875, "epoch": 0.35517785176099526, "grad_norm": 21.877932825400908, "kl": 0.125, "learning_rate": 6.449973716488523e-07, "loss": -0.0149, "reward": 1.328125, "reward_std": 0.308285653591156, "rewards/accuracy_reward_stage2": 0.359375, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2027 }, { "completion_length": 9.0625, "epoch": 0.3553530751708428, "grad_norm": 21.868545535759246, "kl": 0.1044921875, "learning_rate": 6.448221482390047e-07, "loss": -0.0024, "reward": 1.6145799160003662, "reward_std": 0.2504516839981079, "rewards/accuracy_reward_stage2": 0.6302049160003662, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2028 }, { "completion_length": 8.5, "epoch": 0.3555282985806904, "grad_norm": 34.47660911860323, "kl": 0.2890625, "learning_rate": 6.446469248291572e-07, "loss": 0.1158, "reward": 1.6426146030426025, "reward_std": 0.18083617091178894, "rewards/accuracy_reward_stage2": 0.7676145434379578, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2029 }, { "completion_length": 12.140625, "epoch": 0.35570352199053795, "grad_norm": 28.346024278896905, "kl": 0.09130859375, "learning_rate": 6.444717014193097e-07, "loss": -0.0078, "reward": 1.441169023513794, "reward_std": 0.23020276427268982, "rewards/accuracy_reward_stage2": 0.45679396390914917, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2030 }, { "completion_length": 10.78125, "epoch": 0.3558787454003855, "grad_norm": 19.90543622562572, "kl": 0.1591796875, "learning_rate": 6.44296478009462e-07, "loss": -0.0228, "reward": 1.4047634601593018, "reward_std": 0.3020269274711609, "rewards/accuracy_reward_stage2": 0.5610134601593018, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2031 }, { "completion_length": 7.28125, "epoch": 0.35605396881023305, "grad_norm": 29.985663372852073, "kl": 0.19140625, "learning_rate": 6.441212545996145e-07, "loss": 0.0768, "reward": 1.436646819114685, "reward_std": 0.1707455813884735, "rewards/accuracy_reward_stage2": 0.43664681911468506, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2032 }, { "completion_length": 9.734375, "epoch": 0.3562291922200806, "grad_norm": 41.73996181885468, "kl": 0.2109375, "learning_rate": 6.439460311897668e-07, "loss": 0.0843, "reward": 1.631887674331665, "reward_std": 0.20773212611675262, "rewards/accuracy_reward_stage2": 0.631887674331665, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2033 }, { "completion_length": 10.828125, "epoch": 0.35640441562992814, "grad_norm": 16.95317734365735, "kl": 0.0859375, "learning_rate": 6.437708077799193e-07, "loss": -0.0796, "reward": 1.5492008924484253, "reward_std": 0.2339085042476654, "rewards/accuracy_reward_stage2": 0.5960758924484253, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2034 }, { "completion_length": 17.359375, "epoch": 0.3565796390397757, "grad_norm": 24.50093893215538, "kl": 0.1943359375, "learning_rate": 6.435955843700718e-07, "loss": 0.0248, "reward": 1.3570654392242432, "reward_std": 0.1519032120704651, "rewards/accuracy_reward_stage2": 0.5133154392242432, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2035 }, { "completion_length": 10.984375, "epoch": 0.3567548624496233, "grad_norm": 22.809359510094048, "kl": 0.052490234375, "learning_rate": 6.434203609602242e-07, "loss": 0.021, "reward": 1.6798628568649292, "reward_std": 0.17061826586723328, "rewards/accuracy_reward_stage2": 0.679862916469574, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2036 }, { "completion_length": 8.921875, "epoch": 0.35693008585947084, "grad_norm": 14.822723100907085, "kl": 0.0732421875, "learning_rate": 6.432451375503767e-07, "loss": 0.0293, "reward": 1.671637773513794, "reward_std": 0.1526038646697998, "rewards/accuracy_reward_stage2": 0.671637773513794, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2037 }, { "completion_length": 10.640625, "epoch": 0.3571053092693184, "grad_norm": 17.16508925671016, "kl": 0.0830078125, "learning_rate": 6.430699141405292e-07, "loss": 0.0015, "reward": 1.493004322052002, "reward_std": 0.2090732753276825, "rewards/accuracy_reward_stage2": 0.633629322052002, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2038 }, { "completion_length": 9.0, "epoch": 0.35728053267916593, "grad_norm": 22.10614423154803, "kl": 0.275390625, "learning_rate": 6.428946907306816e-07, "loss": 0.068, "reward": 1.3667113780975342, "reward_std": 0.2098403126001358, "rewards/accuracy_reward_stage2": 0.6479613780975342, "rewards/format_reward_stage1_pointerpad": 0.71875, "scores/accuracy_reward_stage2": 0.71875, "step": 2039 }, { "completion_length": 13.5625, "epoch": 0.3574557560890135, "grad_norm": 29.74656037361351, "kl": 0.2041015625, "learning_rate": 6.427194673208341e-07, "loss": 0.0818, "reward": 1.560437560081482, "reward_std": 0.19831281900405884, "rewards/accuracy_reward_stage2": 0.6854374408721924, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2040 }, { "completion_length": 6.9375, "epoch": 0.357630979498861, "grad_norm": 17.874500673445397, "kl": 0.048583984375, "learning_rate": 6.425442439109864e-07, "loss": 0.0195, "reward": 1.8301217555999756, "reward_std": 0.16129888594150543, "rewards/accuracy_reward_stage2": 0.8301217555999756, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2041 }, { "completion_length": 9.578125, "epoch": 0.3578062029087086, "grad_norm": 20.93377670278759, "kl": 0.10693359375, "learning_rate": 6.423690205011389e-07, "loss": -0.0112, "reward": 1.4852190017700195, "reward_std": 0.18758749961853027, "rewards/accuracy_reward_stage2": 0.5164690613746643, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2042 }, { "completion_length": 6.03125, "epoch": 0.3579814263185562, "grad_norm": 14.282698452510397, "kl": 0.1591796875, "learning_rate": 6.421937970912914e-07, "loss": -0.0633, "reward": 1.5824334621429443, "reward_std": 0.24984443187713623, "rewards/accuracy_reward_stage2": 0.6293083429336548, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2043 }, { "completion_length": 13.140625, "epoch": 0.3581566497284037, "grad_norm": 19.279786705660666, "kl": 0.062255859375, "learning_rate": 6.420185736814438e-07, "loss": -0.0426, "reward": 1.7357683181762695, "reward_std": 0.21315570175647736, "rewards/accuracy_reward_stage2": 0.7670182585716248, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2044 }, { "completion_length": 7.828125, "epoch": 0.35833187313825127, "grad_norm": 24.18269457328421, "kl": 0.1611328125, "learning_rate": 6.418433502715963e-07, "loss": 0.0551, "reward": 1.3229691982269287, "reward_std": 0.2945018708705902, "rewards/accuracy_reward_stage2": 0.4635942578315735, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2045 }, { "completion_length": 11.609375, "epoch": 0.3585070965480988, "grad_norm": 20.038499784926273, "kl": 0.1669921875, "learning_rate": 6.416681268617487e-07, "loss": 0.0451, "reward": 1.503136396408081, "reward_std": 0.2722621560096741, "rewards/accuracy_reward_stage2": 0.5187614560127258, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2046 }, { "completion_length": 8.796875, "epoch": 0.35868231995794636, "grad_norm": 28.963244713081757, "kl": 0.048583984375, "learning_rate": 6.414929034519011e-07, "loss": 0.0194, "reward": 1.3736588954925537, "reward_std": 0.31448879837989807, "rewards/accuracy_reward_stage2": 0.49865883588790894, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2047 }, { "completion_length": 7.09375, "epoch": 0.35885754336779396, "grad_norm": 17.347047787010148, "kl": 0.0869140625, "learning_rate": 6.413176800420536e-07, "loss": -0.0536, "reward": 1.2793877124786377, "reward_std": 0.25073882937431335, "rewards/accuracy_reward_stage2": 0.31063777208328247, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2048 }, { "completion_length": 9.765625, "epoch": 0.3590327667776415, "grad_norm": 54.80440220993308, "kl": 0.365234375, "learning_rate": 6.41142456632206e-07, "loss": 0.0837, "reward": 1.6083563566207886, "reward_std": 0.2655033469200134, "rewards/accuracy_reward_stage2": 0.6396063566207886, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2049 }, { "completion_length": 9.375, "epoch": 0.35920799018748906, "grad_norm": 20.464226476682267, "kl": 0.050537109375, "learning_rate": 6.409672332223585e-07, "loss": 0.0073, "reward": 1.6855225563049316, "reward_std": 0.21928083896636963, "rewards/accuracy_reward_stage2": 0.8261474370956421, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2050 }, { "completion_length": 11.78125, "epoch": 0.3593832135973366, "grad_norm": 21.28322369967594, "kl": 0.056640625, "learning_rate": 6.407920098125109e-07, "loss": -0.0611, "reward": 1.612959384918213, "reward_std": 0.29035934805870056, "rewards/accuracy_reward_stage2": 0.6598344445228577, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2051 }, { "completion_length": 32.5, "epoch": 0.35955843700718415, "grad_norm": 22.26627309796276, "kl": 0.07763671875, "learning_rate": 6.406167864026633e-07, "loss": -0.0091, "reward": 1.3275949954986572, "reward_std": 0.2904391586780548, "rewards/accuracy_reward_stage2": 0.34321993589401245, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2052 }, { "completion_length": 8.21875, "epoch": 0.3597336604170317, "grad_norm": 21.821006377049365, "kl": 0.10400390625, "learning_rate": 6.404415629928158e-07, "loss": 0.0415, "reward": 1.5978240966796875, "reward_std": 0.3226596415042877, "rewards/accuracy_reward_stage2": 0.5978240966796875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2053 }, { "completion_length": 7.203125, "epoch": 0.35990888382687924, "grad_norm": 19.665977379258152, "kl": 0.0712890625, "learning_rate": 6.402663395829683e-07, "loss": 0.0285, "reward": 1.5003751516342163, "reward_std": 0.20897655189037323, "rewards/accuracy_reward_stage2": 0.5003751516342163, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2054 }, { "completion_length": 15.40625, "epoch": 0.36008410723672685, "grad_norm": 15.361039973728369, "kl": 0.0947265625, "learning_rate": 6.400911161731207e-07, "loss": -0.0506, "reward": 1.9023044109344482, "reward_std": 0.2073962688446045, "rewards/accuracy_reward_stage2": 0.9335543513298035, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2055 }, { "completion_length": 8.0, "epoch": 0.3602593306465744, "grad_norm": 22.679126144790594, "kl": 0.1494140625, "learning_rate": 6.399158927632732e-07, "loss": 0.0599, "reward": 1.4556570053100586, "reward_std": 0.24635502696037292, "rewards/accuracy_reward_stage2": 0.4556569457054138, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2056 }, { "completion_length": 14.71875, "epoch": 0.36043455405642194, "grad_norm": 21.89217055198326, "kl": 0.07763671875, "learning_rate": 6.397406693534256e-07, "loss": 0.0311, "reward": 1.569153070449829, "reward_std": 0.21444562077522278, "rewards/accuracy_reward_stage2": 0.5691530704498291, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2057 }, { "completion_length": 14.8125, "epoch": 0.3606097774662695, "grad_norm": 64.99758626128994, "kl": 0.34375, "learning_rate": 6.395654459435781e-07, "loss": 0.0927, "reward": 1.2257872819900513, "reward_std": 0.29136985540390015, "rewards/accuracy_reward_stage2": 0.36641234159469604, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2058 }, { "completion_length": 8.5, "epoch": 0.36078500087611703, "grad_norm": 11.101316283181784, "kl": 0.0162353515625, "learning_rate": 6.393902225337305e-07, "loss": 0.0065, "reward": 1.7482197284698486, "reward_std": 0.1722995638847351, "rewards/accuracy_reward_stage2": 0.7482197284698486, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2059 }, { "completion_length": 23.21875, "epoch": 0.3609602242859646, "grad_norm": 21.513832405825056, "kl": 0.11279296875, "learning_rate": 6.392149991238828e-07, "loss": 0.0451, "reward": 1.5280723571777344, "reward_std": 0.28505265712738037, "rewards/accuracy_reward_stage2": 0.6530723571777344, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2060 }, { "completion_length": 31.703125, "epoch": 0.3611354476958122, "grad_norm": 109.66779035733514, "kl": 0.828125, "learning_rate": 6.390397757140353e-07, "loss": 0.2906, "reward": 1.1904970407485962, "reward_std": 0.07266878336668015, "rewards/accuracy_reward_stage2": 0.7061220407485962, "rewards/format_reward_stage1_pointerpad": 0.484375, "scores/accuracy_reward_stage2": 0.484375, "step": 2061 }, { "completion_length": 7.3125, "epoch": 0.36131067110565973, "grad_norm": 22.10805459794911, "kl": 0.126953125, "learning_rate": 6.388645523041878e-07, "loss": 0.0119, "reward": 1.5646607875823975, "reward_std": 0.32735133171081543, "rewards/accuracy_reward_stage2": 0.5802856683731079, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2062 }, { "completion_length": 13.421875, "epoch": 0.3614858945155073, "grad_norm": 15.694383901469001, "kl": 0.1015625, "learning_rate": 6.386893288943402e-07, "loss": 0.0189, "reward": 1.6609668731689453, "reward_std": 0.134820356965065, "rewards/accuracy_reward_stage2": 0.6765917539596558, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2063 }, { "completion_length": 11.171875, "epoch": 0.3616611179253548, "grad_norm": 17.573652360450506, "kl": 0.0791015625, "learning_rate": 6.385141054844927e-07, "loss": 0.0317, "reward": 1.6683963537216187, "reward_std": 0.09525197744369507, "rewards/accuracy_reward_stage2": 0.6683963537216187, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2064 }, { "completion_length": 8.765625, "epoch": 0.36183634133520237, "grad_norm": 8.684531722027758, "kl": 0.01708984375, "learning_rate": 6.383388820746451e-07, "loss": 0.0069, "reward": 1.521653175354004, "reward_std": 0.03974734991788864, "rewards/accuracy_reward_stage2": 0.5216532349586487, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2065 }, { "completion_length": 10.953125, "epoch": 0.3620115647450499, "grad_norm": 20.975044338571003, "kl": 0.1201171875, "learning_rate": 6.381636586647976e-07, "loss": 0.0152, "reward": 1.4553329944610596, "reward_std": 0.2626515030860901, "rewards/accuracy_reward_stage2": 0.47095784544944763, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2066 }, { "completion_length": 12.5, "epoch": 0.3621867881548975, "grad_norm": 15.954595962342632, "kl": 0.06005859375, "learning_rate": 6.379884352549501e-07, "loss": -0.0074, "reward": 1.5274360179901123, "reward_std": 0.19874641299247742, "rewards/accuracy_reward_stage2": 0.5430610179901123, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2067 }, { "completion_length": 10.21875, "epoch": 0.36236201156474507, "grad_norm": 24.982193204846315, "kl": 0.0927734375, "learning_rate": 6.378132118451025e-07, "loss": -0.0041, "reward": 1.5683791637420654, "reward_std": 0.25179433822631836, "rewards/accuracy_reward_stage2": 0.5840041637420654, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2068 }, { "completion_length": 10.546875, "epoch": 0.3625372349745926, "grad_norm": 14.140044137439842, "kl": 0.0751953125, "learning_rate": 6.37637988435255e-07, "loss": 0.0301, "reward": 1.5177831649780273, "reward_std": 0.13460445404052734, "rewards/accuracy_reward_stage2": 0.5177832245826721, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2069 }, { "completion_length": 11.296875, "epoch": 0.36271245838444016, "grad_norm": 21.19048020714746, "kl": 0.087890625, "learning_rate": 6.374627650254075e-07, "loss": -0.0514, "reward": 1.389535665512085, "reward_std": 0.2172594964504242, "rewards/accuracy_reward_stage2": 0.42078569531440735, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2070 }, { "completion_length": 11.71875, "epoch": 0.3628876817942877, "grad_norm": 25.686707689715494, "kl": 0.1533203125, "learning_rate": 6.372875416155598e-07, "loss": 0.0611, "reward": 1.4730861186981201, "reward_std": 0.22391179203987122, "rewards/accuracy_reward_stage2": 0.5980860590934753, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2071 }, { "completion_length": 16.390625, "epoch": 0.36306290520413526, "grad_norm": 16.942657071815216, "kl": 0.06396484375, "learning_rate": 6.371123182057122e-07, "loss": 0.0257, "reward": 1.5159791707992554, "reward_std": 0.18147556483745575, "rewards/accuracy_reward_stage2": 0.5159791707992554, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2072 }, { "completion_length": 12.84375, "epoch": 0.3632381286139828, "grad_norm": 15.66545943705507, "kl": 0.126953125, "learning_rate": 6.369370947958646e-07, "loss": 0.0508, "reward": 1.5079923868179321, "reward_std": 0.167510986328125, "rewards/accuracy_reward_stage2": 0.6329923868179321, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2073 }, { "completion_length": 9.265625, "epoch": 0.3634133520238304, "grad_norm": 18.989514067325118, "kl": 0.07568359375, "learning_rate": 6.367618713860171e-07, "loss": -0.0139, "reward": 1.6523466110229492, "reward_std": 0.2200305163860321, "rewards/accuracy_reward_stage2": 0.667971670627594, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2074 }, { "completion_length": 10.25, "epoch": 0.36358857543367795, "grad_norm": 17.02760984123236, "kl": 0.047119140625, "learning_rate": 6.365866479761696e-07, "loss": 0.0189, "reward": 1.6422874927520752, "reward_std": 0.0827837586402893, "rewards/accuracy_reward_stage2": 0.6422874927520752, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2075 }, { "completion_length": 11.25, "epoch": 0.3637637988435255, "grad_norm": 24.889511070713368, "kl": 0.1376953125, "learning_rate": 6.36411424566322e-07, "loss": 0.055, "reward": 1.5129289627075195, "reward_std": 0.2492441087961197, "rewards/accuracy_reward_stage2": 0.63792884349823, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2076 }, { "completion_length": 7.875, "epoch": 0.36393902225337305, "grad_norm": 19.01981301756976, "kl": 0.09619140625, "learning_rate": 6.362362011564745e-07, "loss": 0.0384, "reward": 1.56331467628479, "reward_std": 0.19022265076637268, "rewards/accuracy_reward_stage2": 0.5633147954940796, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2077 }, { "completion_length": 6.234375, "epoch": 0.3641142456632206, "grad_norm": 22.173410743694834, "kl": 0.12451171875, "learning_rate": 6.36060977746627e-07, "loss": 0.022, "reward": 1.4536991119384766, "reward_std": 0.2785697281360626, "rewards/accuracy_reward_stage2": 0.5943241119384766, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2078 }, { "completion_length": 12.8125, "epoch": 0.36428946907306814, "grad_norm": 10.779453254625155, "kl": 0.039306640625, "learning_rate": 6.358857543367794e-07, "loss": 0.0157, "reward": 1.6614583730697632, "reward_std": 0.08398155868053436, "rewards/accuracy_reward_stage2": 0.7864583730697632, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2079 }, { "completion_length": 10.921875, "epoch": 0.36446469248291574, "grad_norm": 17.875036599921554, "kl": 0.08935546875, "learning_rate": 6.357105309269319e-07, "loss": -0.0084, "reward": 1.8185806274414062, "reward_std": 0.20812779664993286, "rewards/accuracy_reward_stage2": 0.8342055082321167, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2080 }, { "completion_length": 10.140625, "epoch": 0.3646399158927633, "grad_norm": 19.66613709519755, "kl": 0.2119140625, "learning_rate": 6.355353075170842e-07, "loss": 0.0532, "reward": 1.3610090017318726, "reward_std": 0.2860793471336365, "rewards/accuracy_reward_stage2": 0.5016340017318726, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2081 }, { "completion_length": 11.328125, "epoch": 0.36481513930261084, "grad_norm": 14.665167537059556, "kl": 0.1142578125, "learning_rate": 6.353600841072367e-07, "loss": -0.0802, "reward": 1.7025963068008423, "reward_std": 0.1936970353126526, "rewards/accuracy_reward_stage2": 0.7494713068008423, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2082 }, { "completion_length": 10.453125, "epoch": 0.3649903627124584, "grad_norm": 13.06306517458348, "kl": 0.03125, "learning_rate": 6.351848606973892e-07, "loss": -0.0737, "reward": 1.8571877479553223, "reward_std": 0.1453656405210495, "rewards/accuracy_reward_stage2": 0.8884376883506775, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2083 }, { "completion_length": 7.96875, "epoch": 0.36516558612230593, "grad_norm": 36.78432158866736, "kl": 0.0291748046875, "learning_rate": 6.350096372875415e-07, "loss": 0.0117, "reward": 1.804444670677185, "reward_std": 0.17355109751224518, "rewards/accuracy_reward_stage2": 0.8044446706771851, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2084 }, { "completion_length": 9.984375, "epoch": 0.3653408095321535, "grad_norm": 15.207981324473641, "kl": 0.130859375, "learning_rate": 6.34834413877694e-07, "loss": -0.0248, "reward": 1.522946834564209, "reward_std": 0.19198226928710938, "rewards/accuracy_reward_stage2": 0.6791968941688538, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2085 }, { "completion_length": 12.609375, "epoch": 0.3655160329420011, "grad_norm": 16.909722121647306, "kl": 0.0859375, "learning_rate": 6.346591904678465e-07, "loss": 0.0344, "reward": 1.6806175708770752, "reward_std": 0.13519400358200073, "rewards/accuracy_reward_stage2": 0.6806175708770752, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2086 }, { "completion_length": 11.03125, "epoch": 0.3656912563518486, "grad_norm": 20.227633210535608, "kl": 0.1796875, "learning_rate": 6.344839670579989e-07, "loss": 0.0445, "reward": 1.688694953918457, "reward_std": 0.10501417517662048, "rewards/accuracy_reward_stage2": 0.829319953918457, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2087 }, { "completion_length": 17.25, "epoch": 0.3658664797616962, "grad_norm": 15.82704135197256, "kl": 0.0546875, "learning_rate": 6.343087436481514e-07, "loss": 0.0002, "reward": 1.5985554456710815, "reward_std": 0.22206175327301025, "rewards/accuracy_reward_stage2": 0.6141804456710815, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2088 }, { "completion_length": 15.765625, "epoch": 0.3660417031715437, "grad_norm": 18.337294972423447, "kl": 0.0869140625, "learning_rate": 6.341335202383038e-07, "loss": 0.0348, "reward": 1.4946866035461426, "reward_std": 0.1953589916229248, "rewards/accuracy_reward_stage2": 0.4946865439414978, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2089 }, { "completion_length": 9.328125, "epoch": 0.36621692658139127, "grad_norm": 13.987616141282922, "kl": 0.0303955078125, "learning_rate": 6.339582968284563e-07, "loss": -0.0212, "reward": 1.7124465703964233, "reward_std": 0.15865039825439453, "rewards/accuracy_reward_stage2": 0.7280715703964233, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2090 }, { "completion_length": 8.640625, "epoch": 0.3663921499912388, "grad_norm": 20.614469832693583, "kl": 0.267578125, "learning_rate": 6.337830734186087e-07, "loss": 0.1075, "reward": 1.4114258289337158, "reward_std": 0.24925842881202698, "rewards/accuracy_reward_stage2": 0.5364257097244263, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2091 }, { "completion_length": 7.015625, "epoch": 0.36656737340108636, "grad_norm": 9.815510426958314, "kl": 0.033203125, "learning_rate": 6.336078500087611e-07, "loss": 0.0132, "reward": 1.65625, "reward_std": 0.10888782143592834, "rewards/accuracy_reward_stage2": 0.65625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2092 }, { "completion_length": 7.671875, "epoch": 0.36674259681093396, "grad_norm": 27.466884223323603, "kl": 0.12353515625, "learning_rate": 6.334326265989136e-07, "loss": 0.0064, "reward": 1.5330727100372314, "reward_std": 0.2515067160129547, "rewards/accuracy_reward_stage2": 0.5486976504325867, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2093 }, { "completion_length": 12.328125, "epoch": 0.3669178202207815, "grad_norm": 22.04138803322858, "kl": 0.1376953125, "learning_rate": 6.332574031890661e-07, "loss": 0.011, "reward": 1.7145648002624512, "reward_std": 0.18631255626678467, "rewards/accuracy_reward_stage2": 0.7301896214485168, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2094 }, { "completion_length": 23.1875, "epoch": 0.36709304363062906, "grad_norm": 165.0439351929352, "kl": 1.03125, "learning_rate": 6.330821797792185e-07, "loss": 0.4126, "reward": 1.3562812805175781, "reward_std": 0.16295039653778076, "rewards/accuracy_reward_stage2": 0.4812812805175781, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2095 }, { "completion_length": 10.28125, "epoch": 0.3672682670404766, "grad_norm": 22.879246711638288, "kl": 0.130859375, "learning_rate": 6.32906956369371e-07, "loss": -0.0129, "reward": 1.5343456268310547, "reward_std": 0.19512513279914856, "rewards/accuracy_reward_stage2": 0.6905956268310547, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2096 }, { "completion_length": 8.671875, "epoch": 0.36744349045032415, "grad_norm": 21.393376018256088, "kl": 0.0712890625, "learning_rate": 6.327317329595233e-07, "loss": -0.0156, "reward": 1.657727837562561, "reward_std": 0.1709435135126114, "rewards/accuracy_reward_stage2": 0.6733528971672058, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2097 }, { "completion_length": 7.46875, "epoch": 0.3676187138601717, "grad_norm": 7.7193000878386275, "kl": 0.01904296875, "learning_rate": 6.325565095496758e-07, "loss": 0.0076, "reward": 1.6041667461395264, "reward_std": 0.044543541967868805, "rewards/accuracy_reward_stage2": 0.6041666865348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2098 }, { "completion_length": 11.015625, "epoch": 0.3677939372700193, "grad_norm": 17.218948024547018, "kl": 0.1875, "learning_rate": 6.323812861398283e-07, "loss": 0.0753, "reward": 1.522045612335205, "reward_std": 0.1089363694190979, "rewards/accuracy_reward_stage2": 0.6470456719398499, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2099 }, { "completion_length": 12.765625, "epoch": 0.36796916067986685, "grad_norm": 20.258775312947893, "kl": 0.01092529296875, "learning_rate": 6.322060627299806e-07, "loss": 0.0044, "reward": 1.6412173509597778, "reward_std": 0.2584494352340698, "rewards/accuracy_reward_stage2": 0.6412172913551331, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2100 }, { "completion_length": 15.90625, "epoch": 0.3681443840897144, "grad_norm": 20.08498932619288, "kl": 0.30859375, "learning_rate": 6.320308393201331e-07, "loss": 0.1236, "reward": 1.3641042709350586, "reward_std": 0.2118104249238968, "rewards/accuracy_reward_stage2": 0.4891042113304138, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2101 }, { "completion_length": 5.09375, "epoch": 0.36831960749956194, "grad_norm": 13.095916114622412, "kl": 0.11767578125, "learning_rate": 6.318556159102855e-07, "loss": -0.038, "reward": 1.7996301651000977, "reward_std": 0.12616249918937683, "rewards/accuracy_reward_stage2": 0.8308802843093872, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2102 }, { "completion_length": 23.140625, "epoch": 0.3684948309094095, "grad_norm": 17.25267888432899, "kl": 0.0380859375, "learning_rate": 6.31680392500438e-07, "loss": 0.0152, "reward": 1.6559020280838013, "reward_std": 0.13044781982898712, "rewards/accuracy_reward_stage2": 0.6559020280838013, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2103 }, { "completion_length": 15.453125, "epoch": 0.36867005431925703, "grad_norm": 27.42187132545378, "kl": 0.38671875, "learning_rate": 6.315051690905905e-07, "loss": 0.1546, "reward": 1.5863591432571411, "reward_std": 0.21733585000038147, "rewards/accuracy_reward_stage2": 0.7113592028617859, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2104 }, { "completion_length": 11.09375, "epoch": 0.3688452777291046, "grad_norm": 17.045225521328376, "kl": 0.072265625, "learning_rate": 6.313299456807429e-07, "loss": 0.0291, "reward": 1.1704258918762207, "reward_std": 0.17644906044006348, "rewards/accuracy_reward_stage2": 0.2954259514808655, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2105 }, { "completion_length": 9.265625, "epoch": 0.3690205011389522, "grad_norm": 20.790848387475037, "kl": 0.0732421875, "learning_rate": 6.311547222708954e-07, "loss": 0.0292, "reward": 1.6465411186218262, "reward_std": 0.18840546905994415, "rewards/accuracy_reward_stage2": 0.6465411186218262, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2106 }, { "completion_length": 9.796875, "epoch": 0.36919572454879973, "grad_norm": 15.59715427953365, "kl": 0.109375, "learning_rate": 6.309794988610479e-07, "loss": -0.0006, "reward": 1.696101427078247, "reward_std": 0.21237659454345703, "rewards/accuracy_reward_stage2": 0.7117264270782471, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2107 }, { "completion_length": 11.359375, "epoch": 0.3693709479586473, "grad_norm": 36.65876864188198, "kl": 0.0556640625, "learning_rate": 6.308042754512003e-07, "loss": -0.0219, "reward": 1.7240824699401855, "reward_std": 0.2328636646270752, "rewards/accuracy_reward_stage2": 0.7397074699401855, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2108 }, { "completion_length": 16.671875, "epoch": 0.3695461713684948, "grad_norm": 26.18128665190158, "kl": 0.384765625, "learning_rate": 6.306290520413528e-07, "loss": 0.1252, "reward": 1.4608935117721558, "reward_std": 0.20166131854057312, "rewards/accuracy_reward_stage2": 0.601518452167511, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2109 }, { "completion_length": 9.609375, "epoch": 0.36972139477834237, "grad_norm": 18.71367999781447, "kl": 0.1376953125, "learning_rate": 6.30453828631505e-07, "loss": 0.0239, "reward": 1.3404827117919922, "reward_std": 0.22724974155426025, "rewards/accuracy_reward_stage2": 0.4811077415943146, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2110 }, { "completion_length": 8.96875, "epoch": 0.3698966181881899, "grad_norm": 25.878878293086668, "kl": 0.27734375, "learning_rate": 6.302786052216575e-07, "loss": 0.0349, "reward": 1.4678795337677002, "reward_std": 0.37333595752716064, "rewards/accuracy_reward_stage2": 0.6241295337677002, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2111 }, { "completion_length": 11.765625, "epoch": 0.3700718415980375, "grad_norm": 13.73746502004517, "kl": 0.07275390625, "learning_rate": 6.3010338181181e-07, "loss": -0.0573, "reward": 1.8010525703430176, "reward_std": 0.1689380407333374, "rewards/accuracy_reward_stage2": 0.8323025703430176, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2112 }, { "completion_length": 10.234375, "epoch": 0.37024706500788507, "grad_norm": 17.41833234174483, "kl": 0.2236328125, "learning_rate": 6.299281584019624e-07, "loss": 0.1181, "reward": 1.477597713470459, "reward_std": 0.1575087606906891, "rewards/accuracy_reward_stage2": 0.602597713470459, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2113 }, { "completion_length": 24.203125, "epoch": 0.3704222884177326, "grad_norm": 17.130019289837158, "kl": 0.064453125, "learning_rate": 6.297529349921149e-07, "loss": -0.0723, "reward": 1.571899652481079, "reward_std": 0.21543410420417786, "rewards/accuracy_reward_stage2": 0.6187746524810791, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2114 }, { "completion_length": 10.484375, "epoch": 0.37059751182758016, "grad_norm": 16.148528845551034, "kl": 0.109375, "learning_rate": 6.295777115822674e-07, "loss": -0.0333, "reward": 1.4840588569641113, "reward_std": 0.225162535905838, "rewards/accuracy_reward_stage2": 0.5153088569641113, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2115 }, { "completion_length": 8.390625, "epoch": 0.3707727352374277, "grad_norm": 19.73466678979311, "kl": 0.0791015625, "learning_rate": 6.294024881724198e-07, "loss": -0.0125, "reward": 1.5833333730697632, "reward_std": 0.17150771617889404, "rewards/accuracy_reward_stage2": 0.5989583134651184, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2116 }, { "completion_length": 15.515625, "epoch": 0.37094795864727526, "grad_norm": 23.316172543129646, "kl": 0.1845703125, "learning_rate": 6.292272647625723e-07, "loss": 0.0518, "reward": 1.400850772857666, "reward_std": 0.25529760122299194, "rewards/accuracy_reward_stage2": 0.541475772857666, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2117 }, { "completion_length": 9.734375, "epoch": 0.37112318205712286, "grad_norm": 17.983517471441573, "kl": 0.171875, "learning_rate": 6.290520413527247e-07, "loss": 0.0688, "reward": 1.432976245880127, "reward_std": 0.27066361904144287, "rewards/accuracy_reward_stage2": 0.557976245880127, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2118 }, { "completion_length": 8.640625, "epoch": 0.3712984054669704, "grad_norm": 20.06146237815626, "kl": 0.10791015625, "learning_rate": 6.288768179428772e-07, "loss": 0.0432, "reward": 1.6736295223236084, "reward_std": 0.20630821585655212, "rewards/accuracy_reward_stage2": 0.6736295223236084, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2119 }, { "completion_length": 15.515625, "epoch": 0.37147362887681795, "grad_norm": 33.925749094582095, "kl": 0.169921875, "learning_rate": 6.287015945330297e-07, "loss": 0.0677, "reward": 1.5644185543060303, "reward_std": 0.2227061688899994, "rewards/accuracy_reward_stage2": 0.6894185543060303, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2120 }, { "completion_length": 13.875, "epoch": 0.3716488522866655, "grad_norm": 21.495517940812118, "kl": 0.07470703125, "learning_rate": 6.28526371123182e-07, "loss": 0.03, "reward": 1.561547875404358, "reward_std": 0.2311323881149292, "rewards/accuracy_reward_stage2": 0.6865478754043579, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2121 }, { "completion_length": 13.3125, "epoch": 0.37182407569651305, "grad_norm": 28.243106658760585, "kl": 0.07568359375, "learning_rate": 6.283511477133345e-07, "loss": 0.0304, "reward": 1.183201551437378, "reward_std": 0.24948707222938538, "rewards/accuracy_reward_stage2": 0.30820155143737793, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2122 }, { "completion_length": 14.71875, "epoch": 0.3719992991063606, "grad_norm": 18.79970480525015, "kl": 0.0869140625, "learning_rate": 6.281759243034869e-07, "loss": 0.0347, "reward": 1.5993150472640991, "reward_std": 0.14652368426322937, "rewards/accuracy_reward_stage2": 0.5993151068687439, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2123 }, { "completion_length": 7.484375, "epoch": 0.37217452251620814, "grad_norm": 22.273172047889105, "kl": 0.1162109375, "learning_rate": 6.280007008936393e-07, "loss": 0.0464, "reward": 1.5664750337600708, "reward_std": 0.3070983588695526, "rewards/accuracy_reward_stage2": 0.5664750337600708, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2124 }, { "completion_length": 8.171875, "epoch": 0.37234974592605574, "grad_norm": 18.925406299644248, "kl": 0.0341796875, "learning_rate": 6.278254774837918e-07, "loss": 0.0137, "reward": 1.5416319370269775, "reward_std": 0.18046918511390686, "rewards/accuracy_reward_stage2": 0.5416319966316223, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2125 }, { "completion_length": 13.1875, "epoch": 0.3725249693359033, "grad_norm": 21.017869463214364, "kl": 0.107421875, "learning_rate": 6.276502540739442e-07, "loss": -0.0684, "reward": 1.4259710311889648, "reward_std": 0.351542592048645, "rewards/accuracy_reward_stage2": 0.4728460907936096, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2126 }, { "completion_length": 12.234375, "epoch": 0.37270019274575084, "grad_norm": 24.82390537095879, "kl": 0.040771484375, "learning_rate": 6.274750306640967e-07, "loss": 0.0163, "reward": 1.64809250831604, "reward_std": 0.20966197550296783, "rewards/accuracy_reward_stage2": 0.64809250831604, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2127 }, { "completion_length": 9.875, "epoch": 0.3728754161555984, "grad_norm": 23.547973224097863, "kl": 0.09619140625, "learning_rate": 6.272998072542492e-07, "loss": 0.0386, "reward": 1.5378363132476807, "reward_std": 0.20070700347423553, "rewards/accuracy_reward_stage2": 0.5378363132476807, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2128 }, { "completion_length": 9.609375, "epoch": 0.37305063956544593, "grad_norm": 27.303363055608237, "kl": 0.2197265625, "learning_rate": 6.271245838444016e-07, "loss": -0.034, "reward": 1.6875, "reward_std": 0.2845909595489502, "rewards/accuracy_reward_stage2": 0.734375, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2129 }, { "completion_length": 12.828125, "epoch": 0.3732258629752935, "grad_norm": 22.24181250018731, "kl": 0.171875, "learning_rate": 6.26949360434554e-07, "loss": 0.0259, "reward": 1.425432562828064, "reward_std": 0.3704448342323303, "rewards/accuracy_reward_stage2": 0.4410575032234192, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2130 }, { "completion_length": 12.03125, "epoch": 0.3734010863851411, "grad_norm": 45.756356155634414, "kl": 0.396484375, "learning_rate": 6.267741370247065e-07, "loss": 0.1586, "reward": 1.5892225503921509, "reward_std": 0.21309590339660645, "rewards/accuracy_reward_stage2": 0.7142226099967957, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2131 }, { "completion_length": 14.859375, "epoch": 0.3735763097949886, "grad_norm": 19.61871851111529, "kl": 0.1943359375, "learning_rate": 6.265989136148589e-07, "loss": -0.0106, "reward": 1.5703403949737549, "reward_std": 0.2058640867471695, "rewards/accuracy_reward_stage2": 0.7265903353691101, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2132 }, { "completion_length": 9.640625, "epoch": 0.3737515332048362, "grad_norm": 22.685600796758123, "kl": 0.126953125, "learning_rate": 6.264236902050114e-07, "loss": -0.0376, "reward": 1.619655728340149, "reward_std": 0.31002217531204224, "rewards/accuracy_reward_stage2": 0.6509058475494385, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2133 }, { "completion_length": 8.859375, "epoch": 0.3739267566146837, "grad_norm": 22.704349832902558, "kl": 0.255859375, "learning_rate": 6.262484667951638e-07, "loss": 0.0141, "reward": 1.5410445928573608, "reward_std": 0.40798452496528625, "rewards/accuracy_reward_stage2": 0.5722945928573608, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2134 }, { "completion_length": 12.015625, "epoch": 0.37410198002453127, "grad_norm": 17.017736183879197, "kl": 0.11328125, "learning_rate": 6.260732433853162e-07, "loss": 0.0453, "reward": 1.5101354122161865, "reward_std": 0.1195073202252388, "rewards/accuracy_reward_stage2": 0.5101353526115417, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2135 }, { "completion_length": 8.078125, "epoch": 0.3742772034343788, "grad_norm": 18.307863070101405, "kl": 0.0634765625, "learning_rate": 6.258980199754687e-07, "loss": 0.0253, "reward": 1.782165288925171, "reward_std": 0.15776385366916656, "rewards/accuracy_reward_stage2": 0.7821652889251709, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2136 }, { "completion_length": 10.375, "epoch": 0.3744524268442264, "grad_norm": 19.604165483296928, "kl": 0.0869140625, "learning_rate": 6.257227965656211e-07, "loss": -0.0095, "reward": 1.1285523176193237, "reward_std": 0.14808019995689392, "rewards/accuracy_reward_stage2": 0.14417734742164612, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2137 }, { "completion_length": 9.234375, "epoch": 0.37462765025407396, "grad_norm": 15.117178098316627, "kl": 0.10546875, "learning_rate": 6.255475731557736e-07, "loss": -0.002, "reward": 1.509828805923462, "reward_std": 0.15732884407043457, "rewards/accuracy_reward_stage2": 0.5254538059234619, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2138 }, { "completion_length": 12.59375, "epoch": 0.3748028736639215, "grad_norm": 16.940128197482103, "kl": 0.0673828125, "learning_rate": 6.253723497459261e-07, "loss": -0.0019, "reward": 1.7207591533660889, "reward_std": 0.17644330859184265, "rewards/accuracy_reward_stage2": 0.7363842129707336, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2139 }, { "completion_length": 11.890625, "epoch": 0.37497809707376906, "grad_norm": 26.874987309380455, "kl": 0.09619140625, "learning_rate": 6.251971263360784e-07, "loss": 0.0385, "reward": 1.5684523582458496, "reward_std": 0.23804005980491638, "rewards/accuracy_reward_stage2": 0.5684524178504944, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2140 }, { "completion_length": 27.5, "epoch": 0.3751533204836166, "grad_norm": 26.3035483971051, "kl": 0.287109375, "learning_rate": 6.250219029262309e-07, "loss": 0.0831, "reward": 1.4032840728759766, "reward_std": 0.2784123420715332, "rewards/accuracy_reward_stage2": 0.5439091324806213, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2141 }, { "completion_length": 13.390625, "epoch": 0.37532854389346415, "grad_norm": 22.92592263951425, "kl": 0.11962890625, "learning_rate": 6.248466795163833e-07, "loss": 0.0478, "reward": 1.3103749752044678, "reward_std": 0.23864029347896576, "rewards/accuracy_reward_stage2": 0.4353749752044678, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2142 }, { "completion_length": 11.765625, "epoch": 0.3755037673033117, "grad_norm": 21.834870516210604, "kl": 0.09375, "learning_rate": 6.246714561065358e-07, "loss": -0.0713, "reward": 1.3206520080566406, "reward_std": 0.339372843503952, "rewards/accuracy_reward_stage2": 0.3675270974636078, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2143 }, { "completion_length": 12.078125, "epoch": 0.3756789907131593, "grad_norm": 24.03979986543318, "kl": 0.25, "learning_rate": 6.244962326966883e-07, "loss": 0.0715, "reward": 1.7012312412261963, "reward_std": 0.2878972291946411, "rewards/accuracy_reward_stage2": 0.8418562412261963, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2144 }, { "completion_length": 12.46875, "epoch": 0.37585421412300685, "grad_norm": 16.544610076218923, "kl": 0.06689453125, "learning_rate": 6.243210092868407e-07, "loss": -0.0043, "reward": 1.2745712995529175, "reward_std": 0.1314728707075119, "rewards/accuracy_reward_stage2": 0.4151962697505951, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2145 }, { "completion_length": 22.15625, "epoch": 0.3760294375328544, "grad_norm": 22.459144576557698, "kl": 0.107421875, "learning_rate": 6.241457858769932e-07, "loss": 0.0214, "reward": 1.423964023590088, "reward_std": 0.27767157554626465, "rewards/accuracy_reward_stage2": 0.5645890235900879, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2146 }, { "completion_length": 8.765625, "epoch": 0.37620466094270194, "grad_norm": 13.373006338465403, "kl": 0.05029296875, "learning_rate": 6.239705624671457e-07, "loss": 0.0202, "reward": 1.7165720462799072, "reward_std": 0.10273611545562744, "rewards/accuracy_reward_stage2": 0.841572105884552, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2147 }, { "completion_length": 15.84375, "epoch": 0.3763798843525495, "grad_norm": 11.776072260362834, "kl": 0.0184326171875, "learning_rate": 6.23795339057298e-07, "loss": 0.0074, "reward": 1.5905694961547852, "reward_std": 0.08757132291793823, "rewards/accuracy_reward_stage2": 0.5905694961547852, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2148 }, { "completion_length": 13.359375, "epoch": 0.37655510776239703, "grad_norm": 20.45580036732035, "kl": 0.059326171875, "learning_rate": 6.236201156474505e-07, "loss": 0.0237, "reward": 1.6051902770996094, "reward_std": 0.14381514489650726, "rewards/accuracy_reward_stage2": 0.6051902174949646, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2149 }, { "completion_length": 14.34375, "epoch": 0.37673033117224464, "grad_norm": 20.69266694732591, "kl": 0.048583984375, "learning_rate": 6.234448922376028e-07, "loss": 0.0195, "reward": 1.640191674232483, "reward_std": 0.23035646975040436, "rewards/accuracy_reward_stage2": 0.6401916742324829, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2150 }, { "completion_length": 12.609375, "epoch": 0.3769055545820922, "grad_norm": 20.93672786852049, "kl": 0.083984375, "learning_rate": 6.232696688277553e-07, "loss": -0.0108, "reward": 1.727813720703125, "reward_std": 0.2194058895111084, "rewards/accuracy_reward_stage2": 0.7434388399124146, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2151 }, { "completion_length": 6.578125, "epoch": 0.37708077799193973, "grad_norm": 23.85461277636439, "kl": 0.2578125, "learning_rate": 6.230944454179078e-07, "loss": 0.0705, "reward": 1.5679218769073486, "reward_std": 0.31683841347694397, "rewards/accuracy_reward_stage2": 0.7085468769073486, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2152 }, { "completion_length": 13.140625, "epoch": 0.3772560014017873, "grad_norm": 16.468654634909875, "kl": 0.046875, "learning_rate": 6.229192220080602e-07, "loss": -0.0201, "reward": 1.5747102499008179, "reward_std": 0.08664512634277344, "rewards/accuracy_reward_stage2": 0.5903353095054626, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2153 }, { "completion_length": 6.765625, "epoch": 0.3774312248116348, "grad_norm": 23.244001860146184, "kl": 0.119140625, "learning_rate": 6.227439985982127e-07, "loss": 0.0477, "reward": 1.6278541088104248, "reward_std": 0.2148430198431015, "rewards/accuracy_reward_stage2": 0.6278541088104248, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2154 }, { "completion_length": 11.5, "epoch": 0.37760644822148237, "grad_norm": 21.616613381013284, "kl": 0.09619140625, "learning_rate": 6.225687751883652e-07, "loss": -0.0058, "reward": 1.8042510747909546, "reward_std": 0.19430895149707794, "rewards/accuracy_reward_stage2": 0.8198760747909546, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2155 }, { "completion_length": 10.953125, "epoch": 0.3777816716313299, "grad_norm": 22.314447656053556, "kl": 0.09521484375, "learning_rate": 6.223935517785176e-07, "loss": -0.048, "reward": 1.4477014541625977, "reward_std": 0.2694235146045685, "rewards/accuracy_reward_stage2": 0.47895151376724243, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2156 }, { "completion_length": 9.953125, "epoch": 0.3779568950411775, "grad_norm": 5.851002174857388, "kl": 0.03125, "learning_rate": 6.222183283686701e-07, "loss": 0.0125, "reward": 1.828125, "reward_std": 0.0646936446428299, "rewards/accuracy_reward_stage2": 0.828125, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2157 }, { "completion_length": 10.515625, "epoch": 0.37813211845102507, "grad_norm": 17.6330337529898, "kl": 0.058837890625, "learning_rate": 6.220431049588225e-07, "loss": -0.0207, "reward": 1.7743115425109863, "reward_std": 0.2329869270324707, "rewards/accuracy_reward_stage2": 0.7899366021156311, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2158 }, { "completion_length": 11.390625, "epoch": 0.3783073418608726, "grad_norm": 23.5113310408841, "kl": 0.0859375, "learning_rate": 6.21867881548975e-07, "loss": 0.025, "reward": 1.512028455734253, "reward_std": 0.19187329709529877, "rewards/accuracy_reward_stage2": 0.5276533961296082, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2159 }, { "completion_length": 12.625, "epoch": 0.37848256527072016, "grad_norm": 26.345417785246763, "kl": 0.27734375, "learning_rate": 6.216926581391274e-07, "loss": 0.0693, "reward": 1.250259280204773, "reward_std": 0.2544091045856476, "rewards/accuracy_reward_stage2": 0.515884280204773, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2160 }, { "completion_length": 8.203125, "epoch": 0.3786577886805677, "grad_norm": 21.203083672580384, "kl": 0.07861328125, "learning_rate": 6.215174347292797e-07, "loss": 0.0315, "reward": 1.674128532409668, "reward_std": 0.2373107671737671, "rewards/accuracy_reward_stage2": 0.6741284728050232, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2161 }, { "completion_length": 10.8125, "epoch": 0.37883301209041526, "grad_norm": 32.004422207854546, "kl": 0.220703125, "learning_rate": 6.213422113194322e-07, "loss": 0.0879, "reward": 1.3069201707839966, "reward_std": 0.24062323570251465, "rewards/accuracy_reward_stage2": 0.43192020058631897, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2162 }, { "completion_length": 15.796875, "epoch": 0.37900823550026286, "grad_norm": 22.743167065050343, "kl": 0.2734375, "learning_rate": 6.211669879095846e-07, "loss": 0.0652, "reward": 1.62326979637146, "reward_std": 0.2634640336036682, "rewards/accuracy_reward_stage2": 0.76389479637146, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2163 }, { "completion_length": 7.09375, "epoch": 0.3791834589101104, "grad_norm": 15.754559332097065, "kl": 0.099609375, "learning_rate": 6.209917644997371e-07, "loss": 0.0399, "reward": 1.9393177032470703, "reward_std": 0.12810847163200378, "rewards/accuracy_reward_stage2": 0.9393176436424255, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2164 }, { "completion_length": 6.15625, "epoch": 0.37935868231995795, "grad_norm": 22.90820730699427, "kl": 0.2041015625, "learning_rate": 6.208165410898896e-07, "loss": -0.0068, "reward": 1.776153564453125, "reward_std": 0.2677950859069824, "rewards/accuracy_reward_stage2": 0.8074035048484802, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2165 }, { "completion_length": 8.71875, "epoch": 0.3795339057298055, "grad_norm": 22.416892182157884, "kl": 0.0791015625, "learning_rate": 6.20641317680042e-07, "loss": 0.0093, "reward": 1.6350996494293213, "reward_std": 0.20906037092208862, "rewards/accuracy_reward_stage2": 0.6507246494293213, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2166 }, { "completion_length": 18.546875, "epoch": 0.37970912913965305, "grad_norm": 20.848643017035872, "kl": 0.044677734375, "learning_rate": 6.204660942701945e-07, "loss": 0.0179, "reward": 1.5497921705245972, "reward_std": 0.191371887922287, "rewards/accuracy_reward_stage2": 0.5497921705245972, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2167 }, { "completion_length": 9.15625, "epoch": 0.3798843525495006, "grad_norm": 16.90072768619376, "kl": 0.19921875, "learning_rate": 6.20290870860347e-07, "loss": 0.0353, "reward": 1.5379623174667358, "reward_std": 0.1680925339460373, "rewards/accuracy_reward_stage2": 0.5535872578620911, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2168 }, { "completion_length": 8.15625, "epoch": 0.3800595759593482, "grad_norm": 16.751612988843327, "kl": 0.1708984375, "learning_rate": 6.201156474504994e-07, "loss": 0.0684, "reward": 1.4254521131515503, "reward_std": 0.16224229335784912, "rewards/accuracy_reward_stage2": 0.4254521429538727, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2169 }, { "completion_length": 12.09375, "epoch": 0.38023479936919574, "grad_norm": 17.533718907309815, "kl": 0.123046875, "learning_rate": 6.199404240406518e-07, "loss": 0.0162, "reward": 1.5870461463928223, "reward_std": 0.25187405943870544, "rewards/accuracy_reward_stage2": 0.602671205997467, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2170 }, { "completion_length": 9.375, "epoch": 0.3804100227790433, "grad_norm": 17.396942548872524, "kl": 0.1787109375, "learning_rate": 6.197652006308043e-07, "loss": 0.0271, "reward": 1.4302774667739868, "reward_std": 0.2149101197719574, "rewards/accuracy_reward_stage2": 0.5709024667739868, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2171 }, { "completion_length": 7.875, "epoch": 0.38058524618889084, "grad_norm": 15.929708549328153, "kl": 0.0703125, "learning_rate": 6.195899772209567e-07, "loss": 0.0281, "reward": 1.8126511573791504, "reward_std": 0.17158563435077667, "rewards/accuracy_reward_stage2": 0.8126511573791504, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2172 }, { "completion_length": 21.15625, "epoch": 0.3807604695987384, "grad_norm": 19.271770329974064, "kl": 0.154296875, "learning_rate": 6.194147538111091e-07, "loss": 0.0175, "reward": 1.1675336360931396, "reward_std": 0.16348037123680115, "rewards/accuracy_reward_stage2": 0.18315869569778442, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2173 }, { "completion_length": 9.671875, "epoch": 0.38093569300858593, "grad_norm": 16.931454968051227, "kl": 0.1201171875, "learning_rate": 6.192395304012615e-07, "loss": 0.0038, "reward": 1.4680428504943848, "reward_std": 0.24045197665691376, "rewards/accuracy_reward_stage2": 0.48366779088974, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2174 }, { "completion_length": 15.328125, "epoch": 0.3811109164184335, "grad_norm": 18.821037310256308, "kl": 0.06689453125, "learning_rate": 6.19064306991414e-07, "loss": 0.0268, "reward": 1.465679407119751, "reward_std": 0.17180000245571136, "rewards/accuracy_reward_stage2": 0.46567946672439575, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2175 }, { "completion_length": 10.4375, "epoch": 0.3812861398282811, "grad_norm": 18.791259747994744, "kl": 0.140625, "learning_rate": 6.188890835815665e-07, "loss": 0.056, "reward": 1.6078298091888428, "reward_std": 0.10767532885074615, "rewards/accuracy_reward_stage2": 0.6078298687934875, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2176 }, { "completion_length": 7.90625, "epoch": 0.3814613632381286, "grad_norm": 26.132132368079272, "kl": 0.0247802734375, "learning_rate": 6.187138601717189e-07, "loss": 0.0099, "reward": 1.703125, "reward_std": 0.35612428188323975, "rewards/accuracy_reward_stage2": 0.703125, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2177 }, { "completion_length": 5.046875, "epoch": 0.3816365866479762, "grad_norm": 11.699367635387564, "kl": 0.083984375, "learning_rate": 6.185386367618714e-07, "loss": -0.0105, "reward": 1.6875, "reward_std": 0.1462521106004715, "rewards/accuracy_reward_stage2": 0.828125, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2178 }, { "completion_length": 14.09375, "epoch": 0.3818118100578237, "grad_norm": 24.398667267083667, "kl": 0.310546875, "learning_rate": 6.183634133520237e-07, "loss": 0.095, "reward": 1.657011866569519, "reward_std": 0.2222549319267273, "rewards/accuracy_reward_stage2": 0.7976367473602295, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2179 }, { "completion_length": 11.03125, "epoch": 0.38198703346767127, "grad_norm": 28.560326997317556, "kl": 0.236328125, "learning_rate": 6.181881899421762e-07, "loss": 0.0944, "reward": 1.7426857948303223, "reward_std": 0.17225751280784607, "rewards/accuracy_reward_stage2": 0.8676857948303223, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2180 }, { "completion_length": 10.75, "epoch": 0.3821622568775188, "grad_norm": 11.066768967836019, "kl": 0.044921875, "learning_rate": 6.180129665323287e-07, "loss": 0.018, "reward": 1.738937258720398, "reward_std": 0.083560511469841, "rewards/accuracy_reward_stage2": 0.738937258720398, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2181 }, { "completion_length": 9.078125, "epoch": 0.3823374802873664, "grad_norm": 20.679233919704437, "kl": 0.1328125, "learning_rate": 6.178377431224811e-07, "loss": -0.0125, "reward": 1.6758754253387451, "reward_std": 0.24189935624599457, "rewards/accuracy_reward_stage2": 0.7071253657341003, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2182 }, { "completion_length": 10.40625, "epoch": 0.38251270369721396, "grad_norm": 17.54733809618765, "kl": 0.0849609375, "learning_rate": 6.176625197126336e-07, "loss": -0.023, "reward": 1.4875990152359009, "reward_std": 0.17271263897418976, "rewards/accuracy_reward_stage2": 0.5188490748405457, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2183 }, { "completion_length": 9.390625, "epoch": 0.3826879271070615, "grad_norm": 24.9013565917632, "kl": 0.1337890625, "learning_rate": 6.174872963027861e-07, "loss": 0.0538, "reward": 1.5268363952636719, "reward_std": 0.11797383427619934, "rewards/accuracy_reward_stage2": 0.5268364548683167, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2184 }, { "completion_length": 9.734375, "epoch": 0.38286315051690906, "grad_norm": 19.620060637076595, "kl": 0.09326171875, "learning_rate": 6.173120728929385e-07, "loss": 0.0374, "reward": 1.8419290781021118, "reward_std": 0.20093566179275513, "rewards/accuracy_reward_stage2": 0.8419290781021118, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2185 }, { "completion_length": 7.765625, "epoch": 0.3830383739267566, "grad_norm": 17.378286477591992, "kl": 0.1240234375, "learning_rate": 6.171368494830909e-07, "loss": 0.0497, "reward": 1.5082812309265137, "reward_std": 0.13430514931678772, "rewards/accuracy_reward_stage2": 0.6332812309265137, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2186 }, { "completion_length": 10.078125, "epoch": 0.38321359733660415, "grad_norm": 23.07091267330619, "kl": 0.04052734375, "learning_rate": 6.169616260732433e-07, "loss": -0.0279, "reward": 1.455744743347168, "reward_std": 0.17548725008964539, "rewards/accuracy_reward_stage2": 0.596369743347168, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2187 }, { "completion_length": 15.171875, "epoch": 0.38338882074645175, "grad_norm": 15.279125531805924, "kl": 0.05419921875, "learning_rate": 6.167864026633958e-07, "loss": 0.0216, "reward": 1.5746581554412842, "reward_std": 0.08711699396371841, "rewards/accuracy_reward_stage2": 0.6996581554412842, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2188 }, { "completion_length": 15.109375, "epoch": 0.3835640441562993, "grad_norm": 21.233700522460875, "kl": 0.203125, "learning_rate": 6.166111792535483e-07, "loss": 0.0809, "reward": 1.4553985595703125, "reward_std": 0.14188659191131592, "rewards/accuracy_reward_stage2": 0.5803984999656677, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2189 }, { "completion_length": 11.78125, "epoch": 0.38373926756614685, "grad_norm": 13.064192613447082, "kl": 0.052490234375, "learning_rate": 6.164359558437006e-07, "loss": 0.021, "reward": 1.6517360210418701, "reward_std": 0.090608611702919, "rewards/accuracy_reward_stage2": 0.6517360210418701, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2190 }, { "completion_length": 8.171875, "epoch": 0.3839144909759944, "grad_norm": 29.885606654441183, "kl": 0.1923828125, "learning_rate": 6.162607324338531e-07, "loss": 0.0094, "reward": 1.6037235260009766, "reward_std": 0.35476142168045044, "rewards/accuracy_reward_stage2": 0.6349735260009766, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2191 }, { "completion_length": 10.6875, "epoch": 0.38408971438584194, "grad_norm": 28.59603690547388, "kl": 0.1533203125, "learning_rate": 6.160855090240056e-07, "loss": 0.0056, "reward": 1.557417631149292, "reward_std": 0.3235635757446289, "rewards/accuracy_reward_stage2": 0.588667631149292, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2192 }, { "completion_length": 9.8125, "epoch": 0.3842649377956895, "grad_norm": 23.595167727606896, "kl": 0.06103515625, "learning_rate": 6.15910285614158e-07, "loss": 0.0245, "reward": 1.6596101522445679, "reward_std": 0.17235329747200012, "rewards/accuracy_reward_stage2": 0.6596100926399231, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2193 }, { "completion_length": 9.296875, "epoch": 0.38444016120553703, "grad_norm": 25.677820508864002, "kl": 0.12109375, "learning_rate": 6.157350622043105e-07, "loss": 0.0485, "reward": 1.5163013935089111, "reward_std": 0.21374960243701935, "rewards/accuracy_reward_stage2": 0.5163014531135559, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2194 }, { "completion_length": 9.1875, "epoch": 0.38461538461538464, "grad_norm": 19.62805940423102, "kl": 0.051513671875, "learning_rate": 6.155598387944629e-07, "loss": -0.0235, "reward": 1.8182024955749512, "reward_std": 0.2139115333557129, "rewards/accuracy_reward_stage2": 0.8338274955749512, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2195 }, { "completion_length": 7.65625, "epoch": 0.3847906080252322, "grad_norm": 20.08895842095188, "kl": 0.07470703125, "learning_rate": 6.153846153846154e-07, "loss": -0.0064, "reward": 1.3504976034164429, "reward_std": 0.2288835346698761, "rewards/accuracy_reward_stage2": 0.36612263321876526, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2196 }, { "completion_length": 9.3125, "epoch": 0.38496583143507973, "grad_norm": 10.903634089390751, "kl": 0.0126953125, "learning_rate": 6.152093919747679e-07, "loss": 0.0051, "reward": 1.890625, "reward_std": 0.10205793380737305, "rewards/accuracy_reward_stage2": 0.890625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2197 }, { "completion_length": 6.59375, "epoch": 0.3851410548449273, "grad_norm": 23.287103677945193, "kl": 0.1171875, "learning_rate": 6.150341685649203e-07, "loss": -0.0092, "reward": 1.7211157083511353, "reward_std": 0.2485455870628357, "rewards/accuracy_reward_stage2": 0.7523657083511353, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2198 }, { "completion_length": 15.265625, "epoch": 0.3853162782547748, "grad_norm": 21.500939887887792, "kl": 0.09423828125, "learning_rate": 6.148589451550726e-07, "loss": 0.0377, "reward": 1.4081530570983887, "reward_std": 0.07311158627271652, "rewards/accuracy_reward_stage2": 0.5331530570983887, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2199 }, { "completion_length": 13.0625, "epoch": 0.38549150166462237, "grad_norm": 34.72416274037816, "kl": 0.357421875, "learning_rate": 6.146837217452251e-07, "loss": 0.0341, "reward": 1.206498146057129, "reward_std": 0.3895317316055298, "rewards/accuracy_reward_stage2": 0.37837323546409607, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2200 }, { "completion_length": 11.578125, "epoch": 0.38566672507447, "grad_norm": 22.262971245552823, "kl": 0.10546875, "learning_rate": 6.145084983353775e-07, "loss": -0.0006, "reward": 1.291999101638794, "reward_std": 0.2678248882293701, "rewards/accuracy_reward_stage2": 0.43262407183647156, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2201 }, { "completion_length": 7.421875, "epoch": 0.3858419484843175, "grad_norm": 17.792911564404413, "kl": 0.181640625, "learning_rate": 6.1433327492553e-07, "loss": 0.0726, "reward": 1.6500575542449951, "reward_std": 0.11898770928382874, "rewards/accuracy_reward_stage2": 0.6500574946403503, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2202 }, { "completion_length": 10.203125, "epoch": 0.38601717189416507, "grad_norm": 20.116358935868064, "kl": 0.1357421875, "learning_rate": 6.141580515156824e-07, "loss": -0.0391, "reward": 1.5253784656524658, "reward_std": 0.38462263345718384, "rewards/accuracy_reward_stage2": 0.5722534656524658, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2203 }, { "completion_length": 13.1875, "epoch": 0.3861923953040126, "grad_norm": 17.99177828565341, "kl": 0.045654296875, "learning_rate": 6.139828281058349e-07, "loss": 0.0183, "reward": 1.65625, "reward_std": 0.16675157845020294, "rewards/accuracy_reward_stage2": 0.78125, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2204 }, { "completion_length": 11.640625, "epoch": 0.38636761871386016, "grad_norm": 22.035441172696054, "kl": 0.05224609375, "learning_rate": 6.138076046959874e-07, "loss": 0.021, "reward": 1.440962314605713, "reward_std": 0.24247995018959045, "rewards/accuracy_reward_stage2": 0.4409623146057129, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2205 }, { "completion_length": 10.015625, "epoch": 0.3865428421237077, "grad_norm": 20.296250736825957, "kl": 0.10205078125, "learning_rate": 6.136323812861398e-07, "loss": -0.0034, "reward": 1.5719988346099854, "reward_std": 0.24222303926944733, "rewards/accuracy_reward_stage2": 0.5876238942146301, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2206 }, { "completion_length": 8.46875, "epoch": 0.38671806553355526, "grad_norm": 16.497923845983223, "kl": 0.0849609375, "learning_rate": 6.134571578762923e-07, "loss": 0.0341, "reward": 1.5625, "reward_std": 0.16675157845020294, "rewards/accuracy_reward_stage2": 0.5625, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2207 }, { "completion_length": 11.0625, "epoch": 0.38689328894340286, "grad_norm": 18.295192741683362, "kl": 0.1884765625, "learning_rate": 6.132819344664448e-07, "loss": -0.0002, "reward": 1.644460916519165, "reward_std": 0.19249743223190308, "rewards/accuracy_reward_stage2": 0.675710916519165, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2208 }, { "completion_length": 7.125, "epoch": 0.3870685123532504, "grad_norm": 19.963442006676203, "kl": 0.087890625, "learning_rate": 6.131067110565971e-07, "loss": 0.0353, "reward": 1.5324900150299072, "reward_std": 0.2595484256744385, "rewards/accuracy_reward_stage2": 0.532490074634552, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2209 }, { "completion_length": 13.90625, "epoch": 0.38724373576309795, "grad_norm": 26.61436541495198, "kl": 0.08056640625, "learning_rate": 6.129314876467496e-07, "loss": -0.0095, "reward": 1.7203000783920288, "reward_std": 0.1948830783367157, "rewards/accuracy_reward_stage2": 0.7359250783920288, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2210 }, { "completion_length": 8.078125, "epoch": 0.3874189591729455, "grad_norm": 15.538624785207695, "kl": 0.10009765625, "learning_rate": 6.12756264236902e-07, "loss": 0.0186, "reward": 1.723802089691162, "reward_std": 0.13896231353282928, "rewards/accuracy_reward_stage2": 0.7394270896911621, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2211 }, { "completion_length": 12.71875, "epoch": 0.38759418258279305, "grad_norm": 16.671196293015015, "kl": 0.034423828125, "learning_rate": 6.125810408270544e-07, "loss": 0.0138, "reward": 1.2645647525787354, "reward_std": 0.044438742101192474, "rewards/accuracy_reward_stage2": 0.2645648419857025, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2212 }, { "completion_length": 11.609375, "epoch": 0.3877694059926406, "grad_norm": 17.30957237048228, "kl": 0.05322265625, "learning_rate": 6.124058174172069e-07, "loss": -0.0106, "reward": 1.6624021530151367, "reward_std": 0.16287538409233093, "rewards/accuracy_reward_stage2": 0.8030271530151367, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2213 }, { "completion_length": 12.8125, "epoch": 0.3879446294024882, "grad_norm": 19.486549989967347, "kl": 0.11083984375, "learning_rate": 6.122305940073593e-07, "loss": 0.0443, "reward": 1.536430835723877, "reward_std": 0.25739842653274536, "rewards/accuracy_reward_stage2": 0.5364308953285217, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2214 }, { "completion_length": 9.40625, "epoch": 0.38811985281233574, "grad_norm": 23.428496736016186, "kl": 0.1171875, "learning_rate": 6.120553705975118e-07, "loss": 0.0027, "reward": 1.723336100578308, "reward_std": 0.21733993291854858, "rewards/accuracy_reward_stage2": 0.7389611005783081, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2215 }, { "completion_length": 10.046875, "epoch": 0.3882950762221833, "grad_norm": 23.7601496680345, "kl": 0.1337890625, "learning_rate": 6.118801471876643e-07, "loss": 0.0228, "reward": 1.320266604423523, "reward_std": 0.19674022495746613, "rewards/accuracy_reward_stage2": 0.5858915448188782, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2216 }, { "completion_length": 9.296875, "epoch": 0.38847029963203084, "grad_norm": 20.72527082536805, "kl": 0.1982421875, "learning_rate": 6.117049237778167e-07, "loss": 0.0265, "reward": 1.462338924407959, "reward_std": 0.2675231099128723, "rewards/accuracy_reward_stage2": 0.6185888051986694, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2217 }, { "completion_length": 11.3125, "epoch": 0.3886455230418784, "grad_norm": 17.420874740089047, "kl": 0.09619140625, "learning_rate": 6.115297003679692e-07, "loss": 0.0385, "reward": 1.4358340501785278, "reward_std": 0.13778458535671234, "rewards/accuracy_reward_stage2": 0.5608340501785278, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2218 }, { "completion_length": 16.125, "epoch": 0.38882074645172593, "grad_norm": 20.444965241840418, "kl": 0.2412109375, "learning_rate": 6.113544769581215e-07, "loss": 0.0619, "reward": 1.2542166709899902, "reward_std": 0.1983487606048584, "rewards/accuracy_reward_stage2": 0.39484167098999023, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2219 }, { "completion_length": 9.15625, "epoch": 0.38899596986157353, "grad_norm": 24.584740447360808, "kl": 0.2138671875, "learning_rate": 6.11179253548274e-07, "loss": 0.0066, "reward": 1.4592695236206055, "reward_std": 0.2801477909088135, "rewards/accuracy_reward_stage2": 0.49051961302757263, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2220 }, { "completion_length": 18.75, "epoch": 0.3891711932714211, "grad_norm": 21.180945773655164, "kl": 0.25, "learning_rate": 6.110040301384265e-07, "loss": 0.1003, "reward": 1.3165578842163086, "reward_std": 0.23822440207004547, "rewards/accuracy_reward_stage2": 0.5665579438209534, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2221 }, { "completion_length": 17.46875, "epoch": 0.3893464166812686, "grad_norm": 19.289222933177903, "kl": 0.06787109375, "learning_rate": 6.108288067285789e-07, "loss": 0.0272, "reward": 1.7069063186645508, "reward_std": 0.14726392924785614, "rewards/accuracy_reward_stage2": 0.7069063782691956, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2222 }, { "completion_length": 26.859375, "epoch": 0.3895216400911162, "grad_norm": 18.85145918214092, "kl": 0.0947265625, "learning_rate": 6.106535833187314e-07, "loss": -0.0632, "reward": 1.716736078262329, "reward_std": 0.26862505078315735, "rewards/accuracy_reward_stage2": 0.7636110782623291, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2223 }, { "completion_length": 15.46875, "epoch": 0.3896968635009637, "grad_norm": 19.158419110512753, "kl": 0.11474609375, "learning_rate": 6.104783599088838e-07, "loss": 0.0458, "reward": 1.3567907810211182, "reward_std": 0.20545431971549988, "rewards/accuracy_reward_stage2": 0.356790691614151, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2224 }, { "completion_length": 9.578125, "epoch": 0.38987208691081127, "grad_norm": 27.94855443881219, "kl": 0.146484375, "learning_rate": 6.103031364990362e-07, "loss": 0.0584, "reward": 1.5437999963760376, "reward_std": 0.16191495954990387, "rewards/accuracy_reward_stage2": 0.5437999367713928, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2225 }, { "completion_length": 12.84375, "epoch": 0.3900473103206588, "grad_norm": 21.47202574922326, "kl": 0.08154296875, "learning_rate": 6.101279130891887e-07, "loss": 0.0326, "reward": 1.4971905946731567, "reward_std": 0.20354795455932617, "rewards/accuracy_reward_stage2": 0.622190535068512, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2226 }, { "completion_length": 15.53125, "epoch": 0.3902225337305064, "grad_norm": 3199.861242021072, "kl": 16.5, "learning_rate": 6.099526896793411e-07, "loss": 6.5665, "reward": 1.61332368850708, "reward_std": 0.14749747514724731, "rewards/accuracy_reward_stage2": 0.7383236885070801, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2227 }, { "completion_length": 12.71875, "epoch": 0.39039775714035396, "grad_norm": 23.940551008858936, "kl": 0.2265625, "learning_rate": 6.097774662694936e-07, "loss": 0.0139, "reward": 1.565973162651062, "reward_std": 0.3491760492324829, "rewards/accuracy_reward_stage2": 0.5972232222557068, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2228 }, { "completion_length": 11.234375, "epoch": 0.3905729805502015, "grad_norm": 14.555579503011074, "kl": 0.08984375, "learning_rate": 6.09602242859646e-07, "loss": 0.036, "reward": 1.7619819641113281, "reward_std": 0.12717638909816742, "rewards/accuracy_reward_stage2": 0.7619818449020386, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2229 }, { "completion_length": 6.1875, "epoch": 0.39074820396004906, "grad_norm": 21.25334499700258, "kl": 0.16015625, "learning_rate": 6.094270194497984e-07, "loss": 0.0551, "reward": 1.6436500549316406, "reward_std": 0.27374887466430664, "rewards/accuracy_reward_stage2": 0.7686500549316406, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2230 }, { "completion_length": 13.078125, "epoch": 0.3909234273698966, "grad_norm": 20.036000201879755, "kl": 0.14453125, "learning_rate": 6.092517960399509e-07, "loss": 0.0287, "reward": 1.3101893663406372, "reward_std": 0.2083069086074829, "rewards/accuracy_reward_stage2": 0.3258143961429596, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2231 }, { "completion_length": 10.859375, "epoch": 0.39109865077974415, "grad_norm": 21.01595135113345, "kl": 0.11474609375, "learning_rate": 6.090765726301034e-07, "loss": -0.0198, "reward": 1.6504713296890259, "reward_std": 0.2545785903930664, "rewards/accuracy_reward_stage2": 0.6817213296890259, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2232 }, { "completion_length": 21.90625, "epoch": 0.39127387418959175, "grad_norm": 13.266211129359702, "kl": 0.04052734375, "learning_rate": 6.089013492202558e-07, "loss": -0.028, "reward": 1.5483436584472656, "reward_std": 0.0761621966958046, "rewards/accuracy_reward_stage2": 0.5639687180519104, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2233 }, { "completion_length": 10.96875, "epoch": 0.3914490975994393, "grad_norm": 23.78450389226553, "kl": 0.1162109375, "learning_rate": 6.087261258104083e-07, "loss": 0.0023, "reward": 1.8500159978866577, "reward_std": 0.21330755949020386, "rewards/accuracy_reward_stage2": 0.8656409382820129, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2234 }, { "completion_length": 11.3125, "epoch": 0.39162432100928685, "grad_norm": 15.860994793149466, "kl": 0.08984375, "learning_rate": 6.085509024005607e-07, "loss": -0.0012, "reward": 1.834155797958374, "reward_std": 0.12957896292209625, "rewards/accuracy_reward_stage2": 0.849780797958374, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2235 }, { "completion_length": 10.53125, "epoch": 0.3917995444191344, "grad_norm": 21.031488729143145, "kl": 0.146484375, "learning_rate": 6.083756789907132e-07, "loss": 0.023, "reward": 1.7553956508636475, "reward_std": 0.27307990193367004, "rewards/accuracy_reward_stage2": 0.7710205912590027, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2236 }, { "completion_length": 12.3125, "epoch": 0.39197476782898194, "grad_norm": 45.038068276481724, "kl": 0.052001953125, "learning_rate": 6.082004555808656e-07, "loss": -0.0234, "reward": 1.4747748374938965, "reward_std": 0.21759963035583496, "rewards/accuracy_reward_stage2": 0.6153998374938965, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2237 }, { "completion_length": 8.59375, "epoch": 0.3921499912388295, "grad_norm": 21.49837229610135, "kl": 0.054443359375, "learning_rate": 6.08025232171018e-07, "loss": 0.0218, "reward": 1.4371988773345947, "reward_std": 0.18646962940692902, "rewards/accuracy_reward_stage2": 0.4371989369392395, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2238 }, { "completion_length": 15.15625, "epoch": 0.3923252146486771, "grad_norm": 22.48222287881226, "kl": 0.1796875, "learning_rate": 6.078500087611704e-07, "loss": -0.0154, "reward": 1.4610028266906738, "reward_std": 0.27222740650177, "rewards/accuracy_reward_stage2": 0.6172528862953186, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2239 }, { "completion_length": 16.328125, "epoch": 0.39250043805852464, "grad_norm": 19.858045637418105, "kl": 0.11572265625, "learning_rate": 6.076747853513228e-07, "loss": 0.0123, "reward": 1.6191458702087402, "reward_std": 0.33029234409332275, "rewards/accuracy_reward_stage2": 0.6347708702087402, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2240 }, { "completion_length": 8.703125, "epoch": 0.3926756614683722, "grad_norm": 15.88710029637002, "kl": 0.1083984375, "learning_rate": 6.074995619414753e-07, "loss": -0.0384, "reward": 1.7707568407058716, "reward_std": 0.16039703786373138, "rewards/accuracy_reward_stage2": 0.8020068407058716, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2241 }, { "completion_length": 15.0, "epoch": 0.39285088487821973, "grad_norm": 24.84449475106468, "kl": 0.03662109375, "learning_rate": 6.073243385316278e-07, "loss": 0.0147, "reward": 1.3003172874450684, "reward_std": 0.2628664970397949, "rewards/accuracy_reward_stage2": 0.4253171682357788, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2242 }, { "completion_length": 28.109375, "epoch": 0.3930261082880673, "grad_norm": 20.842704108597864, "kl": 0.1025390625, "learning_rate": 6.071491151217802e-07, "loss": 0.041, "reward": 1.4179072380065918, "reward_std": 0.1976032704114914, "rewards/accuracy_reward_stage2": 0.5429072380065918, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2243 }, { "completion_length": 9.0625, "epoch": 0.3932013316979148, "grad_norm": 22.012758121864316, "kl": 0.08203125, "learning_rate": 6.069738917119327e-07, "loss": 0.0329, "reward": 1.6247165203094482, "reward_std": 0.14634189009666443, "rewards/accuracy_reward_stage2": 0.6247165203094482, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2244 }, { "completion_length": 14.953125, "epoch": 0.39337655510776237, "grad_norm": 24.990969384381177, "kl": 0.3125, "learning_rate": 6.067986683020852e-07, "loss": 0.1155, "reward": 1.501061201095581, "reward_std": 0.2306504249572754, "rewards/accuracy_reward_stage2": 0.641686201095581, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2245 }, { "completion_length": 9.765625, "epoch": 0.39355177851761, "grad_norm": 23.167817382707934, "kl": 0.052734375, "learning_rate": 6.066234448922376e-07, "loss": 0.0211, "reward": 1.5593750476837158, "reward_std": 0.2636833190917969, "rewards/accuracy_reward_stage2": 0.684374988079071, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2246 }, { "completion_length": 11.28125, "epoch": 0.3937270019274575, "grad_norm": 17.03462934298418, "kl": 0.0751953125, "learning_rate": 6.064482214823901e-07, "loss": 0.03, "reward": 1.4761418104171753, "reward_std": 0.2149282991886139, "rewards/accuracy_reward_stage2": 0.6011418104171753, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2247 }, { "completion_length": 10.0625, "epoch": 0.39390222533730507, "grad_norm": 23.729261446581663, "kl": 0.162109375, "learning_rate": 6.062729980725426e-07, "loss": -0.0235, "reward": 1.747470498085022, "reward_std": 0.27710628509521484, "rewards/accuracy_reward_stage2": 0.778720498085022, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2248 }, { "completion_length": 12.90625, "epoch": 0.3940774487471526, "grad_norm": 17.009665912805705, "kl": 0.076171875, "learning_rate": 6.060977746626949e-07, "loss": -0.0578, "reward": 1.5438098907470703, "reward_std": 0.2387259602546692, "rewards/accuracy_reward_stage2": 0.5750599503517151, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2249 }, { "completion_length": 11.484375, "epoch": 0.39425267215700016, "grad_norm": 17.876369069783316, "kl": 0.0257568359375, "learning_rate": 6.059225512528473e-07, "loss": 0.0103, "reward": 1.800662875175476, "reward_std": 0.18343248963356018, "rewards/accuracy_reward_stage2": 0.8006628751754761, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2250 }, { "completion_length": 13.65625, "epoch": 0.3944278955668477, "grad_norm": 17.8567731672947, "kl": 0.0166015625, "learning_rate": 6.057473278429997e-07, "loss": 0.0066, "reward": 1.554578423500061, "reward_std": 0.2657412886619568, "rewards/accuracy_reward_stage2": 0.679578423500061, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2251 }, { "completion_length": 7.46875, "epoch": 0.3946031189766953, "grad_norm": 21.025622163803355, "kl": 0.1298828125, "learning_rate": 6.055721044331522e-07, "loss": 0.0519, "reward": 1.5240048170089722, "reward_std": 0.25898078083992004, "rewards/accuracy_reward_stage2": 0.7740048766136169, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2252 }, { "completion_length": 14.46875, "epoch": 0.39477834238654286, "grad_norm": 14.991669037359548, "kl": 0.1845703125, "learning_rate": 6.053968810233047e-07, "loss": 0.0736, "reward": 1.6325411796569824, "reward_std": 0.10772719979286194, "rewards/accuracy_reward_stage2": 0.7575411200523376, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2253 }, { "completion_length": 9.609375, "epoch": 0.3949535657963904, "grad_norm": 20.17179782436066, "kl": 0.193359375, "learning_rate": 6.052216576134571e-07, "loss": 0.0773, "reward": 1.7336421012878418, "reward_std": 0.2646172046661377, "rewards/accuracy_reward_stage2": 0.7336422204971313, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2254 }, { "completion_length": 9.421875, "epoch": 0.39512878920623795, "grad_norm": 22.18340436662211, "kl": 0.0859375, "learning_rate": 6.050464342036096e-07, "loss": -0.0032, "reward": 1.788942575454712, "reward_std": 0.17048048973083496, "rewards/accuracy_reward_stage2": 0.8045675158500671, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2255 }, { "completion_length": 10.53125, "epoch": 0.3953040126160855, "grad_norm": 21.908798401167978, "kl": 0.07763671875, "learning_rate": 6.04871210793762e-07, "loss": 0.0022, "reward": 1.2886775732040405, "reward_std": 0.23748990893363953, "rewards/accuracy_reward_stage2": 0.3043026328086853, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2256 }, { "completion_length": 9.015625, "epoch": 0.39547923602593305, "grad_norm": 14.891452971280476, "kl": 0.1376953125, "learning_rate": 6.046959873839145e-07, "loss": 0.055, "reward": 1.643110990524292, "reward_std": 0.08917830139398575, "rewards/accuracy_reward_stage2": 0.768110990524292, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2257 }, { "completion_length": 24.640625, "epoch": 0.39565445943578065, "grad_norm": 18.532267801307796, "kl": 0.06689453125, "learning_rate": 6.04520763974067e-07, "loss": -0.0174, "reward": 1.3986705541610718, "reward_std": 0.2709914743900299, "rewards/accuracy_reward_stage2": 0.5392955541610718, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2258 }, { "completion_length": 7.75, "epoch": 0.3958296828456282, "grad_norm": 19.705860118413078, "kl": 0.14453125, "learning_rate": 6.043455405642193e-07, "loss": 0.0578, "reward": 1.7622469663619995, "reward_std": 0.16544394195079803, "rewards/accuracy_reward_stage2": 0.7622469663619995, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2259 }, { "completion_length": 20.859375, "epoch": 0.39600490625547574, "grad_norm": 17.54658164701785, "kl": 0.0478515625, "learning_rate": 6.041703171543718e-07, "loss": 0.0191, "reward": 1.5218894481658936, "reward_std": 0.16388002038002014, "rewards/accuracy_reward_stage2": 0.6468895673751831, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2260 }, { "completion_length": 9.71875, "epoch": 0.3961801296653233, "grad_norm": 15.360252803195012, "kl": 0.0556640625, "learning_rate": 6.039950937445243e-07, "loss": 0.0222, "reward": 1.729975938796997, "reward_std": 0.08503744006156921, "rewards/accuracy_reward_stage2": 0.8549759387969971, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2261 }, { "completion_length": 7.609375, "epoch": 0.39635535307517084, "grad_norm": 21.248695797842434, "kl": 0.0908203125, "learning_rate": 6.038198703346767e-07, "loss": 0.0363, "reward": 1.5866072177886963, "reward_std": 0.25131991505622864, "rewards/accuracy_reward_stage2": 0.7116071581840515, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2262 }, { "completion_length": 12.5625, "epoch": 0.3965305764850184, "grad_norm": 25.841012280803945, "kl": 0.10498046875, "learning_rate": 6.036446469248291e-07, "loss": 0.0421, "reward": 1.6510810852050781, "reward_std": 0.21648067235946655, "rewards/accuracy_reward_stage2": 0.6510810852050781, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2263 }, { "completion_length": 10.796875, "epoch": 0.39670579989486593, "grad_norm": 26.90376670754385, "kl": 0.28515625, "learning_rate": 6.034694235149815e-07, "loss": 0.1143, "reward": 1.5416667461395264, "reward_std": 0.26196783781051636, "rewards/accuracy_reward_stage2": 0.6666666865348816, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2264 }, { "completion_length": 14.5, "epoch": 0.39688102330471353, "grad_norm": 17.25363604070593, "kl": 0.1435546875, "learning_rate": 6.03294200105134e-07, "loss": 0.0575, "reward": 1.4917428493499756, "reward_std": 0.14269062876701355, "rewards/accuracy_reward_stage2": 0.616742730140686, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2265 }, { "completion_length": 7.953125, "epoch": 0.3970562467145611, "grad_norm": 24.496076146044928, "kl": 0.1142578125, "learning_rate": 6.031189766952865e-07, "loss": 0.0456, "reward": 1.4624096155166626, "reward_std": 0.2695634663105011, "rewards/accuracy_reward_stage2": 0.5874096751213074, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2266 }, { "completion_length": 9.890625, "epoch": 0.3972314701244086, "grad_norm": 20.884483065722286, "kl": 0.10107421875, "learning_rate": 6.029437532854389e-07, "loss": 0.0405, "reward": 1.7499773502349854, "reward_std": 0.29926127195358276, "rewards/accuracy_reward_stage2": 0.7499772310256958, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2267 }, { "completion_length": 8.53125, "epoch": 0.3974066935342562, "grad_norm": 11.781013559961279, "kl": 0.0162353515625, "learning_rate": 6.027685298755914e-07, "loss": 0.0065, "reward": 1.59375, "reward_std": 0.10888782143592834, "rewards/accuracy_reward_stage2": 0.59375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2268 }, { "completion_length": 7.828125, "epoch": 0.3975819169441037, "grad_norm": 21.6861325693261, "kl": 0.07421875, "learning_rate": 6.025933064657438e-07, "loss": -0.002, "reward": 1.6320500373840332, "reward_std": 0.21844886243343353, "rewards/accuracy_reward_stage2": 0.6476749777793884, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2269 }, { "completion_length": 10.78125, "epoch": 0.39775714035395127, "grad_norm": 23.714687842214964, "kl": 0.1748046875, "learning_rate": 6.024180830558962e-07, "loss": 0.0698, "reward": 1.3585705757141113, "reward_std": 0.2857874035835266, "rewards/accuracy_reward_stage2": 0.3585706651210785, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2270 }, { "completion_length": 10.28125, "epoch": 0.39793236376379887, "grad_norm": 14.626456891793655, "kl": 0.041015625, "learning_rate": 6.022428596460487e-07, "loss": 0.0164, "reward": 1.2995471954345703, "reward_std": 0.20875424146652222, "rewards/accuracy_reward_stage2": 0.5495471954345703, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2271 }, { "completion_length": 8.046875, "epoch": 0.3981075871736464, "grad_norm": 17.670543866931045, "kl": 0.12109375, "learning_rate": 6.020676362362011e-07, "loss": 0.0484, "reward": 1.6284711360931396, "reward_std": 0.2129313349723816, "rewards/accuracy_reward_stage2": 0.6284710764884949, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2272 }, { "completion_length": 7.96875, "epoch": 0.39828281058349396, "grad_norm": 15.718648701678724, "kl": 0.09619140625, "learning_rate": 6.018924128263536e-07, "loss": -0.0058, "reward": 1.484375, "reward_std": 0.22673699259757996, "rewards/accuracy_reward_stage2": 0.5, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2273 }, { "completion_length": 8.921875, "epoch": 0.3984580339933415, "grad_norm": 16.76238405847528, "kl": 0.15234375, "learning_rate": 6.017171894165061e-07, "loss": 0.0169, "reward": 1.5914230346679688, "reward_std": 0.10464628040790558, "rewards/accuracy_reward_stage2": 0.6070479154586792, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2274 }, { "completion_length": 14.640625, "epoch": 0.39863325740318906, "grad_norm": 19.55876690771695, "kl": 0.08642578125, "learning_rate": 6.015419660066584e-07, "loss": 0.0345, "reward": 1.808529257774353, "reward_std": 0.10393321514129639, "rewards/accuracy_reward_stage2": 0.808529257774353, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2275 }, { "completion_length": 14.078125, "epoch": 0.3988084808130366, "grad_norm": 25.976134613056775, "kl": 0.330078125, "learning_rate": 6.013667425968109e-07, "loss": 0.0878, "reward": 1.2277390956878662, "reward_std": 0.2750273048877716, "rewards/accuracy_reward_stage2": 0.49336421489715576, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2276 }, { "completion_length": 10.359375, "epoch": 0.39898370422288415, "grad_norm": 25.334821129538327, "kl": 0.1015625, "learning_rate": 6.011915191869634e-07, "loss": 0.0408, "reward": 1.575644850730896, "reward_std": 0.2729370892047882, "rewards/accuracy_reward_stage2": 0.575644850730896, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2277 }, { "completion_length": 17.734375, "epoch": 0.39915892763273175, "grad_norm": 21.873042456168918, "kl": 0.224609375, "learning_rate": 6.010162957771157e-07, "loss": 0.09, "reward": 1.3560564517974854, "reward_std": 0.22519102692604065, "rewards/accuracy_reward_stage2": 0.48105645179748535, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2278 }, { "completion_length": 13.09375, "epoch": 0.3993341510425793, "grad_norm": 27.114162602146166, "kl": 0.16796875, "learning_rate": 6.008410723672682e-07, "loss": 0.0675, "reward": 1.3410875797271729, "reward_std": 0.1357228308916092, "rewards/accuracy_reward_stage2": 0.4660876393318176, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2279 }, { "completion_length": 10.71875, "epoch": 0.39950937445242685, "grad_norm": 16.282857035884422, "kl": 0.12353515625, "learning_rate": 6.006658489574206e-07, "loss": 0.0495, "reward": 1.3618073463439941, "reward_std": 0.17284999787807465, "rewards/accuracy_reward_stage2": 0.4868074059486389, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2280 }, { "completion_length": 9.21875, "epoch": 0.3996845978622744, "grad_norm": 22.962387664587848, "kl": 0.1708984375, "learning_rate": 6.004906255475731e-07, "loss": 0.0684, "reward": 1.4452757835388184, "reward_std": 0.22587110102176666, "rewards/accuracy_reward_stage2": 0.44527584314346313, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2281 }, { "completion_length": 6.796875, "epoch": 0.39985982127212194, "grad_norm": 11.122280502077027, "kl": 0.107421875, "learning_rate": 6.003154021377256e-07, "loss": 0.0058, "reward": 1.4873359203338623, "reward_std": 0.08620868623256683, "rewards/accuracy_reward_stage2": 0.6279608607292175, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2282 }, { "completion_length": 9.828125, "epoch": 0.4000350446819695, "grad_norm": 18.48475644218214, "kl": 0.06982421875, "learning_rate": 6.00140178727878e-07, "loss": -0.037, "reward": 1.5395770072937012, "reward_std": 0.3086152970790863, "rewards/accuracy_reward_stage2": 0.5708270072937012, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2283 }, { "completion_length": 12.078125, "epoch": 0.4002102680918171, "grad_norm": 20.75370415639769, "kl": 0.2001953125, "learning_rate": 5.999649553180305e-07, "loss": 0.0359, "reward": 1.367240309715271, "reward_std": 0.2471608817577362, "rewards/accuracy_reward_stage2": 0.507865309715271, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2284 }, { "completion_length": 8.75, "epoch": 0.40038549150166464, "grad_norm": 20.710375361431648, "kl": 0.09619140625, "learning_rate": 5.99789731908183e-07, "loss": -0.0057, "reward": 1.6774652004241943, "reward_std": 0.2268453687429428, "rewards/accuracy_reward_stage2": 0.6930902004241943, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2285 }, { "completion_length": 8.09375, "epoch": 0.4005607149115122, "grad_norm": 18.298710329293545, "kl": 0.04296875, "learning_rate": 5.996145084983354e-07, "loss": -0.0038, "reward": 1.4890004396438599, "reward_std": 0.1191844642162323, "rewards/accuracy_reward_stage2": 0.5046254992485046, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2286 }, { "completion_length": 9.78125, "epoch": 0.40073593832135973, "grad_norm": 16.98439905746384, "kl": 0.048583984375, "learning_rate": 5.994392850884879e-07, "loss": 0.0194, "reward": 1.4807779788970947, "reward_std": 0.18450888991355896, "rewards/accuracy_reward_stage2": 0.4807780086994171, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2287 }, { "completion_length": 16.03125, "epoch": 0.4009111617312073, "grad_norm": 12.240341016748312, "kl": 0.052001953125, "learning_rate": 5.992640616786401e-07, "loss": -0.0126, "reward": 1.375, "reward_std": 0.1552036553621292, "rewards/accuracy_reward_stage2": 0.515625, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2288 }, { "completion_length": 7.1875, "epoch": 0.4010863851410548, "grad_norm": 19.206907098055197, "kl": 0.1474609375, "learning_rate": 5.990888382687926e-07, "loss": 0.0146, "reward": 1.5250904560089111, "reward_std": 0.2802599370479584, "rewards/accuracy_reward_stage2": 0.5407153964042664, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2289 }, { "completion_length": 10.21875, "epoch": 0.4012616085509024, "grad_norm": 16.23780482715037, "kl": 0.1015625, "learning_rate": 5.989136148589451e-07, "loss": -0.0369, "reward": 1.2621527910232544, "reward_std": 0.25270047783851624, "rewards/accuracy_reward_stage2": 0.4027777910232544, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2290 }, { "completion_length": 11.046875, "epoch": 0.40143683196075, "grad_norm": 39.776298461562874, "kl": 0.1328125, "learning_rate": 5.987383914490975e-07, "loss": 0.0088, "reward": 1.7533235549926758, "reward_std": 0.19562682509422302, "rewards/accuracy_reward_stage2": 0.8939485549926758, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2291 }, { "completion_length": 27.515625, "epoch": 0.4016120553705975, "grad_norm": 22.133452025655327, "kl": 0.0908203125, "learning_rate": 5.9856316803925e-07, "loss": 0.0364, "reward": 1.5148825645446777, "reward_std": 0.26382148265838623, "rewards/accuracy_reward_stage2": 0.5148825645446777, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2292 }, { "completion_length": 9.96875, "epoch": 0.40178727878044507, "grad_norm": 14.090229255752924, "kl": 0.057373046875, "learning_rate": 5.983879446294025e-07, "loss": 0.0229, "reward": 1.6614583730697632, "reward_std": 0.12609022855758667, "rewards/accuracy_reward_stage2": 0.6614583134651184, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2293 }, { "completion_length": 16.15625, "epoch": 0.4019625021902926, "grad_norm": 24.598171444560585, "kl": 0.0810546875, "learning_rate": 5.982127212195549e-07, "loss": 0.0324, "reward": 1.4326512813568115, "reward_std": 0.24987728893756866, "rewards/accuracy_reward_stage2": 0.43265122175216675, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2294 }, { "completion_length": 8.765625, "epoch": 0.40213772560014016, "grad_norm": 23.768059758477552, "kl": 0.193359375, "learning_rate": 5.980374978097074e-07, "loss": 0.0021, "reward": 1.7757015228271484, "reward_std": 0.31451430916786194, "rewards/accuracy_reward_stage2": 0.8069514036178589, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2295 }, { "completion_length": 12.859375, "epoch": 0.4023129490099877, "grad_norm": 38.013862095694, "kl": 0.130859375, "learning_rate": 5.978622743998598e-07, "loss": 0.0117, "reward": 1.4523301124572754, "reward_std": 0.23838970065116882, "rewards/accuracy_reward_stage2": 0.7179551124572754, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2296 }, { "completion_length": 13.703125, "epoch": 0.4024881724198353, "grad_norm": 1750.6093752011525, "kl": 7.34375, "learning_rate": 5.976870509900123e-07, "loss": 2.9339, "reward": 1.464247703552246, "reward_std": 0.19597555696964264, "rewards/accuracy_reward_stage2": 0.5892475843429565, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2297 }, { "completion_length": 7.96875, "epoch": 0.40266339582968286, "grad_norm": 17.351812722344764, "kl": 0.123046875, "learning_rate": 5.975118275801648e-07, "loss": 0.0051, "reward": 1.6678106784820557, "reward_std": 0.18729940056800842, "rewards/accuracy_reward_stage2": 0.6834356784820557, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2298 }, { "completion_length": 19.296875, "epoch": 0.4028386192395304, "grad_norm": 16.46383858884921, "kl": 0.04150390625, "learning_rate": 5.973366041703171e-07, "loss": -0.0025, "reward": 1.6293494701385498, "reward_std": 0.12097081542015076, "rewards/accuracy_reward_stage2": 0.6449744701385498, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2299 }, { "completion_length": 5.984375, "epoch": 0.40301384264937795, "grad_norm": 44.639775379090395, "kl": 0.154296875, "learning_rate": 5.971613807604696e-07, "loss": 0.062, "reward": 1.7036259174346924, "reward_std": 0.2740706503391266, "rewards/accuracy_reward_stage2": 0.7036257982254028, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2300 }, { "completion_length": 10.21875, "epoch": 0.4031890660592255, "grad_norm": 16.01770704561227, "kl": 0.044189453125, "learning_rate": 5.969861573506219e-07, "loss": 0.0177, "reward": 1.8585901260375977, "reward_std": 0.14846956729888916, "rewards/accuracy_reward_stage2": 0.8585900068283081, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2301 }, { "completion_length": 7.4375, "epoch": 0.40336428946907305, "grad_norm": 21.29640137549983, "kl": 0.1064453125, "learning_rate": 5.968109339407744e-07, "loss": 0.0426, "reward": 1.6757261753082275, "reward_std": 0.28504133224487305, "rewards/accuracy_reward_stage2": 0.6757262349128723, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2302 }, { "completion_length": 10.34375, "epoch": 0.40353951287892065, "grad_norm": 20.686258071687735, "kl": 0.076171875, "learning_rate": 5.966357105309269e-07, "loss": 0.0025, "reward": 1.31874418258667, "reward_std": 0.239344522356987, "rewards/accuracy_reward_stage2": 0.4593692123889923, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2303 }, { "completion_length": 12.9375, "epoch": 0.4037147362887682, "grad_norm": 27.775159769904608, "kl": 0.2314453125, "learning_rate": 5.964604871210793e-07, "loss": 0.0925, "reward": 1.5497610569000244, "reward_std": 0.1822834312915802, "rewards/accuracy_reward_stage2": 0.6747609972953796, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2304 }, { "completion_length": 12.25, "epoch": 0.40388995969861574, "grad_norm": 17.129685183594717, "kl": 0.080078125, "learning_rate": 5.962852637112318e-07, "loss": -0.0226, "reward": 1.566606044769287, "reward_std": 0.20607224106788635, "rewards/accuracy_reward_stage2": 0.5978560447692871, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2305 }, { "completion_length": 24.8125, "epoch": 0.4040651831084633, "grad_norm": 626.6167581526231, "kl": 5.28125, "learning_rate": 5.961100403013843e-07, "loss": 2.0717, "reward": 1.5107133388519287, "reward_std": 0.1535727083683014, "rewards/accuracy_reward_stage2": 0.6513383984565735, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2306 }, { "completion_length": 19.75, "epoch": 0.40424040651831084, "grad_norm": 58.60220982847609, "kl": 0.455078125, "learning_rate": 5.959348168915367e-07, "loss": 0.1376, "reward": 1.3191068172454834, "reward_std": 0.22959591448307037, "rewards/accuracy_reward_stage2": 0.5847317576408386, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2307 }, { "completion_length": 10.140625, "epoch": 0.4044156299281584, "grad_norm": 24.806034726052975, "kl": 0.31640625, "learning_rate": 5.957595934816891e-07, "loss": 0.0931, "reward": 1.4665522575378418, "reward_std": 0.3047543168067932, "rewards/accuracy_reward_stage2": 0.607177197933197, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2308 }, { "completion_length": 7.09375, "epoch": 0.404590853338006, "grad_norm": 16.157756302348197, "kl": 0.08642578125, "learning_rate": 5.955843700718416e-07, "loss": 0.0346, "reward": 1.6599851846694946, "reward_std": 0.08853545039892197, "rewards/accuracy_reward_stage2": 0.6599851250648499, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2309 }, { "completion_length": 13.234375, "epoch": 0.40476607674785353, "grad_norm": 50.421575985750216, "kl": 0.1484375, "learning_rate": 5.95409146661994e-07, "loss": 0.0594, "reward": 1.6158902645111084, "reward_std": 0.39847201108932495, "rewards/accuracy_reward_stage2": 0.6158903241157532, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2310 }, { "completion_length": 9.171875, "epoch": 0.4049413001577011, "grad_norm": 21.215574229033695, "kl": 0.109375, "learning_rate": 5.952339232521465e-07, "loss": 0.0089, "reward": 1.5476425886154175, "reward_std": 0.2785415053367615, "rewards/accuracy_reward_stage2": 0.563267707824707, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2311 }, { "completion_length": 6.34375, "epoch": 0.4051165235675486, "grad_norm": 19.026066267211252, "kl": 0.185546875, "learning_rate": 5.950586998422989e-07, "loss": 0.0298, "reward": 1.668402910232544, "reward_std": 0.25595739483833313, "rewards/accuracy_reward_stage2": 0.6840278506278992, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2312 }, { "completion_length": 9.15625, "epoch": 0.4052917469773962, "grad_norm": 18.9975286170903, "kl": 0.12255859375, "learning_rate": 5.948834764324514e-07, "loss": 0.0489, "reward": 1.6058623790740967, "reward_std": 0.17858222126960754, "rewards/accuracy_reward_stage2": 0.6058623790740967, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2313 }, { "completion_length": 7.109375, "epoch": 0.4054669703872437, "grad_norm": 21.660857721388656, "kl": 0.208984375, "learning_rate": 5.947082530226038e-07, "loss": -0.0046, "reward": 1.3151360750198364, "reward_std": 0.3293163776397705, "rewards/accuracy_reward_stage2": 0.3463861048221588, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2314 }, { "completion_length": 9.234375, "epoch": 0.40564219379709127, "grad_norm": 25.92194524304459, "kl": 0.259765625, "learning_rate": 5.945330296127562e-07, "loss": 0.1021, "reward": 1.5383646488189697, "reward_std": 0.32094958424568176, "rewards/accuracy_reward_stage2": 0.6633646488189697, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2315 }, { "completion_length": 13.359375, "epoch": 0.40581741720693887, "grad_norm": 134.6202242774508, "kl": 0.37890625, "learning_rate": 5.943578062029087e-07, "loss": 0.1511, "reward": 1.5572917461395264, "reward_std": 0.22779878973960876, "rewards/accuracy_reward_stage2": 0.6822916269302368, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2316 }, { "completion_length": 10.6875, "epoch": 0.4059926406167864, "grad_norm": 17.41673028501159, "kl": 0.0576171875, "learning_rate": 5.941825827930611e-07, "loss": 0.0231, "reward": 1.4323480129241943, "reward_std": 0.2165302187204361, "rewards/accuracy_reward_stage2": 0.43234801292419434, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2317 }, { "completion_length": 10.5, "epoch": 0.40616786402663396, "grad_norm": 19.564501287263905, "kl": 0.1201171875, "learning_rate": 5.940073593832135e-07, "loss": 0.0483, "reward": 1.4573872089385986, "reward_std": 0.2566508650779724, "rewards/accuracy_reward_stage2": 0.5823871493339539, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2318 }, { "completion_length": 8.015625, "epoch": 0.4063430874364815, "grad_norm": 13.005218537193727, "kl": 0.1376953125, "learning_rate": 5.93832135973366e-07, "loss": -0.0747, "reward": 1.7554097175598145, "reward_std": 0.21100175380706787, "rewards/accuracy_reward_stage2": 0.8022847771644592, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2319 }, { "completion_length": 9.484375, "epoch": 0.40651831084632906, "grad_norm": 19.67616530990649, "kl": 0.064453125, "learning_rate": 5.936569125635184e-07, "loss": 0.0259, "reward": 1.3368675708770752, "reward_std": 0.3198486864566803, "rewards/accuracy_reward_stage2": 0.5868675708770752, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2320 }, { "completion_length": 15.015625, "epoch": 0.4066935342561766, "grad_norm": 15.25698057189163, "kl": 0.1123046875, "learning_rate": 5.934816891536709e-07, "loss": 0.0017, "reward": 1.3410911560058594, "reward_std": 0.07204613089561462, "rewards/accuracy_reward_stage2": 0.6067162752151489, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2321 }, { "completion_length": 9.078125, "epoch": 0.4068687576660242, "grad_norm": 19.99227162306004, "kl": 0.10546875, "learning_rate": 5.933064657438234e-07, "loss": 0.0133, "reward": 1.8546524047851562, "reward_std": 0.1757163405418396, "rewards/accuracy_reward_stage2": 0.8702772855758667, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2322 }, { "completion_length": 9.171875, "epoch": 0.40704398107587175, "grad_norm": 22.055503664853624, "kl": 0.095703125, "learning_rate": 5.931312423339758e-07, "loss": -0.0058, "reward": 1.2581727504730225, "reward_std": 0.18284207582473755, "rewards/accuracy_reward_stage2": 0.39879778027534485, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2323 }, { "completion_length": 10.296875, "epoch": 0.4072192044857193, "grad_norm": 20.67006780550727, "kl": 0.07763671875, "learning_rate": 5.929560189241283e-07, "loss": -0.0129, "reward": 1.6703296899795532, "reward_std": 0.3147152066230774, "rewards/accuracy_reward_stage2": 0.6859546899795532, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2324 }, { "completion_length": 8.609375, "epoch": 0.40739442789556685, "grad_norm": 22.610598881736344, "kl": 0.1533203125, "learning_rate": 5.927807955142807e-07, "loss": 0.0005, "reward": 1.388228416442871, "reward_std": 0.27816927433013916, "rewards/accuracy_reward_stage2": 0.5444784164428711, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2325 }, { "completion_length": 9.5625, "epoch": 0.4075696513054144, "grad_norm": 15.881553175034364, "kl": 0.09912109375, "learning_rate": 5.926055721044331e-07, "loss": 0.0023, "reward": 1.7211300134658813, "reward_std": 0.22270438075065613, "rewards/accuracy_reward_stage2": 0.7367550134658813, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2326 }, { "completion_length": 7.59375, "epoch": 0.40774487471526194, "grad_norm": 15.311621032584444, "kl": 0.051513671875, "learning_rate": 5.924303486945856e-07, "loss": 0.0206, "reward": 1.537257194519043, "reward_std": 0.19294340908527374, "rewards/accuracy_reward_stage2": 0.537257194519043, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2327 }, { "completion_length": 9.390625, "epoch": 0.4079200981251095, "grad_norm": 17.162111824963418, "kl": 0.12890625, "learning_rate": 5.922551252847379e-07, "loss": 0.0073, "reward": 1.4032111167907715, "reward_std": 0.16666026413440704, "rewards/accuracy_reward_stage2": 0.4188360273838043, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2328 }, { "completion_length": 9.03125, "epoch": 0.4080953215349571, "grad_norm": 21.17231097602088, "kl": 0.2021484375, "learning_rate": 5.920799018748904e-07, "loss": 0.0426, "reward": 1.358152151107788, "reward_std": 0.20485907793045044, "rewards/accuracy_reward_stage2": 0.3737771511077881, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2329 }, { "completion_length": 12.265625, "epoch": 0.40827054494480464, "grad_norm": 30.753156072640614, "kl": 0.25, "learning_rate": 5.919046784650429e-07, "loss": 0.1002, "reward": 1.5117536783218384, "reward_std": 0.21533477306365967, "rewards/accuracy_reward_stage2": 0.7617536783218384, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2330 }, { "completion_length": 16.671875, "epoch": 0.4084457683546522, "grad_norm": 16.282888838198275, "kl": 0.031005859375, "learning_rate": 5.917294550551953e-07, "loss": 0.0124, "reward": 1.5380942821502686, "reward_std": 0.24841409921646118, "rewards/accuracy_reward_stage2": 0.5380942821502686, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2331 }, { "completion_length": 17.609375, "epoch": 0.40862099176449973, "grad_norm": 24.394382354395756, "kl": 0.052001953125, "learning_rate": 5.915542316453478e-07, "loss": 0.0208, "reward": 1.5208165645599365, "reward_std": 0.16024133563041687, "rewards/accuracy_reward_stage2": 0.5208166241645813, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2332 }, { "completion_length": 8.875, "epoch": 0.4087962151743473, "grad_norm": 18.123673916162268, "kl": 0.10107421875, "learning_rate": 5.913790082355002e-07, "loss": 0.0404, "reward": 1.6102049350738525, "reward_std": 0.22374418377876282, "rewards/accuracy_reward_stage2": 0.7352049350738525, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2333 }, { "completion_length": 7.75, "epoch": 0.4089714385841948, "grad_norm": 18.41739347695288, "kl": 0.0859375, "learning_rate": 5.912037848256527e-07, "loss": 0.0343, "reward": 1.2719743251800537, "reward_std": 0.19079017639160156, "rewards/accuracy_reward_stage2": 0.39697423577308655, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2334 }, { "completion_length": 9.609375, "epoch": 0.4091466619940424, "grad_norm": 21.074570496495934, "kl": 0.049072265625, "learning_rate": 5.910285614158052e-07, "loss": 0.0196, "reward": 1.3504436016082764, "reward_std": 0.2122042328119278, "rewards/accuracy_reward_stage2": 0.47544366121292114, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2335 }, { "completion_length": 22.625, "epoch": 0.40932188540389, "grad_norm": 19.930927351670906, "kl": 0.1630859375, "learning_rate": 5.908533380059576e-07, "loss": 0.0238, "reward": 1.6350904703140259, "reward_std": 0.2252962738275528, "rewards/accuracy_reward_stage2": 0.6507154703140259, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2336 }, { "completion_length": 6.921875, "epoch": 0.4094971088137375, "grad_norm": 13.86422551326349, "kl": 0.1845703125, "learning_rate": 5.906781145961101e-07, "loss": -0.0588, "reward": 1.7604167461395264, "reward_std": 0.19533005356788635, "rewards/accuracy_reward_stage2": 0.8072916865348816, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2337 }, { "completion_length": 8.78125, "epoch": 0.40967233222358507, "grad_norm": 18.223599790270328, "kl": 0.0693359375, "learning_rate": 5.905028911862626e-07, "loss": -0.0035, "reward": 1.4242560863494873, "reward_std": 0.21478287875652313, "rewards/accuracy_reward_stage2": 0.43988117575645447, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2338 }, { "completion_length": 14.75, "epoch": 0.4098475556334326, "grad_norm": 20.60659264277177, "kl": 0.04150390625, "learning_rate": 5.903276677764148e-07, "loss": 0.0166, "reward": 1.7482521533966064, "reward_std": 0.23158448934555054, "rewards/accuracy_reward_stage2": 0.7482522130012512, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2339 }, { "completion_length": 8.03125, "epoch": 0.41002277904328016, "grad_norm": 17.999935466107047, "kl": 0.046142578125, "learning_rate": 5.901524443665673e-07, "loss": 0.0184, "reward": 1.4947917461395264, "reward_std": 0.2120075523853302, "rewards/accuracy_reward_stage2": 0.4947916865348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2340 }, { "completion_length": 13.9375, "epoch": 0.41019800245312776, "grad_norm": 12.194046839176925, "kl": 0.134765625, "learning_rate": 5.899772209567197e-07, "loss": 0.0121, "reward": 1.36506986618042, "reward_std": 0.1311139315366745, "rewards/accuracy_reward_stage2": 0.6306947469711304, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2341 }, { "completion_length": 9.859375, "epoch": 0.4103732258629753, "grad_norm": 19.060395869898677, "kl": 0.0927734375, "learning_rate": 5.898019975468722e-07, "loss": 0.0371, "reward": 1.7756450176239014, "reward_std": 0.15166711807250977, "rewards/accuracy_reward_stage2": 0.7756450176239014, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2342 }, { "completion_length": 10.359375, "epoch": 0.41054844927282286, "grad_norm": 19.750944712631785, "kl": 0.1142578125, "learning_rate": 5.896267741370247e-07, "loss": 0.0457, "reward": 1.4291049242019653, "reward_std": 0.158258855342865, "rewards/accuracy_reward_stage2": 0.5541049242019653, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2343 }, { "completion_length": 9.390625, "epoch": 0.4107236726826704, "grad_norm": 20.40117083117346, "kl": 0.1357421875, "learning_rate": 5.894515507271771e-07, "loss": 0.0255, "reward": 1.6197458505630493, "reward_std": 0.26166221499443054, "rewards/accuracy_reward_stage2": 0.6353708505630493, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2344 }, { "completion_length": 9.390625, "epoch": 0.41089889609251795, "grad_norm": 13.735586804906264, "kl": 0.0576171875, "learning_rate": 5.892763273173296e-07, "loss": -0.0124, "reward": 1.5586662292480469, "reward_std": 0.187656432390213, "rewards/accuracy_reward_stage2": 0.5742912888526917, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2345 }, { "completion_length": 18.46875, "epoch": 0.4110741195023655, "grad_norm": 15.581619516175259, "kl": 0.392578125, "learning_rate": 5.891011039074821e-07, "loss": 0.1548, "reward": 1.738080620765686, "reward_std": 0.13184432685375214, "rewards/accuracy_reward_stage2": 0.863080620765686, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2346 }, { "completion_length": 19.359375, "epoch": 0.41124934291221305, "grad_norm": 19.30749538544802, "kl": 0.060546875, "learning_rate": 5.889258804976345e-07, "loss": -0.0199, "reward": 1.4864542484283447, "reward_std": 0.16072504222393036, "rewards/accuracy_reward_stage2": 0.6270792484283447, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2347 }, { "completion_length": 15.796875, "epoch": 0.41142456632206065, "grad_norm": 14.788056029744874, "kl": 0.06689453125, "learning_rate": 5.887506570877869e-07, "loss": 0.0268, "reward": 1.5268596410751343, "reward_std": 0.1403314173221588, "rewards/accuracy_reward_stage2": 0.5268596410751343, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2348 }, { "completion_length": 10.921875, "epoch": 0.4115997897319082, "grad_norm": 19.221941814879607, "kl": 0.24609375, "learning_rate": 5.885754336779393e-07, "loss": -0.0237, "reward": 1.5493875741958618, "reward_std": 0.2811095118522644, "rewards/accuracy_reward_stage2": 0.5962625741958618, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2349 }, { "completion_length": 16.515625, "epoch": 0.41177501314175574, "grad_norm": 8.794078311705457, "kl": 0.03125, "learning_rate": 5.884002102680918e-07, "loss": 0.0125, "reward": 1.5402517318725586, "reward_std": 0.015763016417622566, "rewards/accuracy_reward_stage2": 0.5402517318725586, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2350 }, { "completion_length": 13.828125, "epoch": 0.4119502365516033, "grad_norm": 17.140128237746744, "kl": 0.2236328125, "learning_rate": 5.882249868582443e-07, "loss": 0.0453, "reward": 1.393404245376587, "reward_std": 0.19191637635231018, "rewards/accuracy_reward_stage2": 0.6590292453765869, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2351 }, { "completion_length": 13.1875, "epoch": 0.41212545996145084, "grad_norm": 23.406708522628044, "kl": 0.2119140625, "learning_rate": 5.880497634483966e-07, "loss": -0.006, "reward": 1.6162077188491821, "reward_std": 0.24657636880874634, "rewards/accuracy_reward_stage2": 0.6630827188491821, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2352 }, { "completion_length": 10.125, "epoch": 0.4123006833712984, "grad_norm": 15.536105603528492, "kl": 0.095703125, "learning_rate": 5.878745400385491e-07, "loss": 0.0383, "reward": 1.2230710983276367, "reward_std": 0.16474489867687225, "rewards/accuracy_reward_stage2": 0.3480711877346039, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2353 }, { "completion_length": 17.515625, "epoch": 0.412475906781146, "grad_norm": 27.055096838392842, "kl": 0.08203125, "learning_rate": 5.876993166287016e-07, "loss": 0.033, "reward": 1.4038621187210083, "reward_std": 0.20572970807552338, "rewards/accuracy_reward_stage2": 0.5288619995117188, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2354 }, { "completion_length": 9.1875, "epoch": 0.41265113019099353, "grad_norm": 19.439410924795908, "kl": 0.1611328125, "learning_rate": 5.87524093218854e-07, "loss": 0.0247, "reward": 1.529209852218628, "reward_std": 0.2930196523666382, "rewards/accuracy_reward_stage2": 0.5448348522186279, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2355 }, { "completion_length": 9.890625, "epoch": 0.4128263536008411, "grad_norm": 19.132398622575774, "kl": 0.2080078125, "learning_rate": 5.873488698090065e-07, "loss": 0.0829, "reward": 1.3353736400604248, "reward_std": 0.12006399035453796, "rewards/accuracy_reward_stage2": 0.46037358045578003, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2356 }, { "completion_length": 8.625, "epoch": 0.4130015770106886, "grad_norm": 18.508285146546704, "kl": 0.099609375, "learning_rate": 5.871736463991589e-07, "loss": -0.0044, "reward": 1.7954076528549194, "reward_std": 0.16407202184200287, "rewards/accuracy_reward_stage2": 0.8110326528549194, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2357 }, { "completion_length": 7.703125, "epoch": 0.41317680042053617, "grad_norm": 13.905171688530661, "kl": 0.04296875, "learning_rate": 5.869984229893113e-07, "loss": 0.0172, "reward": 1.6086777448654175, "reward_std": 0.10335144400596619, "rewards/accuracy_reward_stage2": 0.6086777448654175, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2358 }, { "completion_length": 7.5, "epoch": 0.4133520238303837, "grad_norm": 19.268524405558992, "kl": 0.1005859375, "learning_rate": 5.868231995794638e-07, "loss": -0.0039, "reward": 1.7492554187774658, "reward_std": 0.24843813478946686, "rewards/accuracy_reward_stage2": 0.7648804187774658, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2359 }, { "completion_length": 6.796875, "epoch": 0.4135272472402313, "grad_norm": 19.160492785539688, "kl": 0.189453125, "learning_rate": 5.866479761696162e-07, "loss": 0.0756, "reward": 1.198094129562378, "reward_std": 0.22037328779697418, "rewards/accuracy_reward_stage2": 0.3230942189693451, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2360 }, { "completion_length": 8.96875, "epoch": 0.41370247065007887, "grad_norm": 16.825110820402383, "kl": 0.09228515625, "learning_rate": 5.864727527597687e-07, "loss": 0.0089, "reward": 1.6092510223388672, "reward_std": 0.12170203030109406, "rewards/accuracy_reward_stage2": 0.6248759031295776, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2361 }, { "completion_length": 8.5, "epoch": 0.4138776940599264, "grad_norm": 7.999499917781706, "kl": 0.05126953125, "learning_rate": 5.862975293499212e-07, "loss": 0.0205, "reward": 1.359375, "reward_std": 0.04419417306780815, "rewards/accuracy_reward_stage2": 0.359375, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2362 }, { "completion_length": 13.21875, "epoch": 0.41405291746977396, "grad_norm": 21.393899908468846, "kl": 0.052001953125, "learning_rate": 5.861223059400736e-07, "loss": 0.0209, "reward": 1.5607662200927734, "reward_std": 0.2627609670162201, "rewards/accuracy_reward_stage2": 0.5607661008834839, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2363 }, { "completion_length": 11.78125, "epoch": 0.4142281408796215, "grad_norm": 22.023127107995297, "kl": 0.05908203125, "learning_rate": 5.859470825302261e-07, "loss": 0.0236, "reward": 1.6148502826690674, "reward_std": 0.16442753374576569, "rewards/accuracy_reward_stage2": 0.6148503422737122, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2364 }, { "completion_length": 9.03125, "epoch": 0.41440336428946906, "grad_norm": 14.362750051760493, "kl": 0.06787109375, "learning_rate": 5.857718591203784e-07, "loss": 0.0272, "reward": 1.6884300708770752, "reward_std": 0.07231159508228302, "rewards/accuracy_reward_stage2": 0.6884300112724304, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2365 }, { "completion_length": 22.828125, "epoch": 0.4145785876993166, "grad_norm": 23.613349877555066, "kl": 0.37890625, "learning_rate": 5.855966357105309e-07, "loss": 0.1514, "reward": 1.411705493927002, "reward_std": 0.31298691034317017, "rewards/accuracy_reward_stage2": 0.5367054343223572, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2366 }, { "completion_length": 6.6875, "epoch": 0.4147538111091642, "grad_norm": 20.00550996319013, "kl": 0.09423828125, "learning_rate": 5.854214123006834e-07, "loss": 0.0377, "reward": 1.5901319980621338, "reward_std": 0.18717949092388153, "rewards/accuracy_reward_stage2": 0.7151321172714233, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2367 }, { "completion_length": 12.265625, "epoch": 0.41492903451901175, "grad_norm": 17.797273669738534, "kl": 0.05126953125, "learning_rate": 5.852461888908357e-07, "loss": 0.0205, "reward": 1.3920097351074219, "reward_std": 0.1308312863111496, "rewards/accuracy_reward_stage2": 0.3920097351074219, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2368 }, { "completion_length": 11.859375, "epoch": 0.4151042579288593, "grad_norm": 26.299178173381538, "kl": 0.166015625, "learning_rate": 5.850709654809882e-07, "loss": 0.0593, "reward": 1.4359591007232666, "reward_std": 0.17541763186454773, "rewards/accuracy_reward_stage2": 0.685958981513977, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2369 }, { "completion_length": 17.78125, "epoch": 0.41527948133870685, "grad_norm": 14.485488856701455, "kl": 0.06640625, "learning_rate": 5.848957420711407e-07, "loss": -0.0014, "reward": 1.775770664215088, "reward_std": 0.10899923741817474, "rewards/accuracy_reward_stage2": 0.7913956046104431, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2370 }, { "completion_length": 27.265625, "epoch": 0.4154547047485544, "grad_norm": 17.881757409429884, "kl": 0.341796875, "learning_rate": 5.847205186612931e-07, "loss": 0.1376, "reward": 1.4621247053146362, "reward_std": 0.07601737231016159, "rewards/accuracy_reward_stage2": 0.5871245861053467, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2371 }, { "completion_length": 8.390625, "epoch": 0.41562992815840194, "grad_norm": 25.055742932493168, "kl": 0.0986328125, "learning_rate": 5.845452952514456e-07, "loss": 0.0395, "reward": 1.6540504693984985, "reward_std": 0.20673657953739166, "rewards/accuracy_reward_stage2": 0.6540504693984985, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2372 }, { "completion_length": 11.375, "epoch": 0.41580515156824954, "grad_norm": 16.521109231983434, "kl": 0.10546875, "learning_rate": 5.84370071841598e-07, "loss": 0.0421, "reward": 1.4801661968231201, "reward_std": 0.12700024247169495, "rewards/accuracy_reward_stage2": 0.4801662564277649, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2373 }, { "completion_length": 11.171875, "epoch": 0.4159803749780971, "grad_norm": 20.12090258507575, "kl": 0.07568359375, "learning_rate": 5.841948484317505e-07, "loss": 0.0302, "reward": 1.670166015625, "reward_std": 0.10102277249097824, "rewards/accuracy_reward_stage2": 0.6701659560203552, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2374 }, { "completion_length": 10.09375, "epoch": 0.41615559838794464, "grad_norm": 16.442767290434187, "kl": 0.12451171875, "learning_rate": 5.84019625021903e-07, "loss": 0.0499, "reward": 1.6291148662567139, "reward_std": 0.06639102101325989, "rewards/accuracy_reward_stage2": 0.7541148662567139, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2375 }, { "completion_length": 8.296875, "epoch": 0.4163308217977922, "grad_norm": 21.97629019067572, "kl": 0.1572265625, "learning_rate": 5.838444016120554e-07, "loss": 0.0202, "reward": 1.7240822315216064, "reward_std": 0.2035062313079834, "rewards/accuracy_reward_stage2": 0.7397072315216064, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2376 }, { "completion_length": 14.984375, "epoch": 0.41650604520763973, "grad_norm": 30.896256657541, "kl": 0.12158203125, "learning_rate": 5.836691782022077e-07, "loss": 0.0487, "reward": 1.5382190942764282, "reward_std": 0.3024444580078125, "rewards/accuracy_reward_stage2": 0.5382190942764282, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2377 }, { "completion_length": 10.125, "epoch": 0.4166812686174873, "grad_norm": 20.28747836466928, "kl": 0.11572265625, "learning_rate": 5.834939547923601e-07, "loss": 0.0462, "reward": 1.73995041847229, "reward_std": 0.20338395237922668, "rewards/accuracy_reward_stage2": 0.7399503588676453, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2378 }, { "completion_length": 10.609375, "epoch": 0.4168564920273349, "grad_norm": 13.97976660621444, "kl": 0.0196533203125, "learning_rate": 5.833187313825126e-07, "loss": 0.0079, "reward": 1.5437802076339722, "reward_std": 0.11976291984319687, "rewards/accuracy_reward_stage2": 0.6687802076339722, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2379 }, { "completion_length": 16.25, "epoch": 0.4170317154371824, "grad_norm": 16.29885843641377, "kl": 0.033447265625, "learning_rate": 5.831435079726651e-07, "loss": 0.0134, "reward": 1.239460825920105, "reward_std": 0.17725922167301178, "rewards/accuracy_reward_stage2": 0.2394607961177826, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2380 }, { "completion_length": 9.4375, "epoch": 0.41720693884703, "grad_norm": 17.520922343117828, "kl": 0.177734375, "learning_rate": 5.829682845628175e-07, "loss": 0.071, "reward": 1.414095163345337, "reward_std": 0.1704052984714508, "rewards/accuracy_reward_stage2": 0.41409510374069214, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2381 }, { "completion_length": 21.375, "epoch": 0.4173821622568775, "grad_norm": 17.758151692325338, "kl": 0.07666015625, "learning_rate": 5.8279306115297e-07, "loss": -0.0132, "reward": 1.573897123336792, "reward_std": 0.15478408336639404, "rewards/accuracy_reward_stage2": 0.589522123336792, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2382 }, { "completion_length": 10.6875, "epoch": 0.41755738566672507, "grad_norm": 21.532408390881507, "kl": 0.1416015625, "learning_rate": 5.826178377431225e-07, "loss": 0.0567, "reward": 1.5427581071853638, "reward_std": 0.1347476989030838, "rewards/accuracy_reward_stage2": 0.6677581071853638, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2383 }, { "completion_length": 13.84375, "epoch": 0.4177326090765726, "grad_norm": 16.64175548823734, "kl": 0.07080078125, "learning_rate": 5.824426143332749e-07, "loss": 0.0282, "reward": 1.6984035968780518, "reward_std": 0.1400546133518219, "rewards/accuracy_reward_stage2": 0.6984036564826965, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2384 }, { "completion_length": 10.765625, "epoch": 0.41790783248642016, "grad_norm": 16.949582968543215, "kl": 0.08544921875, "learning_rate": 5.822673909234274e-07, "loss": -0.0414, "reward": 1.7102024555206299, "reward_std": 0.28166982531547546, "rewards/accuracy_reward_stage2": 0.7414524555206299, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2385 }, { "completion_length": 11.203125, "epoch": 0.41808305589626776, "grad_norm": 12.914250997053147, "kl": 0.1337890625, "learning_rate": 5.820921675135799e-07, "loss": -0.0193, "reward": 1.5751521587371826, "reward_std": 0.10636032372713089, "rewards/accuracy_reward_stage2": 0.6064021587371826, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2386 }, { "completion_length": 12.765625, "epoch": 0.4182582793061153, "grad_norm": 17.62833798934218, "kl": 0.16796875, "learning_rate": 5.819169441037323e-07, "loss": 0.0231, "reward": 1.8179621696472168, "reward_std": 0.18188484013080597, "rewards/accuracy_reward_stage2": 0.8335871696472168, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2387 }, { "completion_length": 12.96875, "epoch": 0.41843350271596286, "grad_norm": 18.760679579906835, "kl": 0.09765625, "learning_rate": 5.817417206938847e-07, "loss": 0.0392, "reward": 1.4825525283813477, "reward_std": 0.19686242938041687, "rewards/accuracy_reward_stage2": 0.6075524687767029, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2388 }, { "completion_length": 9.125, "epoch": 0.4186087261258104, "grad_norm": 16.455958478420655, "kl": 0.0208740234375, "learning_rate": 5.815664972840371e-07, "loss": 0.0083, "reward": 1.5572917461395264, "reward_std": 0.062747523188591, "rewards/accuracy_reward_stage2": 0.5572916865348816, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2389 }, { "completion_length": 17.234375, "epoch": 0.41878394953565795, "grad_norm": 21.767371412439168, "kl": 0.177734375, "learning_rate": 5.813912738741895e-07, "loss": -0.022, "reward": 1.750274419784546, "reward_std": 0.2638322710990906, "rewards/accuracy_reward_stage2": 0.7971494793891907, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2390 }, { "completion_length": 7.15625, "epoch": 0.4189591729455055, "grad_norm": 19.398944510953225, "kl": 0.09716796875, "learning_rate": 5.81216050464342e-07, "loss": 0.0388, "reward": 1.7229228019714355, "reward_std": 0.1554725468158722, "rewards/accuracy_reward_stage2": 0.7229227423667908, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2391 }, { "completion_length": 11.984375, "epoch": 0.4191343963553531, "grad_norm": 25.80667769908971, "kl": 0.1220703125, "learning_rate": 5.810408270544944e-07, "loss": 0.0489, "reward": 1.493762493133545, "reward_std": 0.32909145951271057, "rewards/accuracy_reward_stage2": 0.6187624335289001, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2392 }, { "completion_length": 9.6875, "epoch": 0.41930961976520065, "grad_norm": 24.204346190531403, "kl": 0.2890625, "learning_rate": 5.808656036446469e-07, "loss": 0.1156, "reward": 1.220840334892273, "reward_std": 0.378046452999115, "rewards/accuracy_reward_stage2": 0.595840334892273, "rewards/format_reward_stage1_pointerpad": 0.625, "scores/accuracy_reward_stage2": 0.625, "step": 2393 }, { "completion_length": 20.84375, "epoch": 0.4194848431750482, "grad_norm": 27.14247331797663, "kl": 0.12353515625, "learning_rate": 5.806903802347993e-07, "loss": 0.0053, "reward": 1.4899253845214844, "reward_std": 0.243827223777771, "rewards/accuracy_reward_stage2": 0.6305502653121948, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2394 }, { "completion_length": 11.125, "epoch": 0.41966006658489574, "grad_norm": 21.81038085160431, "kl": 0.2138671875, "learning_rate": 5.805151568249518e-07, "loss": 0.0856, "reward": 1.4024500846862793, "reward_std": 0.35116198658943176, "rewards/accuracy_reward_stage2": 0.5274500846862793, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2395 }, { "completion_length": 17.34375, "epoch": 0.4198352899947433, "grad_norm": 24.3348961842837, "kl": 0.027099609375, "learning_rate": 5.803399334151043e-07, "loss": -0.0334, "reward": 1.5152560472488403, "reward_std": 0.20777665078639984, "rewards/accuracy_reward_stage2": 0.6558809876441956, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2396 }, { "completion_length": 8.40625, "epoch": 0.42001051340459084, "grad_norm": 22.865852095520676, "kl": 0.06640625, "learning_rate": 5.801647100052566e-07, "loss": -0.007, "reward": 1.7035300731658936, "reward_std": 0.2735273838043213, "rewards/accuracy_reward_stage2": 0.7191551327705383, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2397 }, { "completion_length": 7.6875, "epoch": 0.4201857368144384, "grad_norm": 12.864278552382983, "kl": 0.0267333984375, "learning_rate": 5.799894865954091e-07, "loss": 0.0107, "reward": 1.5834779739379883, "reward_std": 0.10687437653541565, "rewards/accuracy_reward_stage2": 0.5834779739379883, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2398 }, { "completion_length": 10.234375, "epoch": 0.420360960224286, "grad_norm": 14.545711269406413, "kl": 0.031494140625, "learning_rate": 5.798142631855616e-07, "loss": 0.0126, "reward": 1.5031486749649048, "reward_std": 0.15417417883872986, "rewards/accuracy_reward_stage2": 0.6281486749649048, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2399 }, { "completion_length": 11.40625, "epoch": 0.42053618363413353, "grad_norm": 21.409633551926984, "kl": 0.0791015625, "learning_rate": 5.79639039775714e-07, "loss": 0.0028, "reward": 1.5188215970993042, "reward_std": 0.26314669847488403, "rewards/accuracy_reward_stage2": 0.5344465970993042, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2400 }, { "completion_length": 9.671875, "epoch": 0.4207114070439811, "grad_norm": 19.715486362642697, "kl": 0.06591796875, "learning_rate": 5.794638163658665e-07, "loss": 0.0264, "reward": 1.4238401651382446, "reward_std": 0.18752865493297577, "rewards/accuracy_reward_stage2": 0.42384013533592224, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2401 }, { "completion_length": 8.671875, "epoch": 0.4208866304538286, "grad_norm": 26.864964821679248, "kl": 0.2158203125, "learning_rate": 5.792885929560189e-07, "loss": 0.0476, "reward": 1.5920884609222412, "reward_std": 0.20743471384048462, "rewards/accuracy_reward_stage2": 0.607713520526886, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2402 }, { "completion_length": 7.1875, "epoch": 0.42106185386367617, "grad_norm": 19.490899661332257, "kl": 0.111328125, "learning_rate": 5.791133695461713e-07, "loss": 0.0059, "reward": 1.6936674118041992, "reward_std": 0.1947699785232544, "rewards/accuracy_reward_stage2": 0.7092924118041992, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2403 }, { "completion_length": 8.515625, "epoch": 0.4212370772735237, "grad_norm": 18.196450672215533, "kl": 0.138671875, "learning_rate": 5.789381461363238e-07, "loss": -0.0328, "reward": 1.6714731454849243, "reward_std": 0.287492960691452, "rewards/accuracy_reward_stage2": 0.7027231454849243, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2404 }, { "completion_length": 14.765625, "epoch": 0.4214123006833713, "grad_norm": 12.377034503915139, "kl": 0.01177978515625, "learning_rate": 5.787629227264762e-07, "loss": 0.0047, "reward": 1.6268939971923828, "reward_std": 0.11237125098705292, "rewards/accuracy_reward_stage2": 0.751893937587738, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2405 }, { "completion_length": 11.703125, "epoch": 0.42158752409321887, "grad_norm": 22.596957861242434, "kl": 0.1015625, "learning_rate": 5.785876993166287e-07, "loss": 0.0407, "reward": 1.673307180404663, "reward_std": 0.26322025060653687, "rewards/accuracy_reward_stage2": 0.6733071804046631, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2406 }, { "completion_length": 10.765625, "epoch": 0.4217627475030664, "grad_norm": 10.012152757261067, "kl": 0.06640625, "learning_rate": 5.784124759067811e-07, "loss": 0.0266, "reward": 1.777231216430664, "reward_std": 0.07235102355480194, "rewards/accuracy_reward_stage2": 0.7772312760353088, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2407 }, { "completion_length": 33.0625, "epoch": 0.42193797091291396, "grad_norm": 22.549033708865686, "kl": 0.06396484375, "learning_rate": 5.782372524969335e-07, "loss": -0.0232, "reward": 1.5575335025787354, "reward_std": 0.25364547967910767, "rewards/accuracy_reward_stage2": 0.5887835025787354, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2408 }, { "completion_length": 10.1875, "epoch": 0.4221131943227615, "grad_norm": 16.69878136588533, "kl": 0.11328125, "learning_rate": 5.78062029087086e-07, "loss": 0.0012, "reward": 1.7619128227233887, "reward_std": 0.23174157738685608, "rewards/accuracy_reward_stage2": 0.7775378227233887, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2409 }, { "completion_length": 8.5625, "epoch": 0.42228841773260906, "grad_norm": 19.92237217185688, "kl": 0.1728515625, "learning_rate": 5.778868056772384e-07, "loss": 0.025, "reward": 1.4769458770751953, "reward_std": 0.22557707130908966, "rewards/accuracy_reward_stage2": 0.4925709366798401, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2410 }, { "completion_length": 10.265625, "epoch": 0.42246364114245666, "grad_norm": 21.942952623213802, "kl": 0.056884765625, "learning_rate": 5.777115822673909e-07, "loss": 0.0227, "reward": 1.838803768157959, "reward_std": 0.1688080132007599, "rewards/accuracy_reward_stage2": 0.838803768157959, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2411 }, { "completion_length": 11.25, "epoch": 0.4226388645523042, "grad_norm": 16.68553099657329, "kl": 0.07958984375, "learning_rate": 5.775363588575434e-07, "loss": -0.0122, "reward": 1.596681833267212, "reward_std": 0.20045030117034912, "rewards/accuracy_reward_stage2": 0.6123068332672119, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2412 }, { "completion_length": 9.546875, "epoch": 0.42281408796215175, "grad_norm": 19.458085509509655, "kl": 0.2001953125, "learning_rate": 5.773611354476958e-07, "loss": 0.0078, "reward": 1.3699358701705933, "reward_std": 0.256511926651001, "rewards/accuracy_reward_stage2": 0.4011858403682709, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2413 }, { "completion_length": 13.25, "epoch": 0.4229893113719993, "grad_norm": 21.411986512608394, "kl": 0.2060546875, "learning_rate": 5.771859120378483e-07, "loss": 0.0826, "reward": 1.5075629949569702, "reward_std": 0.20391272008419037, "rewards/accuracy_reward_stage2": 0.632563054561615, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2414 }, { "completion_length": 17.734375, "epoch": 0.42316453478184685, "grad_norm": 30.774292690055958, "kl": 0.1767578125, "learning_rate": 5.770106886280008e-07, "loss": 0.0706, "reward": 1.3399405479431152, "reward_std": 0.2806154489517212, "rewards/accuracy_reward_stage2": 0.46494060754776, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2415 }, { "completion_length": 7.5, "epoch": 0.4233397581916944, "grad_norm": 14.927647667623011, "kl": 0.0595703125, "learning_rate": 5.768354652181531e-07, "loss": -0.0204, "reward": 1.621319055557251, "reward_std": 0.12363035976886749, "rewards/accuracy_reward_stage2": 0.636944055557251, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2416 }, { "completion_length": 10.6875, "epoch": 0.42351498160154194, "grad_norm": 20.65694451978839, "kl": 0.087890625, "learning_rate": 5.766602418083055e-07, "loss": 0.0352, "reward": 1.674845814704895, "reward_std": 0.1950603872537613, "rewards/accuracy_reward_stage2": 0.6748457551002502, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2417 }, { "completion_length": 10.671875, "epoch": 0.42369020501138954, "grad_norm": 19.60176983232829, "kl": 0.0908203125, "learning_rate": 5.764850183984579e-07, "loss": 0.0234, "reward": 1.410792350769043, "reward_std": 0.20295007526874542, "rewards/accuracy_reward_stage2": 0.4264172911643982, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2418 }, { "completion_length": 17.28125, "epoch": 0.4238654284212371, "grad_norm": 15.447747709824291, "kl": 0.029296875, "learning_rate": 5.763097949886104e-07, "loss": 0.0117, "reward": 1.539503812789917, "reward_std": 0.120786651968956, "rewards/accuracy_reward_stage2": 0.539503812789917, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2419 }, { "completion_length": 12.53125, "epoch": 0.42404065183108464, "grad_norm": 26.228768769751692, "kl": 0.1552734375, "learning_rate": 5.761345715787629e-07, "loss": 0.0285, "reward": 1.535796880722046, "reward_std": 0.2801082730293274, "rewards/accuracy_reward_stage2": 0.5514217615127563, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2420 }, { "completion_length": 29.546875, "epoch": 0.4242158752409322, "grad_norm": 19.516174382115324, "kl": 0.12890625, "learning_rate": 5.759593481689153e-07, "loss": 0.0075, "reward": 1.733359456062317, "reward_std": 0.17085593938827515, "rewards/accuracy_reward_stage2": 0.7489844560623169, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2421 }, { "completion_length": 14.4375, "epoch": 0.42439109865077973, "grad_norm": 19.17537638480682, "kl": 0.091796875, "learning_rate": 5.757841247590678e-07, "loss": 0.0368, "reward": 1.246988296508789, "reward_std": 0.1498282104730606, "rewards/accuracy_reward_stage2": 0.49698832631111145, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2422 }, { "completion_length": 11.703125, "epoch": 0.4245663220606273, "grad_norm": 23.20436750248409, "kl": 0.0751953125, "learning_rate": 5.756089013492203e-07, "loss": -0.0139, "reward": 1.7541325092315674, "reward_std": 0.22958284616470337, "rewards/accuracy_reward_stage2": 0.7697575092315674, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2423 }, { "completion_length": 8.96875, "epoch": 0.4247415454704749, "grad_norm": 21.20783789239163, "kl": 0.1357421875, "learning_rate": 5.754336779393727e-07, "loss": 0.0542, "reward": 1.7458889484405518, "reward_std": 0.29691436886787415, "rewards/accuracy_reward_stage2": 0.7458890080451965, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2424 }, { "completion_length": 16.796875, "epoch": 0.4249167688803224, "grad_norm": 23.664822906611626, "kl": 0.24609375, "learning_rate": 5.752584545295252e-07, "loss": 0.0985, "reward": 1.4297152757644653, "reward_std": 0.28032106161117554, "rewards/accuracy_reward_stage2": 0.5547152161598206, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2425 }, { "completion_length": 17.390625, "epoch": 0.42509199229017, "grad_norm": 15.369191055475348, "kl": 0.037109375, "learning_rate": 5.750832311196776e-07, "loss": 0.0148, "reward": 1.5607839822769165, "reward_std": 0.09787797927856445, "rewards/accuracy_reward_stage2": 0.6857839822769165, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2426 }, { "completion_length": 19.46875, "epoch": 0.4252672157000175, "grad_norm": 24.312044808358525, "kl": 0.1904296875, "learning_rate": 5.7490800770983e-07, "loss": 0.076, "reward": 1.4541585445404053, "reward_std": 0.1744290292263031, "rewards/accuracy_reward_stage2": 0.7041586637496948, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2427 }, { "completion_length": 17.9375, "epoch": 0.42544243910986507, "grad_norm": 23.50930204251329, "kl": 0.1923828125, "learning_rate": 5.747327842999824e-07, "loss": 0.0771, "reward": 1.4003291130065918, "reward_std": 0.22130905091762543, "rewards/accuracy_reward_stage2": 0.6503292322158813, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2428 }, { "completion_length": 9.015625, "epoch": 0.4256176625197126, "grad_norm": 22.13947088990283, "kl": 0.1806640625, "learning_rate": 5.745575608901348e-07, "loss": 0.0161, "reward": 1.6461684703826904, "reward_std": 0.2596883475780487, "rewards/accuracy_reward_stage2": 0.6774183511734009, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2429 }, { "completion_length": 11.3125, "epoch": 0.4257928859295602, "grad_norm": 22.035671063081804, "kl": 0.1474609375, "learning_rate": 5.743823374802873e-07, "loss": 0.015, "reward": 1.395911693572998, "reward_std": 0.24767053127288818, "rewards/accuracy_reward_stage2": 0.41153672337532043, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2430 }, { "completion_length": 21.15625, "epoch": 0.42596810933940776, "grad_norm": 20.96133435505049, "kl": 0.1572265625, "learning_rate": 5.742071140704398e-07, "loss": 0.0628, "reward": 1.5663808584213257, "reward_std": 0.1595151424407959, "rewards/accuracy_reward_stage2": 0.6913807988166809, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2431 }, { "completion_length": 7.875, "epoch": 0.4261433327492553, "grad_norm": 28.340403516253645, "kl": 0.0498046875, "learning_rate": 5.740318906605922e-07, "loss": -0.0241, "reward": 1.5094510316848755, "reward_std": 0.2710912227630615, "rewards/accuracy_reward_stage2": 0.5250759720802307, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2432 }, { "completion_length": 13.765625, "epoch": 0.42631855615910286, "grad_norm": 19.890579026197372, "kl": 0.039306640625, "learning_rate": 5.738566672507447e-07, "loss": 0.0158, "reward": 1.6429111957550049, "reward_std": 0.16243639588356018, "rewards/accuracy_reward_stage2": 0.6429111957550049, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2433 }, { "completion_length": 12.59375, "epoch": 0.4264937795689504, "grad_norm": 22.00464367213492, "kl": 0.1494140625, "learning_rate": 5.736814438408971e-07, "loss": -0.0047, "reward": 1.474339246749878, "reward_std": 0.2699277997016907, "rewards/accuracy_reward_stage2": 0.5055892467498779, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2434 }, { "completion_length": 11.609375, "epoch": 0.42666900297879795, "grad_norm": 32.41247879005575, "kl": 0.279296875, "learning_rate": 5.735062204310496e-07, "loss": 0.0677, "reward": 1.5541329383850098, "reward_std": 0.13045634329319, "rewards/accuracy_reward_stage2": 0.6947579383850098, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2435 }, { "completion_length": 10.5, "epoch": 0.4268442263886455, "grad_norm": 19.876951523107635, "kl": 0.046630859375, "learning_rate": 5.733309970212021e-07, "loss": -0.0256, "reward": 1.3027304410934448, "reward_std": 0.21071302890777588, "rewards/accuracy_reward_stage2": 0.3183554708957672, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2436 }, { "completion_length": 9.25, "epoch": 0.4270194497984931, "grad_norm": 18.20379230867892, "kl": 0.01953125, "learning_rate": 5.731557736113544e-07, "loss": 0.0078, "reward": 1.5104167461395264, "reward_std": 0.25927814841270447, "rewards/accuracy_reward_stage2": 0.5104166269302368, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2437 }, { "completion_length": 5.984375, "epoch": 0.42719467320834065, "grad_norm": 16.66665150309482, "kl": 0.0673828125, "learning_rate": 5.729805502015069e-07, "loss": -0.002, "reward": 1.6153621673583984, "reward_std": 0.2000160962343216, "rewards/accuracy_reward_stage2": 0.6309871673583984, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2438 }, { "completion_length": 8.515625, "epoch": 0.4273698966181882, "grad_norm": 15.293604925338169, "kl": 0.0712890625, "learning_rate": 5.728053267916594e-07, "loss": 0.0286, "reward": 1.5463223457336426, "reward_std": 0.12554559111595154, "rewards/accuracy_reward_stage2": 0.5463222861289978, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2439 }, { "completion_length": 7.78125, "epoch": 0.42754512002803574, "grad_norm": 12.185368006882296, "kl": 0.053955078125, "learning_rate": 5.726301033818118e-07, "loss": 0.0216, "reward": 1.6412497758865356, "reward_std": 0.06281961500644684, "rewards/accuracy_reward_stage2": 0.6412497758865356, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2440 }, { "completion_length": 8.6875, "epoch": 0.4277203434378833, "grad_norm": 15.494160366052823, "kl": 0.125, "learning_rate": 5.724548799719642e-07, "loss": 0.05, "reward": 1.5815420150756836, "reward_std": 0.10027365386486053, "rewards/accuracy_reward_stage2": 0.5815420150756836, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2441 }, { "completion_length": 10.921875, "epoch": 0.42789556684773083, "grad_norm": 18.107070394943033, "kl": 0.07080078125, "learning_rate": 5.722796565621166e-07, "loss": 0.0283, "reward": 1.3592140674591064, "reward_std": 0.12985925376415253, "rewards/accuracy_reward_stage2": 0.4842139780521393, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2442 }, { "completion_length": 9.21875, "epoch": 0.42807079025757844, "grad_norm": 26.157579176268243, "kl": 0.1611328125, "learning_rate": 5.721044331522691e-07, "loss": 0.0689, "reward": 1.3738298416137695, "reward_std": 0.31853947043418884, "rewards/accuracy_reward_stage2": 0.63945472240448, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2443 }, { "completion_length": 13.328125, "epoch": 0.428246013667426, "grad_norm": 44.37524748602229, "kl": 0.283203125, "learning_rate": 5.719292097424216e-07, "loss": 0.0741, "reward": 1.4276965856552124, "reward_std": 0.2782999873161316, "rewards/accuracy_reward_stage2": 0.5683215260505676, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2444 }, { "completion_length": 11.09375, "epoch": 0.42842123707727353, "grad_norm": 23.422889478777957, "kl": 0.208984375, "learning_rate": 5.71753986332574e-07, "loss": 0.0458, "reward": 1.4923583269119263, "reward_std": 0.24954932928085327, "rewards/accuracy_reward_stage2": 0.6329833269119263, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2445 }, { "completion_length": 7.0, "epoch": 0.4285964604871211, "grad_norm": 17.028429839600122, "kl": 0.08251953125, "learning_rate": 5.715787629227265e-07, "loss": 0.0041, "reward": 1.652631163597107, "reward_std": 0.1518530398607254, "rewards/accuracy_reward_stage2": 0.6682562232017517, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2446 }, { "completion_length": 11.71875, "epoch": 0.4287716838969686, "grad_norm": 18.844946250417486, "kl": 0.259765625, "learning_rate": 5.714035395128789e-07, "loss": -0.0017, "reward": 1.7519108057022095, "reward_std": 0.26788169145584106, "rewards/accuracy_reward_stage2": 0.7987857460975647, "rewards/format_reward_stage1_pointerpad": 0.953125, "scores/accuracy_reward_stage2": 0.953125, "step": 2447 }, { "completion_length": 9.390625, "epoch": 0.42894690730681617, "grad_norm": 16.434159640612602, "kl": 0.11669921875, "learning_rate": 5.712283161030313e-07, "loss": -0.0362, "reward": 1.7319084405899048, "reward_std": 0.1767083704471588, "rewards/accuracy_reward_stage2": 0.7631585001945496, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2448 }, { "completion_length": 9.5, "epoch": 0.4291221307166637, "grad_norm": 21.436378957860942, "kl": 0.10205078125, "learning_rate": 5.710530926931838e-07, "loss": 0.0407, "reward": 1.6460349559783936, "reward_std": 0.09250953048467636, "rewards/accuracy_reward_stage2": 0.6460349559783936, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2449 }, { "completion_length": 9.921875, "epoch": 0.4292973541265113, "grad_norm": 28.61624407275498, "kl": 0.12158203125, "learning_rate": 5.708778692833362e-07, "loss": 0.0487, "reward": 1.5566973686218262, "reward_std": 0.3204444646835327, "rewards/accuracy_reward_stage2": 0.5566972494125366, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2450 }, { "completion_length": 26.046875, "epoch": 0.42947257753635887, "grad_norm": 20.448100007493757, "kl": 0.068359375, "learning_rate": 5.707026458734887e-07, "loss": 0.0274, "reward": 1.34342622756958, "reward_std": 0.23968136310577393, "rewards/accuracy_reward_stage2": 0.34342628717422485, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2451 }, { "completion_length": 9.953125, "epoch": 0.4296478009462064, "grad_norm": 18.723259657679318, "kl": 0.1201171875, "learning_rate": 5.705274224636412e-07, "loss": 0.048, "reward": 1.5735918283462524, "reward_std": 0.2153104841709137, "rewards/accuracy_reward_stage2": 0.6985918283462524, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2452 }, { "completion_length": 11.15625, "epoch": 0.42982302435605396, "grad_norm": 21.985458003635763, "kl": 0.1123046875, "learning_rate": 5.703521990537936e-07, "loss": -0.017, "reward": 1.6417429447174072, "reward_std": 0.28753870725631714, "rewards/accuracy_reward_stage2": 0.6729929447174072, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2453 }, { "completion_length": 9.296875, "epoch": 0.4299982477659015, "grad_norm": 20.593440273884532, "kl": 0.032958984375, "learning_rate": 5.70176975643946e-07, "loss": 0.0132, "reward": 1.4509015083312988, "reward_std": 0.2628650367259979, "rewards/accuracy_reward_stage2": 0.45090147852897644, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2454 }, { "completion_length": 11.484375, "epoch": 0.43017347117574906, "grad_norm": 18.415737733768058, "kl": 0.1259765625, "learning_rate": 5.700017522340984e-07, "loss": 0.0504, "reward": 1.5575731992721558, "reward_std": 0.35987773537635803, "rewards/accuracy_reward_stage2": 0.6825731992721558, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2455 }, { "completion_length": 8.359375, "epoch": 0.43034869458559666, "grad_norm": 14.98305908133991, "kl": 0.042724609375, "learning_rate": 5.698265288242509e-07, "loss": 0.017, "reward": 1.908919095993042, "reward_std": 0.11924922466278076, "rewards/accuracy_reward_stage2": 0.9089190363883972, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2456 }, { "completion_length": 9.015625, "epoch": 0.4305239179954442, "grad_norm": 15.713865925408056, "kl": 0.05908203125, "learning_rate": 5.696513054144033e-07, "loss": 0.0237, "reward": 1.5216166973114014, "reward_std": 0.14264705777168274, "rewards/accuracy_reward_stage2": 0.6466167569160461, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2457 }, { "completion_length": 10.5625, "epoch": 0.43069914140529175, "grad_norm": 19.880669874137872, "kl": 0.140625, "learning_rate": 5.694760820045557e-07, "loss": 0.0195, "reward": 1.5184917449951172, "reward_std": 0.30239593982696533, "rewards/accuracy_reward_stage2": 0.5341167449951172, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2458 }, { "completion_length": 7.671875, "epoch": 0.4308743648151393, "grad_norm": 16.329410163714197, "kl": 0.044921875, "learning_rate": 5.693008585947082e-07, "loss": 0.018, "reward": 1.787500023841858, "reward_std": 0.07550577819347382, "rewards/accuracy_reward_stage2": 0.7874999642372131, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2459 }, { "completion_length": 11.140625, "epoch": 0.43104958822498685, "grad_norm": 19.381287915776806, "kl": 0.12109375, "learning_rate": 5.691256351848607e-07, "loss": 0.0482, "reward": 1.1594098806381226, "reward_std": 0.1518731415271759, "rewards/accuracy_reward_stage2": 0.40940988063812256, "rewards/format_reward_stage1_pointerpad": 0.75, "scores/accuracy_reward_stage2": 0.75, "step": 2460 }, { "completion_length": 10.109375, "epoch": 0.4312248116348344, "grad_norm": 11.975975628094352, "kl": 0.12060546875, "learning_rate": 5.689504117750131e-07, "loss": 0.004, "reward": 1.5058554410934448, "reward_std": 0.10613877326250076, "rewards/accuracy_reward_stage2": 0.7714804410934448, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2461 }, { "completion_length": 10.25, "epoch": 0.431400035044682, "grad_norm": 20.121126160568057, "kl": 0.091796875, "learning_rate": 5.687751883651656e-07, "loss": 0.0367, "reward": 1.7084333896636963, "reward_std": 0.22800734639167786, "rewards/accuracy_reward_stage2": 0.708433210849762, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2462 }, { "completion_length": 9.28125, "epoch": 0.43157525845452954, "grad_norm": 22.460435213851525, "kl": 0.134765625, "learning_rate": 5.685999649553181e-07, "loss": -0.0247, "reward": 1.7333886623382568, "reward_std": 0.2131943702697754, "rewards/accuracy_reward_stage2": 0.7646386623382568, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2463 }, { "completion_length": 8.015625, "epoch": 0.4317504818643771, "grad_norm": 14.680107507234679, "kl": 0.10400390625, "learning_rate": 5.684247415454705e-07, "loss": 0.0135, "reward": 1.5010437965393066, "reward_std": 0.20351773500442505, "rewards/accuracy_reward_stage2": 0.5166687369346619, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2464 }, { "completion_length": 11.21875, "epoch": 0.43192570527422464, "grad_norm": 19.2462032033978, "kl": 0.05078125, "learning_rate": 5.68249518135623e-07, "loss": -0.0113, "reward": 1.697539210319519, "reward_std": 0.24802234768867493, "rewards/accuracy_reward_stage2": 0.713164210319519, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2465 }, { "completion_length": 24.890625, "epoch": 0.4321009286840722, "grad_norm": 20.761453619278438, "kl": 0.058837890625, "learning_rate": 5.680742947257752e-07, "loss": 0.0007, "reward": 1.7051244974136353, "reward_std": 0.12879568338394165, "rewards/accuracy_reward_stage2": 0.7207494974136353, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2466 }, { "completion_length": 11.5625, "epoch": 0.43227615209391973, "grad_norm": 23.788088443112787, "kl": 0.060302734375, "learning_rate": 5.678990713159277e-07, "loss": -0.02, "reward": 1.5080852508544922, "reward_std": 0.24169126152992249, "rewards/accuracy_reward_stage2": 0.5237102508544922, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2467 }, { "completion_length": 7.53125, "epoch": 0.4324513755037673, "grad_norm": 17.387253813940422, "kl": 0.11328125, "learning_rate": 5.677238479060802e-07, "loss": 0.0452, "reward": 1.4800641536712646, "reward_std": 0.19356288015842438, "rewards/accuracy_reward_stage2": 0.6050641536712646, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2468 }, { "completion_length": 13.421875, "epoch": 0.4326265989136149, "grad_norm": 325.77192228254404, "kl": 1.84375, "learning_rate": 5.675486244962326e-07, "loss": 0.707, "reward": 1.5729882717132568, "reward_std": 0.26776865124702454, "rewards/accuracy_reward_stage2": 0.7136133313179016, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2469 }, { "completion_length": 10.703125, "epoch": 0.4328018223234624, "grad_norm": 16.006993697155597, "kl": 0.03564453125, "learning_rate": 5.673734010863851e-07, "loss": 0.0046, "reward": 1.266721487045288, "reward_std": 0.17739826440811157, "rewards/accuracy_reward_stage2": 0.5323464870452881, "rewards/format_reward_stage1_pointerpad": 0.734375, "scores/accuracy_reward_stage2": 0.734375, "step": 2470 }, { "completion_length": 9.921875, "epoch": 0.43297704573331, "grad_norm": 18.505760660623352, "kl": 0.1484375, "learning_rate": 5.671981776765375e-07, "loss": 0.0152, "reward": 1.5191692113876343, "reward_std": 0.1488463580608368, "rewards/accuracy_reward_stage2": 0.5347942113876343, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2471 }, { "completion_length": 6.875, "epoch": 0.4331522691431575, "grad_norm": 17.408087882955247, "kl": 0.1328125, "learning_rate": 5.6702295426669e-07, "loss": -0.0542, "reward": 1.3365036249160767, "reward_std": 0.3592662811279297, "rewards/accuracy_reward_stage2": 0.5083786249160767, "rewards/format_reward_stage1_pointerpad": 0.828125, "scores/accuracy_reward_stage2": 0.828125, "step": 2472 }, { "completion_length": 7.1875, "epoch": 0.43332749255300507, "grad_norm": 19.829436669177255, "kl": 0.083984375, "learning_rate": 5.668477308568425e-07, "loss": 0.0335, "reward": 1.5234953165054321, "reward_std": 0.25521036982536316, "rewards/accuracy_reward_stage2": 0.6484953165054321, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2473 }, { "completion_length": 8.984375, "epoch": 0.4335027159628526, "grad_norm": 19.423133602621352, "kl": 0.111328125, "learning_rate": 5.666725074469949e-07, "loss": 0.0157, "reward": 1.6337876319885254, "reward_std": 0.1884712278842926, "rewards/accuracy_reward_stage2": 0.6494127511978149, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2474 }, { "completion_length": 10.375, "epoch": 0.4336779393727002, "grad_norm": 18.298341038325223, "kl": 0.068359375, "learning_rate": 5.664972840371474e-07, "loss": 0.0274, "reward": 1.5837030410766602, "reward_std": 0.1801297813653946, "rewards/accuracy_reward_stage2": 0.7087030410766602, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2475 }, { "completion_length": 8.390625, "epoch": 0.43385316278254776, "grad_norm": 15.754352909022117, "kl": 0.030029296875, "learning_rate": 5.663220606272999e-07, "loss": 0.012, "reward": 1.860360860824585, "reward_std": 0.1001751571893692, "rewards/accuracy_reward_stage2": 0.8603609204292297, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2476 }, { "completion_length": 8.671875, "epoch": 0.4340283861923953, "grad_norm": 18.118293617702054, "kl": 0.166015625, "learning_rate": 5.661468372174522e-07, "loss": -0.0209, "reward": 1.7263150215148926, "reward_std": 0.18398618698120117, "rewards/accuracy_reward_stage2": 0.7575650215148926, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2477 }, { "completion_length": 10.671875, "epoch": 0.43420360960224286, "grad_norm": 12.758581469841406, "kl": 0.0693359375, "learning_rate": 5.659716138076047e-07, "loss": 0.0278, "reward": 1.5945075750350952, "reward_std": 0.07799089699983597, "rewards/accuracy_reward_stage2": 0.5945075750350952, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2478 }, { "completion_length": 7.53125, "epoch": 0.4343788330120904, "grad_norm": 24.724876782895745, "kl": 0.08056640625, "learning_rate": 5.65796390397757e-07, "loss": -0.0011, "reward": 1.3923568725585938, "reward_std": 0.30109161138534546, "rewards/accuracy_reward_stage2": 0.4079819321632385, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2479 }, { "completion_length": 17.015625, "epoch": 0.43455405642193795, "grad_norm": 21.30851433712103, "kl": 0.103515625, "learning_rate": 5.656211669879095e-07, "loss": 0.0415, "reward": 1.6417262554168701, "reward_std": 0.16019636392593384, "rewards/accuracy_reward_stage2": 0.6417261362075806, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2480 }, { "completion_length": 11.625, "epoch": 0.43472927983178555, "grad_norm": 16.069909922504603, "kl": 0.1123046875, "learning_rate": 5.65445943578062e-07, "loss": 0.0449, "reward": 1.7745733261108398, "reward_std": 0.10483638942241669, "rewards/accuracy_reward_stage2": 0.7745733261108398, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2481 }, { "completion_length": 7.78125, "epoch": 0.4349045032416331, "grad_norm": 18.202272906284808, "kl": 0.04052734375, "learning_rate": 5.652707201682144e-07, "loss": 0.0162, "reward": 1.6692678928375244, "reward_std": 0.23714250326156616, "rewards/accuracy_reward_stage2": 0.6692679524421692, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2482 }, { "completion_length": 10.40625, "epoch": 0.43507972665148065, "grad_norm": 19.22569853451002, "kl": 0.1455078125, "learning_rate": 5.650954967583669e-07, "loss": 0.0584, "reward": 1.2947709560394287, "reward_std": 0.11568181961774826, "rewards/accuracy_reward_stage2": 0.41977089643478394, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2483 }, { "completion_length": 9.59375, "epoch": 0.4352549500613282, "grad_norm": 18.60676009657278, "kl": 0.07470703125, "learning_rate": 5.649202733485194e-07, "loss": 0.0298, "reward": 1.6659901142120361, "reward_std": 0.21120445430278778, "rewards/accuracy_reward_stage2": 0.6659901738166809, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2484 }, { "completion_length": 12.375, "epoch": 0.43543017347117574, "grad_norm": 17.448368004805243, "kl": 0.10107421875, "learning_rate": 5.647450499386718e-07, "loss": 0.0404, "reward": 1.3877105712890625, "reward_std": 0.17023152112960815, "rewards/accuracy_reward_stage2": 0.5127106308937073, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2485 }, { "completion_length": 7.734375, "epoch": 0.4356053968810233, "grad_norm": 27.701798930911647, "kl": 0.05615234375, "learning_rate": 5.645698265288243e-07, "loss": 0.0225, "reward": 1.686922550201416, "reward_std": 0.21318362653255463, "rewards/accuracy_reward_stage2": 0.6869224905967712, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2486 }, { "completion_length": 10.46875, "epoch": 0.43578062029087083, "grad_norm": 33.38221389414447, "kl": 0.298828125, "learning_rate": 5.643946031189766e-07, "loss": 0.0468, "reward": 1.533717393875122, "reward_std": 0.19223450124263763, "rewards/accuracy_reward_stage2": 0.5649674534797668, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2487 }, { "completion_length": 8.765625, "epoch": 0.43595584370071844, "grad_norm": 15.92423013091103, "kl": 0.357421875, "learning_rate": 5.642193797091291e-07, "loss": 0.1043, "reward": 1.5287775993347168, "reward_std": 0.21819572150707245, "rewards/accuracy_reward_stage2": 0.6694026589393616, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2488 }, { "completion_length": 11.125, "epoch": 0.436131067110566, "grad_norm": 24.990874315078415, "kl": 0.1318359375, "learning_rate": 5.640441562992816e-07, "loss": 0.0085, "reward": 1.7180120944976807, "reward_std": 0.2930488586425781, "rewards/accuracy_reward_stage2": 0.7336370944976807, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2489 }, { "completion_length": 14.5, "epoch": 0.43630629052041353, "grad_norm": 23.423213369547156, "kl": 0.09033203125, "learning_rate": 5.63868932889434e-07, "loss": 0.0363, "reward": 1.8398503065109253, "reward_std": 0.17344442009925842, "rewards/accuracy_reward_stage2": 0.8398503065109253, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2490 }, { "completion_length": 13.078125, "epoch": 0.4364815139302611, "grad_norm": 22.765187899068547, "kl": 0.06982421875, "learning_rate": 5.636937094795865e-07, "loss": -0.0055, "reward": 1.4840041399002075, "reward_std": 0.21035104990005493, "rewards/accuracy_reward_stage2": 0.6246291399002075, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2491 }, { "completion_length": 9.5625, "epoch": 0.4366567373401086, "grad_norm": 25.422366034224048, "kl": 0.205078125, "learning_rate": 5.635184860697389e-07, "loss": 0.082, "reward": 1.4929587841033936, "reward_std": 0.1953394114971161, "rewards/accuracy_reward_stage2": 0.6179587841033936, "rewards/format_reward_stage1_pointerpad": 0.875, "scores/accuracy_reward_stage2": 0.875, "step": 2492 }, { "completion_length": 11.84375, "epoch": 0.43683196074995617, "grad_norm": 23.39126913379409, "kl": 0.059326171875, "learning_rate": 5.633432626598913e-07, "loss": -0.02, "reward": 1.788152813911438, "reward_std": 0.23067131638526917, "rewards/accuracy_reward_stage2": 0.803777813911438, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2493 }, { "completion_length": 8.296875, "epoch": 0.4370071841598038, "grad_norm": 24.067458583341157, "kl": 0.07373046875, "learning_rate": 5.631680392500438e-07, "loss": 0.0295, "reward": 1.729994773864746, "reward_std": 0.18137015402317047, "rewards/accuracy_reward_stage2": 0.7299947142601013, "rewards/format_reward_stage1_pointerpad": 1.0, "scores/accuracy_reward_stage2": 1.0, "step": 2494 }, { "completion_length": 10.78125, "epoch": 0.4371824075696513, "grad_norm": 20.6936402781379, "kl": 0.1025390625, "learning_rate": 5.629928158401962e-07, "loss": -0.0031, "reward": 1.8095977306365967, "reward_std": 0.2006131410598755, "rewards/accuracy_reward_stage2": 0.8252226114273071, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2495 }, { "completion_length": 6.9375, "epoch": 0.43735763097949887, "grad_norm": 56.06477061667996, "kl": 0.5703125, "learning_rate": 5.628175924303486e-07, "loss": 0.183, "reward": 1.601702094078064, "reward_std": 0.1429881602525711, "rewards/accuracy_reward_stage2": 0.6173270344734192, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2496 }, { "completion_length": 10.421875, "epoch": 0.4375328543893464, "grad_norm": 18.460030194723316, "kl": 0.150390625, "learning_rate": 5.626423690205011e-07, "loss": 0.016, "reward": 1.5749945640563965, "reward_std": 0.23786082863807678, "rewards/accuracy_reward_stage2": 0.7156196236610413, "rewards/format_reward_stage1_pointerpad": 0.859375, "scores/accuracy_reward_stage2": 0.859375, "step": 2497 }, { "completion_length": 12.75, "epoch": 0.43770807779919396, "grad_norm": 13.524925919219523, "kl": 0.0751953125, "learning_rate": 5.624671456106535e-07, "loss": -0.0475, "reward": 1.4493248462677002, "reward_std": 0.1437452733516693, "rewards/accuracy_reward_stage2": 0.4805747866630554, "rewards/format_reward_stage1_pointerpad": 0.96875, "scores/accuracy_reward_stage2": 0.96875, "step": 2498 }, { "completion_length": 10.703125, "epoch": 0.4378833012090415, "grad_norm": 15.790480007846668, "kl": 0.08349609375, "learning_rate": 5.62291922200806e-07, "loss": -0.0109, "reward": 1.7373511791229248, "reward_std": 0.15441085398197174, "rewards/accuracy_reward_stage2": 0.7529761791229248, "rewards/format_reward_stage1_pointerpad": 0.984375, "scores/accuracy_reward_stage2": 0.984375, "step": 2499 }, { "completion_length": 8.3125, "epoch": 0.4380585246188891, "grad_norm": 19.997615203454874, "kl": 0.1591796875, "learning_rate": 5.621166987909585e-07, "loss": -0.0242, "reward": 1.714186668395996, "reward_std": 0.30740267038345337, "rewards/accuracy_reward_stage2": 0.8704366087913513, "rewards/format_reward_stage1_pointerpad": 0.84375, "scores/accuracy_reward_stage2": 0.84375, "step": 2500 } ], "logging_steps": 1.0, "max_steps": 5707, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }