diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,18688 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500, + "global_step": 565, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 240.77734375, + "completions/mean_terminated_length": 240.77734375, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.16800164990127087, + "epoch": 0.0017699115044247787, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26899288624647755, + "learning_rate": 0.0, + "loss": -0.003, + "num_tokens": 464071.0, + "reward": 0.48417970538139343, + "reward_std": 0.4741697609424591, + "rewards/execution_accuracy_EX/mean": 0.45703125, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9883767366409302, + "sampling/importance_sampling_ratio/min": 0.006748078390955925, + "sampling/sampling_logp_difference/max": 4.998497486114502, + "sampling/sampling_logp_difference/mean": 0.13759836554527283, + "step": 1 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 325.0, + "completions/max_terminated_length": 325.0, + "completions/mean_length": 196.4765625, + "completions/mean_terminated_length": 196.4765625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.1572934128344059, + "epoch": 0.0035398230088495575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43014753967539887, + "learning_rate": 1.7543859649122805e-08, + "loss": 0.0014, + "num_tokens": 977745.0, + "reward": 0.517578125, + "reward_std": 0.47587236762046814, + "rewards/execution_accuracy_EX/mean": 0.4921875, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9887896776199341, + "sampling/importance_sampling_ratio/min": 0.0009695081971585751, + "sampling/sampling_logp_difference/max": 6.938721656799316, + "sampling/sampling_logp_difference/mean": 0.13283054530620575, + "step": 2 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 229.14453125, + "completions/mean_terminated_length": 229.14453125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.1506793014705181, + "epoch": 0.005309734513274336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2460889331215385, + "learning_rate": 3.508771929824561e-08, + "loss": 0.0061, + "num_tokens": 1405414.0, + "reward": 0.5843750238418579, + "reward_std": 0.47219762206077576, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9899247884750366, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.12096783518791199, + "step": 3 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 204.56640625, + "completions/mean_terminated_length": 204.56640625, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.17143337801098824, + "epoch": 0.007079646017699115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16121964031093983, + "learning_rate": 5.2631578947368416e-08, + "loss": 0.0004, + "num_tokens": 2009431.0, + "reward": 0.47304683923721313, + "reward_std": 0.47307515144348145, + "rewards/execution_accuracy_EX/mean": 0.4453125, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9882920980453491, + "sampling/importance_sampling_ratio/min": 0.008775105699896812, + "sampling/sampling_logp_difference/max": 4.735836505889893, + "sampling/sampling_logp_difference/mean": 0.14046800136566162, + "step": 4 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 407.0, + "completions/max_terminated_length": 407.0, + "completions/mean_length": 222.2109375, + "completions/mean_terminated_length": 222.2109375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.1771502736955881, + "epoch": 0.008849557522123894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4317320322048489, + "learning_rate": 7.017543859649122e-08, + "loss": 0.0139, + "num_tokens": 2565437.0, + "reward": 0.7253906726837158, + "reward_std": 0.4315042495727539, + "rewards/execution_accuracy_EX/mean": 0.7109375, + "rewards/execution_accuracy_EX/std": 0.45421501994132996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9947786331176758, + "sampling/importance_sampling_ratio/min": 0.011136677116155624, + "sampling/sampling_logp_difference/max": 4.497511386871338, + "sampling/sampling_logp_difference/mean": 0.13672447204589844, + "step": 5 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 403.0, + "completions/max_terminated_length": 403.0, + "completions/mean_length": 212.046875, + "completions/mean_terminated_length": 212.046875, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.15770719200372696, + "epoch": 0.010619469026548672, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36976823638973716, + "learning_rate": 8.771929824561403e-08, + "loss": 0.0003, + "num_tokens": 3020473.0, + "reward": 0.45820313692092896, + "reward_std": 0.471201092004776, + "rewards/execution_accuracy_EX/mean": 0.4296875, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9911689758300781, + "sampling/importance_sampling_ratio/min": 0.014309810474514961, + "sampling/sampling_logp_difference/max": 4.246809959411621, + "sampling/sampling_logp_difference/mean": 0.1317376345396042, + "step": 6 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 214.2734375, + "completions/mean_terminated_length": 214.2734375, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.16596073657274246, + "epoch": 0.012389380530973451, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4258994461942591, + "learning_rate": 1.0526315789473683e-07, + "loss": 0.011, + "num_tokens": 3576879.0, + "reward": 0.6214843988418579, + "reward_std": 0.46600866317749023, + "rewards/execution_accuracy_EX/mean": 0.6015625, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.991350531578064, + "sampling/importance_sampling_ratio/min": 0.011154413223266602, + "sampling/sampling_logp_difference/max": 4.495920181274414, + "sampling/sampling_logp_difference/mean": 0.13485443592071533, + "step": 7 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 215.6171875, + "completions/mean_terminated_length": 215.6171875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.1521406676620245, + "epoch": 0.01415929203539823, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32626688667132286, + "learning_rate": 1.2280701754385964e-07, + "loss": -0.0054, + "num_tokens": 3908813.0, + "reward": 0.502734363079071, + "reward_std": 0.47540730237960815, + "rewards/execution_accuracy_EX/mean": 0.4765625, + "rewards/execution_accuracy_EX/std": 0.5004287362098694, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9880743026733398, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.13180431723594666, + "step": 8 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 528.0, + "completions/max_terminated_length": 528.0, + "completions/mean_length": 226.26953125, + "completions/mean_terminated_length": 226.26953125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.15583448484539986, + "epoch": 0.01592920353982301, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37532224908039835, + "learning_rate": 1.4035087719298244e-07, + "loss": 0.0151, + "num_tokens": 4292898.0, + "reward": 0.5843750238418579, + "reward_std": 0.47219759225845337, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9915869235992432, + "sampling/importance_sampling_ratio/min": 0.005292921327054501, + "sampling/sampling_logp_difference/max": 5.241384983062744, + "sampling/sampling_logp_difference/mean": 0.1297648698091507, + "step": 9 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 219.03125, + "completions/mean_terminated_length": 219.03125, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.1605638675391674, + "epoch": 0.017699115044247787, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32937675464045957, + "learning_rate": 1.5789473684210525e-07, + "loss": 0.0007, + "num_tokens": 4669290.0, + "reward": 0.4990234375, + "reward_std": 0.4752182364463806, + "rewards/execution_accuracy_EX/mean": 0.47265625, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9915338754653931, + "sampling/importance_sampling_ratio/min": 0.011154407635331154, + "sampling/sampling_logp_difference/max": 4.495920658111572, + "sampling/sampling_logp_difference/mean": 0.1286957561969757, + "step": 10 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 395.0, + "completions/max_terminated_length": 395.0, + "completions/mean_length": 215.46484375, + "completions/mean_terminated_length": 215.46484375, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.1649212446063757, + "epoch": 0.019469026548672566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39580334353692825, + "learning_rate": 1.7543859649122805e-07, + "loss": -0.0117, + "num_tokens": 5107425.0, + "reward": 0.36542966961860657, + "reward_std": 0.44827139377593994, + "rewards/execution_accuracy_EX/mean": 0.33203125, + "rewards/execution_accuracy_EX/std": 0.4718646705150604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9916530847549438, + "sampling/importance_sampling_ratio/min": 0.011154402047395706, + "sampling/sampling_logp_difference/max": 4.4959211349487305, + "sampling/sampling_logp_difference/mean": 0.13201645016670227, + "step": 11 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 396.0, + "completions/max_terminated_length": 396.0, + "completions/mean_length": 209.59765625, + "completions/mean_terminated_length": 209.59765625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.15810802951455116, + "epoch": 0.021238938053097345, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19899075420567763, + "learning_rate": 1.9298245614035086e-07, + "loss": -0.0015, + "num_tokens": 5582698.0, + "reward": 0.5732421875, + "reward_std": 0.47346949577331543, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9912006855010986, + "sampling/importance_sampling_ratio/min": 0.003199489787220955, + "sampling/sampling_logp_difference/max": 5.7447638511657715, + "sampling/sampling_logp_difference/mean": 0.1252116858959198, + "step": 12 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 452.0, + "completions/max_terminated_length": 452.0, + "completions/mean_length": 238.0703125, + "completions/mean_terminated_length": 238.0703125, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.18869901075959206, + "epoch": 0.023008849557522124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3439687256876209, + "learning_rate": 2.1052631578947366e-07, + "loss": 0.0054, + "num_tokens": 6027484.0, + "reward": 0.3505859375, + "reward_std": 0.44268524646759033, + "rewards/execution_accuracy_EX/mean": 0.31640625, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9924343824386597, + "sampling/importance_sampling_ratio/min": 0.008684132248163223, + "sampling/sampling_logp_difference/max": 4.746257781982422, + "sampling/sampling_logp_difference/mean": 0.1439538300037384, + "step": 13 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 225.69921875, + "completions/mean_terminated_length": 225.69921875, + "completions/min_length": 82.0, + "completions/min_terminated_length": 82.0, + "entropy": 0.15718093514442444, + "epoch": 0.024778761061946902, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4053142697711595, + "learning_rate": 2.2807017543859647e-07, + "loss": -0.0031, + "num_tokens": 6373727.0, + "reward": 0.4916015565395355, + "reward_std": 0.4747525453567505, + "rewards/execution_accuracy_EX/mean": 0.46484375, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9912569522857666, + "sampling/importance_sampling_ratio/min": 0.011183848604559898, + "sampling/sampling_logp_difference/max": 4.493284702301025, + "sampling/sampling_logp_difference/mean": 0.1230582520365715, + "step": 14 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 338.0, + "completions/max_terminated_length": 338.0, + "completions/mean_length": 190.36328125, + "completions/mean_terminated_length": 190.36328125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.16634388826787472, + "epoch": 0.02654867256637168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4612921435624558, + "learning_rate": 2.456140350877193e-07, + "loss": 0.0042, + "num_tokens": 6859052.0, + "reward": 0.6363281011581421, + "reward_std": 0.46267399191856384, + "rewards/execution_accuracy_EX/mean": 0.6171875, + "rewards/execution_accuracy_EX/std": 0.48702529072761536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9878740906715393, + "sampling/importance_sampling_ratio/min": 0.014339085668325424, + "sampling/sampling_logp_difference/max": 4.2447662353515625, + "sampling/sampling_logp_difference/mean": 0.13844618201255798, + "step": 15 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 203.71875, + "completions/mean_terminated_length": 203.71875, + "completions/min_length": 96.0, + "completions/min_terminated_length": 96.0, + "entropy": 0.15448766387999058, + "epoch": 0.02831858407079646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28992343133582726, + "learning_rate": 2.631578947368421e-07, + "loss": -0.0047, + "num_tokens": 7186676.0, + "reward": 0.45820313692092896, + "reward_std": 0.4712011218070984, + "rewards/execution_accuracy_EX/mean": 0.4296875, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9857417941093445, + "sampling/importance_sampling_ratio/min": 0.014740431681275368, + "sampling/sampling_logp_difference/max": 4.217161178588867, + "sampling/sampling_logp_difference/mean": 0.13474053144454956, + "step": 16 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 216.94140625, + "completions/mean_terminated_length": 216.94140625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.15913797728717327, + "epoch": 0.03008849557522124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22356548494377418, + "learning_rate": 2.807017543859649e-07, + "loss": -0.004, + "num_tokens": 7617845.0, + "reward": 0.6585937738418579, + "reward_std": 0.4567192792892456, + "rewards/execution_accuracy_EX/mean": 0.640625, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9925726056098938, + "sampling/importance_sampling_ratio/min": 0.008900564163923264, + "sampling/sampling_logp_difference/max": 4.721640586853027, + "sampling/sampling_logp_difference/mean": 0.12318142503499985, + "step": 17 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 245.546875, + "completions/mean_terminated_length": 245.546875, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.1641867607831955, + "epoch": 0.03185840707964602, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3618873588349014, + "learning_rate": 2.982456140350877e-07, + "loss": 0.0014, + "num_tokens": 8243361.0, + "reward": 0.37285155057907104, + "reward_std": 0.4508545994758606, + "rewards/execution_accuracy_EX/mean": 0.33984375, + "rewards/execution_accuracy_EX/std": 0.47458380460739136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.991119921207428, + "sampling/importance_sampling_ratio/min": 0.0025241519324481487, + "sampling/sampling_logp_difference/max": 5.9818501472473145, + "sampling/sampling_logp_difference/mean": 0.12859690189361572, + "step": 18 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 410.0, + "completions/max_terminated_length": 410.0, + "completions/mean_length": 210.50390625, + "completions/mean_terminated_length": 210.50390625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.15631975419819355, + "epoch": 0.033628318584070796, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37756396523373564, + "learning_rate": 3.157894736842105e-07, + "loss": 0.0031, + "num_tokens": 8973794.0, + "reward": 0.6437499523162842, + "reward_std": 0.4608176648616791, + "rewards/execution_accuracy_EX/mean": 0.625, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.988776445388794, + "sampling/importance_sampling_ratio/min": 0.0143876438960433, + "sampling/sampling_logp_difference/max": 4.241385459899902, + "sampling/sampling_logp_difference/mean": 0.1299668550491333, + "step": 19 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 221.0625, + "completions/mean_terminated_length": 221.0625, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.16394091956317425, + "epoch": 0.035398230088495575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.330010003255172, + "learning_rate": 3.333333333333333e-07, + "loss": 0.0028, + "num_tokens": 9432866.0, + "reward": 0.3246093690395355, + "reward_std": 0.4315042495727539, + "rewards/execution_accuracy_EX/mean": 0.2890625, + "rewards/execution_accuracy_EX/std": 0.45421501994132996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9900156259536743, + "sampling/importance_sampling_ratio/min": 0.014467723667621613, + "sampling/sampling_logp_difference/max": 4.235835075378418, + "sampling/sampling_logp_difference/mean": 0.13131310045719147, + "step": 20 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 228.7734375, + "completions/mean_terminated_length": 228.7734375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.1680222861468792, + "epoch": 0.03716814159292035, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28002706033303626, + "learning_rate": 3.508771929824561e-07, + "loss": 0.0017, + "num_tokens": 9927112.0, + "reward": 0.44707030057907104, + "reward_std": 0.46948158740997314, + "rewards/execution_accuracy_EX/mean": 0.41796875, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.991767406463623, + "sampling/importance_sampling_ratio/min": 0.014291780069470406, + "sampling/sampling_logp_difference/max": 4.24807071685791, + "sampling/sampling_logp_difference/mean": 0.13199591636657715, + "step": 21 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 429.0, + "completions/max_terminated_length": 429.0, + "completions/mean_length": 232.265625, + "completions/mean_terminated_length": 232.265625, + "completions/min_length": 124.0, + "completions/min_terminated_length": 124.0, + "entropy": 0.16237134858965874, + "epoch": 0.03893805309734513, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4130597941325235, + "learning_rate": 3.684210526315789e-07, + "loss": -0.0038, + "num_tokens": 10399820.0, + "reward": 0.4136718809604645, + "reward_std": 0.46267399191856384, + "rewards/execution_accuracy_EX/mean": 0.3828125, + "rewards/execution_accuracy_EX/std": 0.48702529072761536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.993659257888794, + "sampling/importance_sampling_ratio/min": 0.008661825209856033, + "sampling/sampling_logp_difference/max": 4.7488298416137695, + "sampling/sampling_logp_difference/mean": 0.1262757033109665, + "step": 22 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 200.19140625, + "completions/mean_terminated_length": 200.19140625, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.15257065370678902, + "epoch": 0.04070796460176991, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2539925648836394, + "learning_rate": 3.859649122807017e-07, + "loss": -0.0034, + "num_tokens": 10959933.0, + "reward": 0.6029297113418579, + "reward_std": 0.46948155760765076, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9871996641159058, + "sampling/importance_sampling_ratio/min": 0.01430963259190321, + "sampling/sampling_logp_difference/max": 4.246822357177734, + "sampling/sampling_logp_difference/mean": 0.13107885420322418, + "step": 23 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 198.0703125, + "completions/mean_terminated_length": 198.0703125, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.15924336947500706, + "epoch": 0.04247787610619469, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2149158834516334, + "learning_rate": 4.035087719298245e-07, + "loss": -0.002, + "num_tokens": 11359039.0, + "reward": 0.4693359434604645, + "reward_std": 0.4726512134075165, + "rewards/execution_accuracy_EX/mean": 0.44140625, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.988457977771759, + "sampling/importance_sampling_ratio/min": 0.00676548620685935, + "sampling/sampling_logp_difference/max": 4.9959211349487305, + "sampling/sampling_logp_difference/mean": 0.13254688680171967, + "step": 24 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 229.98046875, + "completions/mean_terminated_length": 229.98046875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.16456064768135548, + "epoch": 0.04424778761061947, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4167611685117223, + "learning_rate": 4.2105263157894733e-07, + "loss": -0.012, + "num_tokens": 11779002.0, + "reward": 0.369140625, + "reward_std": 0.44958022236824036, + "rewards/execution_accuracy_EX/mean": 0.3359375, + "rewards/execution_accuracy_EX/std": 0.4732423722743988, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9894763231277466, + "sampling/importance_sampling_ratio/min": 0.018519118428230286, + "sampling/sampling_logp_difference/max": 3.9889516830444336, + "sampling/sampling_logp_difference/mean": 0.13387247920036316, + "step": 25 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 592.0, + "completions/max_terminated_length": 592.0, + "completions/mean_length": 234.1796875, + "completions/mean_terminated_length": 234.1796875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.15004423819482327, + "epoch": 0.04601769911504425, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0, + "learning_rate": 4.3859649122807013e-07, + "loss": 0.0, + "num_tokens": 12292088.0, + "reward": 0.6437499523162842, + "reward_std": 0.4608176648616791, + "rewards/execution_accuracy_EX/mean": 0.625, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9890333414077759, + "sampling/importance_sampling_ratio/min": 0.0052689663134515285, + "sampling/sampling_logp_difference/max": 5.2459211349487305, + "sampling/sampling_logp_difference/mean": 0.1237257719039917, + "step": 26 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 219.54296875, + "completions/mean_terminated_length": 219.54296875, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.15362016297876835, + "epoch": 0.047787610619469026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26895058438899755, + "learning_rate": 4.5614035087719294e-07, + "loss": 0.0031, + "num_tokens": 12736963.0, + "reward": 0.6957031488418579, + "reward_std": 0.44413506984710693, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9897770285606384, + "sampling/importance_sampling_ratio/min": 0.004099072422832251, + "sampling/sampling_logp_difference/max": 5.496994495391846, + "sampling/sampling_logp_difference/mean": 0.13121642172336578, + "step": 27 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 208.80078125, + "completions/mean_terminated_length": 208.80078125, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.16322246752679348, + "epoch": 0.049557522123893805, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4795479621446814, + "learning_rate": 4.7368421052631574e-07, + "loss": 0.0137, + "num_tokens": 13086768.0, + "reward": 0.5064452886581421, + "reward_std": 0.4755672216415405, + "rewards/execution_accuracy_EX/mean": 0.48046875, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9900233149528503, + "sampling/importance_sampling_ratio/min": 0.0067833466455340385, + "sampling/sampling_logp_difference/max": 4.993284702301025, + "sampling/sampling_logp_difference/mean": 0.13302835822105408, + "step": 28 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 225.50390625, + "completions/mean_terminated_length": 225.50390625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.15761182643473148, + "epoch": 0.05132743362831858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24046164282219176, + "learning_rate": 4.912280701754385e-07, + "loss": -0.0056, + "num_tokens": 13506401.0, + "reward": 0.632617175579071, + "reward_std": 0.46355465054512024, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9904501438140869, + "sampling/importance_sampling_ratio/min": 0.014360358938574791, + "sampling/sampling_logp_difference/max": 4.243283748626709, + "sampling/sampling_logp_difference/mean": 0.12762247025966644, + "step": 29 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 399.0, + "completions/max_terminated_length": 399.0, + "completions/mean_length": 217.421875, + "completions/mean_terminated_length": 217.421875, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.15651825070381165, + "epoch": 0.05309734513274336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3764552993060396, + "learning_rate": 5.087719298245614e-07, + "loss": 0.0011, + "num_tokens": 13861437.0, + "reward": 0.44707030057907104, + "reward_std": 0.46948155760765076, + "rewards/execution_accuracy_EX/mean": 0.41796875, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9907035827636719, + "sampling/importance_sampling_ratio/min": 0.008661828935146332, + "sampling/sampling_logp_difference/max": 4.748829364776611, + "sampling/sampling_logp_difference/mean": 0.1288561224937439, + "step": 30 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 257.89453125, + "completions/mean_terminated_length": 257.89453125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.1710120104253292, + "epoch": 0.05486725663716814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26678307296231335, + "learning_rate": 5.263157894736842e-07, + "loss": -0.0016, + "num_tokens": 14277938.0, + "reward": 0.32832032442092896, + "reward_std": 0.4332149624824524, + "rewards/execution_accuracy_EX/mean": 0.29296875, + "rewards/execution_accuracy_EX/std": 0.45601576566696167, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9946006536483765, + "sampling/importance_sampling_ratio/min": 0.01841953955590725, + "sampling/sampling_logp_difference/max": 3.9943432807922363, + "sampling/sampling_logp_difference/mean": 0.12613239884376526, + "step": 31 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 221.19140625, + "completions/mean_terminated_length": 221.19140625, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.1595402956008911, + "epoch": 0.05663716814159292, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19593070618767436, + "learning_rate": 5.43859649122807e-07, + "loss": -0.0014, + "num_tokens": 14805619.0, + "reward": 0.4878906011581421, + "reward_std": 0.47447580099105835, + "rewards/execution_accuracy_EX/mean": 0.4609375, + "rewards/execution_accuracy_EX/std": 0.4994482398033142, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9907799959182739, + "sampling/importance_sampling_ratio/min": 0.014339092187583447, + "sampling/sampling_logp_difference/max": 4.244765758514404, + "sampling/sampling_logp_difference/mean": 0.1260884702205658, + "step": 32 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 235.86328125, + "completions/mean_terminated_length": 235.86328125, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.16799591109156609, + "epoch": 0.0584070796460177, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22532606473149772, + "learning_rate": 5.614035087719298e-07, + "loss": 0.0039, + "num_tokens": 15278592.0, + "reward": 0.5843749642372131, + "reward_std": 0.47219759225845337, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9923399686813354, + "sampling/importance_sampling_ratio/min": 0.008697102777659893, + "sampling/sampling_logp_difference/max": 4.744765281677246, + "sampling/sampling_logp_difference/mean": 0.13059130311012268, + "step": 33 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 357.0, + "completions/max_terminated_length": 357.0, + "completions/mean_length": 213.27734375, + "completions/mean_terminated_length": 213.27734375, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.15989402122795582, + "epoch": 0.06017699115044248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27799309147935464, + "learning_rate": 5.789473684210526e-07, + "loss": -0.0052, + "num_tokens": 15746663.0, + "reward": 0.6732421517372131, + "reward_std": 0.4523758888244629, + "rewards/execution_accuracy_EX/mean": 0.65625, + "rewards/execution_accuracy_EX/std": 0.47588926553726196, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9851380586624146, + "sampling/importance_sampling_ratio/min": 0.011232429184019566, + "sampling/sampling_logp_difference/max": 4.488950252532959, + "sampling/sampling_logp_difference/mean": 0.13754978775978088, + "step": 34 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 477.0, + "completions/max_terminated_length": 477.0, + "completions/mean_length": 207.76953125, + "completions/mean_terminated_length": 207.76953125, + "completions/min_length": 91.0, + "completions/min_terminated_length": 91.0, + "entropy": 0.153695460408926, + "epoch": 0.061946902654867256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29407859317224894, + "learning_rate": 5.964912280701754e-07, + "loss": 0.0029, + "num_tokens": 16104268.0, + "reward": 0.521289050579071, + "reward_std": 0.47591590881347656, + "rewards/execution_accuracy_EX/mean": 0.49609375, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990626871585846, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.12581679224967957, + "step": 35 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 229.19140625, + "completions/mean_terminated_length": 229.19140625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.1576980296522379, + "epoch": 0.06371681415929203, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29319992058837874, + "learning_rate": 6.140350877192982e-07, + "loss": -0.0031, + "num_tokens": 16453741.0, + "reward": 0.3951171934604645, + "reward_std": 0.45779263973236084, + "rewards/execution_accuracy_EX/mean": 0.36328125, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9867162108421326, + "sampling/importance_sampling_ratio/min": 0.011136544868350029, + "sampling/sampling_logp_difference/max": 4.497523307800293, + "sampling/sampling_logp_difference/mean": 0.13592344522476196, + "step": 36 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 214.5390625, + "completions/mean_terminated_length": 214.5390625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.1740215141326189, + "epoch": 0.06548672566371681, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34071497092016156, + "learning_rate": 6.31578947368421e-07, + "loss": -0.0136, + "num_tokens": 16972215.0, + "reward": 0.49531248211860657, + "reward_std": 0.47499996423721313, + "rewards/execution_accuracy_EX/mean": 0.46875, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9905092120170593, + "sampling/importance_sampling_ratio/min": 0.005264220293611288, + "sampling/sampling_logp_difference/max": 5.246822357177734, + "sampling/sampling_logp_difference/mean": 0.1363052874803543, + "step": 37 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 236.515625, + "completions/mean_terminated_length": 236.515625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.16431527957320213, + "epoch": 0.06725663716814159, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17369171883723106, + "learning_rate": 6.491228070175438e-07, + "loss": -0.0015, + "num_tokens": 17443371.0, + "reward": 0.5732421875, + "reward_std": 0.4734695255756378, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9886480569839478, + "sampling/importance_sampling_ratio/min": 0.008687056601047516, + "sampling/sampling_logp_difference/max": 4.7459211349487305, + "sampling/sampling_logp_difference/mean": 0.1330389678478241, + "step": 38 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 209.31640625, + "completions/mean_terminated_length": 209.31640625, + "completions/min_length": 77.0, + "completions/min_terminated_length": 77.0, + "entropy": 0.15421190671622753, + "epoch": 0.06902654867256637, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3589419087860088, + "learning_rate": 6.666666666666666e-07, + "loss": 0.0042, + "num_tokens": 17979468.0, + "reward": 0.5806640386581421, + "reward_std": 0.4726512134075165, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990709662437439, + "sampling/importance_sampling_ratio/min": 0.011155104264616966, + "sampling/sampling_logp_difference/max": 4.495858192443848, + "sampling/sampling_logp_difference/mean": 0.1269502341747284, + "step": 39 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 210.71484375, + "completions/mean_terminated_length": 210.71484375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.16004453226923943, + "epoch": 0.07079646017699115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49120329671078605, + "learning_rate": 6.842105263157895e-07, + "loss": 0.0094, + "num_tokens": 18319395.0, + "reward": 0.6585937738418579, + "reward_std": 0.456719309091568, + "rewards/execution_accuracy_EX/mean": 0.640625, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9913509488105774, + "sampling/importance_sampling_ratio/min": 0.011126003228127956, + "sampling/sampling_logp_difference/max": 4.498470306396484, + "sampling/sampling_logp_difference/mean": 0.12969903647899628, + "step": 40 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 427.0, + "completions/max_terminated_length": 427.0, + "completions/mean_length": 206.0234375, + "completions/mean_terminated_length": 206.0234375, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.16033608093857765, + "epoch": 0.07256637168141593, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4078587228416583, + "learning_rate": 7.017543859649122e-07, + "loss": -0.013, + "num_tokens": 18998921.0, + "reward": 0.6509765386581421, + "reward_std": 0.4586948752403259, + "rewards/execution_accuracy_EX/mean": 0.6328125, + "rewards/execution_accuracy_EX/std": 0.48298248648643494, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990849494934082, + "sampling/importance_sampling_ratio/min": 0.0024824803695082664, + "sampling/sampling_logp_difference/max": 5.998497009277344, + "sampling/sampling_logp_difference/mean": 0.1328970044851303, + "step": 41 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 213.39453125, + "completions/mean_terminated_length": 213.39453125, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.15308724157512188, + "epoch": 0.0743362831858407, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4876723398817694, + "learning_rate": 7.192982456140351e-07, + "loss": 0.0108, + "num_tokens": 19372830.0, + "reward": 0.4916015565395355, + "reward_std": 0.4747525453567505, + "rewards/execution_accuracy_EX/mean": 0.46484375, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.98912513256073, + "sampling/importance_sampling_ratio/min": 0.014310465194284916, + "sampling/sampling_logp_difference/max": 4.246764183044434, + "sampling/sampling_logp_difference/mean": 0.1300293505191803, + "step": 42 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 235.95703125, + "completions/mean_terminated_length": 235.95703125, + "completions/min_length": 99.0, + "completions/min_terminated_length": 99.0, + "entropy": 0.1614510864019394, + "epoch": 0.07610619469026549, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2473094768090876, + "learning_rate": 7.368421052631578e-07, + "loss": 0.0057, + "num_tokens": 19849571.0, + "reward": 0.32832029461860657, + "reward_std": 0.4332149624824524, + "rewards/execution_accuracy_EX/mean": 0.29296875, + "rewards/execution_accuracy_EX/std": 0.45601576566696167, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9908735752105713, + "sampling/importance_sampling_ratio/min": 0.011183848604559898, + "sampling/sampling_logp_difference/max": 4.493284702301025, + "sampling/sampling_logp_difference/mean": 0.1290094405412674, + "step": 43 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 366.0, + "completions/max_terminated_length": 366.0, + "completions/mean_length": 209.10546875, + "completions/mean_terminated_length": 209.10546875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.16792303882539272, + "epoch": 0.07787610619469026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6490977517753318, + "learning_rate": 7.543859649122807e-07, + "loss": 0.0234, + "num_tokens": 20362238.0, + "reward": 0.7365233898162842, + "reward_std": 0.4261363446712494, + "rewards/execution_accuracy_EX/mean": 0.72265625, + "rewards/execution_accuracy_EX/std": 0.4485645890235901, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9866508841514587, + "sampling/importance_sampling_ratio/min": 0.008679235354065895, + "sampling/sampling_logp_difference/max": 4.746821880340576, + "sampling/sampling_logp_difference/mean": 0.1376384198665619, + "step": 44 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 229.12890625, + "completions/mean_terminated_length": 229.12890625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.17156759835779667, + "epoch": 0.07964601769911504, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2624399039112141, + "learning_rate": 7.719298245614034e-07, + "loss": 0.0067, + "num_tokens": 20861359.0, + "reward": 0.47675782442092896, + "reward_std": 0.47346949577331543, + "rewards/execution_accuracy_EX/mean": 0.44921875, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.994779109954834, + "sampling/importance_sampling_ratio/min": 0.014779280871152878, + "sampling/sampling_logp_difference/max": 4.214529037475586, + "sampling/sampling_logp_difference/mean": 0.13185223937034607, + "step": 45 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 212.03515625, + "completions/mean_terminated_length": 212.03515625, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.15802641212940216, + "epoch": 0.08141592920353982, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3114338856066691, + "learning_rate": 7.894736842105263e-07, + "loss": 0.0052, + "num_tokens": 21430072.0, + "reward": 0.49531251192092896, + "reward_std": 0.4749999940395355, + "rewards/execution_accuracy_EX/mean": 0.46875, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9872924089431763, + "sampling/importance_sampling_ratio/min": 0.005292925983667374, + "sampling/sampling_logp_difference/max": 5.241384029388428, + "sampling/sampling_logp_difference/mean": 0.13467440009117126, + "step": 46 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 195.765625, + "completions/mean_terminated_length": 195.765625, + "completions/min_length": 85.0, + "completions/min_terminated_length": 85.0, + "entropy": 0.15841412916779518, + "epoch": 0.0831858407079646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2544691175388164, + "learning_rate": 8.07017543859649e-07, + "loss": 0.0084, + "num_tokens": 21939036.0, + "reward": 0.49531251192092896, + "reward_std": 0.4749999940395355, + "rewards/execution_accuracy_EX/mean": 0.46875, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9892590641975403, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.13591071963310242, + "step": 47 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 375.0, + "completions/max_terminated_length": 375.0, + "completions/mean_length": 222.29296875, + "completions/mean_terminated_length": 222.29296875, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.16205408424139023, + "epoch": 0.08495575221238938, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4450917013702582, + "learning_rate": 8.245614035087719e-07, + "loss": 0.0135, + "num_tokens": 22428375.0, + "reward": 0.569531261920929, + "reward_std": 0.4738343358039856, + "rewards/execution_accuracy_EX/mean": 0.546875, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9925086498260498, + "sampling/importance_sampling_ratio/min": 0.018390489742159843, + "sampling/sampling_logp_difference/max": 3.9959216117858887, + "sampling/sampling_logp_difference/mean": 0.1286633014678955, + "step": 48 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 335.0, + "completions/max_terminated_length": 335.0, + "completions/mean_length": 218.953125, + "completions/mean_terminated_length": 218.953125, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.14652677439153194, + "epoch": 0.08672566371681416, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36398969040701207, + "learning_rate": 8.421052631578947e-07, + "loss": 0.0007, + "num_tokens": 22773851.0, + "reward": 0.45820313692092896, + "reward_std": 0.471201092004776, + "rewards/execution_accuracy_EX/mean": 0.4296875, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9920421838760376, + "sampling/importance_sampling_ratio/min": 0.00559335108846426, + "sampling/sampling_logp_difference/max": 5.186176776885986, + "sampling/sampling_logp_difference/mean": 0.12167568504810333, + "step": 49 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 228.1875, + "completions/mean_terminated_length": 228.1875, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.16296331398189068, + "epoch": 0.08849557522123894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3706959212541897, + "learning_rate": 8.596491228070175e-07, + "loss": -0.0071, + "num_tokens": 23245995.0, + "reward": 0.5138671398162842, + "reward_std": 0.47579970955848694, + "rewards/execution_accuracy_EX/mean": 0.48828125, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.990206241607666, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.1312432885169983, + "step": 50 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 360.0, + "completions/max_terminated_length": 360.0, + "completions/mean_length": 193.80078125, + "completions/mean_terminated_length": 193.80078125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.14984596893191338, + "epoch": 0.09026548672566372, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4595076261204974, + "learning_rate": 8.771929824561403e-07, + "loss": 0.013, + "num_tokens": 23594776.0, + "reward": 0.755078136920929, + "reward_std": 0.41637277603149414, + "rewards/execution_accuracy_EX/mean": 0.7421875, + "rewards/execution_accuracy_EX/std": 0.4382871091365814, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.987525224685669, + "sampling/importance_sampling_ratio/min": 0.018390489742159843, + "sampling/sampling_logp_difference/max": 3.9959216117858887, + "sampling/sampling_logp_difference/mean": 0.12903794646263123, + "step": 51 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 228.0703125, + "completions/mean_terminated_length": 228.0703125, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.16216460056602955, + "epoch": 0.0920353982300885, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.352096192636124, + "learning_rate": 8.947368421052631e-07, + "loss": 0.0045, + "num_tokens": 24127690.0, + "reward": 0.6585937738418579, + "reward_std": 0.456719309091568, + "rewards/execution_accuracy_EX/mean": 0.640625, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.989894449710846, + "sampling/importance_sampling_ratio/min": 0.006812799721956253, + "sampling/sampling_logp_difference/max": 4.988952159881592, + "sampling/sampling_logp_difference/mean": 0.13485217094421387, + "step": 52 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 409.0, + "completions/max_terminated_length": 409.0, + "completions/mean_length": 209.2734375, + "completions/mean_terminated_length": 209.2734375, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.1585012786090374, + "epoch": 0.09380530973451327, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2009889434955974, + "learning_rate": 9.122807017543859e-07, + "loss": -0.008, + "num_tokens": 24534224.0, + "reward": 0.6548827886581421, + "reward_std": 0.45779263973236084, + "rewards/execution_accuracy_EX/mean": 0.63671875, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9906299710273743, + "sampling/importance_sampling_ratio/min": 0.008700824342668056, + "sampling/sampling_logp_difference/max": 4.744337558746338, + "sampling/sampling_logp_difference/mean": 0.13042978942394257, + "step": 53 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 204.09765625, + "completions/mean_terminated_length": 204.09765625, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.14675415121018887, + "epoch": 0.09557522123893805, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34401304041307923, + "learning_rate": 9.298245614035087e-07, + "loss": 0.0045, + "num_tokens": 24958953.0, + "reward": 0.5992187261581421, + "reward_std": 0.47008487582206726, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9833692312240601, + "sampling/importance_sampling_ratio/min": 0.008987164124846458, + "sampling/sampling_logp_difference/max": 4.711957931518555, + "sampling/sampling_logp_difference/mean": 0.13392765820026398, + "step": 54 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 422.0, + "completions/max_terminated_length": 422.0, + "completions/mean_length": 207.64453125, + "completions/mean_terminated_length": 207.64453125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.15594125725328922, + "epoch": 0.09734513274336283, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3232663547794188, + "learning_rate": 9.473684210526315e-07, + "loss": -0.0031, + "num_tokens": 25571086.0, + "reward": 0.5361328125, + "reward_std": 0.47579970955848694, + "rewards/execution_accuracy_EX/mean": 0.51171875, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9889358878135681, + "sampling/importance_sampling_ratio/min": 0.014340179972350597, + "sampling/sampling_logp_difference/max": 4.24468994140625, + "sampling/sampling_logp_difference/mean": 0.12645173072814941, + "step": 55 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 216.69140625, + "completions/mean_terminated_length": 216.69140625, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.16085772216320038, + "epoch": 0.09911504424778761, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3271752358225103, + "learning_rate": 9.649122807017545e-07, + "loss": 0.0044, + "num_tokens": 25971743.0, + "reward": 0.47675782442092896, + "reward_std": 0.47346949577331543, + "rewards/execution_accuracy_EX/mean": 0.44921875, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9919253587722778, + "sampling/importance_sampling_ratio/min": 0.006748078390955925, + "sampling/sampling_logp_difference/max": 4.998497486114502, + "sampling/sampling_logp_difference/mean": 0.128218412399292, + "step": 56 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 243.22265625, + "completions/mean_terminated_length": 243.22265625, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.15728727541863918, + "epoch": 0.10088495575221239, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29179080050168243, + "learning_rate": 9.82456140350877e-07, + "loss": 0.0059, + "num_tokens": 26556936.0, + "reward": 0.4507812559604645, + "reward_std": 0.47008487582206726, + "rewards/execution_accuracy_EX/mean": 0.421875, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9931042194366455, + "sampling/importance_sampling_ratio/min": 0.014340603724122047, + "sampling/sampling_logp_difference/max": 4.244660377502441, + "sampling/sampling_logp_difference/mean": 0.12253376841545105, + "step": 57 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 217.87109375, + "completions/mean_terminated_length": 217.87109375, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.15230992063879967, + "epoch": 0.10265486725663717, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5333505943398746, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 27050215.0, + "reward": 0.6734374761581421, + "reward_std": 0.45209482312202454, + "rewards/execution_accuracy_EX/mean": 0.65625, + "rewards/execution_accuracy_EX/std": 0.47588926553726196, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9868412017822266, + "sampling/importance_sampling_ratio/min": 0.011183848604559898, + "sampling/sampling_logp_difference/max": 4.493284702301025, + "sampling/sampling_logp_difference/mean": 0.1287211775779724, + "step": 58 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 222.07421875, + "completions/mean_terminated_length": 222.07421875, + "completions/min_length": 94.0, + "completions/min_terminated_length": 94.0, + "entropy": 0.15885837003588676, + "epoch": 0.10442477876106195, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3224764308639901, + "learning_rate": 1e-06, + "loss": 0.0099, + "num_tokens": 27697418.0, + "reward": 0.48417967557907104, + "reward_std": 0.4741697311401367, + "rewards/execution_accuracy_EX/mean": 0.45703125, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9875861406326294, + "sampling/importance_sampling_ratio/min": 0.011184130795300007, + "sampling/sampling_logp_difference/max": 4.493259429931641, + "sampling/sampling_logp_difference/mean": 0.13573023676872253, + "step": 59 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 201.859375, + "completions/mean_terminated_length": 201.859375, + "completions/min_length": 84.0, + "completions/min_terminated_length": 84.0, + "entropy": 0.15865694545209408, + "epoch": 0.10619469026548672, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.339729340208444, + "learning_rate": 1e-06, + "loss": -0.0034, + "num_tokens": 28109862.0, + "reward": 0.8070312738418579, + "reward_std": 0.38295724987983704, + "rewards/execution_accuracy_EX/mean": 0.796875, + "rewards/execution_accuracy_EX/std": 0.40311288833618164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9855418801307678, + "sampling/importance_sampling_ratio/min": 0.004103474784642458, + "sampling/sampling_logp_difference/max": 5.4959211349487305, + "sampling/sampling_logp_difference/mean": 0.1409788727760315, + "step": 60 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 206.37109375, + "completions/mean_terminated_length": 206.37109375, + "completions/min_length": 93.0, + "completions/min_terminated_length": 93.0, + "entropy": 0.14121604897081852, + "epoch": 0.1079646017699115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2794506948692285, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 28482789.0, + "reward": 0.40996092557907104, + "reward_std": 0.4617617130279541, + "rewards/execution_accuracy_EX/mean": 0.37890625, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9912207126617432, + "sampling/importance_sampling_ratio/min": 0.008697095327079296, + "sampling/sampling_logp_difference/max": 4.7447662353515625, + "sampling/sampling_logp_difference/mean": 0.11767181754112244, + "step": 61 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 201.08984375, + "completions/mean_terminated_length": 201.08984375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.1559954546391964, + "epoch": 0.10973451327433628, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.6063657467020576, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 28949948.0, + "reward": 0.294921875, + "reward_std": 0.41637277603149414, + "rewards/execution_accuracy_EX/mean": 0.2578125, + "rewards/execution_accuracy_EX/std": 0.4382871091365814, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9887830018997192, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.13012146949768066, + "step": 62 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 364.0, + "completions/max_terminated_length": 364.0, + "completions/mean_length": 211.2734375, + "completions/mean_terminated_length": 211.2734375, + "completions/min_length": 97.0, + "completions/min_terminated_length": 97.0, + "entropy": 0.15071019157767296, + "epoch": 0.11150442477876106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27922915476043525, + "learning_rate": 1e-06, + "loss": -0.004, + "num_tokens": 29324226.0, + "reward": 0.6808593273162842, + "reward_std": 0.44958025217056274, + "rewards/execution_accuracy_EX/mean": 0.6640625, + "rewards/execution_accuracy_EX/std": 0.4732423722743988, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9868204593658447, + "sampling/importance_sampling_ratio/min": 0.014291773550212383, + "sampling/sampling_logp_difference/max": 4.248071193695068, + "sampling/sampling_logp_difference/mean": 0.1329112946987152, + "step": 63 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 211.91015625, + "completions/mean_terminated_length": 211.91015625, + "completions/min_length": 95.0, + "completions/min_terminated_length": 95.0, + "entropy": 0.15028476156294346, + "epoch": 0.11327433628318584, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21032008894745782, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 29642747.0, + "reward": 0.6845703125, + "reward_std": 0.44827139377593994, + "rewards/execution_accuracy_EX/mean": 0.66796875, + "rewards/execution_accuracy_EX/std": 0.4718646705150604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9902558922767639, + "sampling/importance_sampling_ratio/min": 0.011159200221300125, + "sampling/sampling_logp_difference/max": 4.495491027832031, + "sampling/sampling_logp_difference/mean": 0.12290841341018677, + "step": 64 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 383.0, + "completions/max_terminated_length": 383.0, + "completions/mean_length": 209.05078125, + "completions/mean_terminated_length": 209.05078125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.15721244923770428, + "epoch": 0.11504424778761062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.536090475913442, + "learning_rate": 1e-06, + "loss": -0.0127, + "num_tokens": 30274040.0, + "reward": 0.539843738079071, + "reward_std": 0.47569799423217773, + "rewards/execution_accuracy_EX/mean": 0.515625, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9889378547668457, + "sampling/importance_sampling_ratio/min": 0.0143876438960433, + "sampling/sampling_logp_difference/max": 4.241385459899902, + "sampling/sampling_logp_difference/mean": 0.1261480450630188, + "step": 65 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 398.0, + "completions/max_terminated_length": 398.0, + "completions/mean_length": 200.1640625, + "completions/mean_terminated_length": 200.1640625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.16187943145632744, + "epoch": 0.1168141592920354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46705033131972595, + "learning_rate": 1e-06, + "loss": -0.0057, + "num_tokens": 30624178.0, + "reward": 0.866406261920929, + "reward_std": 0.33089950680732727, + "rewards/execution_accuracy_EX/mean": 0.859375, + "rewards/execution_accuracy_EX/std": 0.3483152687549591, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9888778924942017, + "sampling/importance_sampling_ratio/min": 0.0052929287776350975, + "sampling/sampling_logp_difference/max": 5.2413835525512695, + "sampling/sampling_logp_difference/mean": 0.13253706693649292, + "step": 66 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 371.0, + "completions/max_terminated_length": 371.0, + "completions/mean_length": 218.359375, + "completions/mean_terminated_length": 218.359375, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.15289377607405186, + "epoch": 0.11858407079646018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3026216260708256, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 30960318.0, + "reward": 0.7291015386581421, + "reward_std": 0.4297545850276947, + "rewards/execution_accuracy_EX/mean": 0.71484375, + "rewards/execution_accuracy_EX/std": 0.4523732364177704, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9879290461540222, + "sampling/importance_sampling_ratio/min": 0.008726546540856361, + "sampling/sampling_logp_difference/max": 4.741385459899902, + "sampling/sampling_logp_difference/mean": 0.12670288980007172, + "step": 67 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 319.0, + "completions/max_terminated_length": 319.0, + "completions/mean_length": 208.44921875, + "completions/mean_terminated_length": 208.44921875, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.15477639250457287, + "epoch": 0.12035398230088495, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4698306933028935, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 31364753.0, + "reward": 0.6919921636581421, + "reward_std": 0.4455491304397583, + "rewards/execution_accuracy_EX/mean": 0.67578125, + "rewards/execution_accuracy_EX/std": 0.46899911761283875, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9852240085601807, + "sampling/importance_sampling_ratio/min": 0.006765482947230339, + "sampling/sampling_logp_difference/max": 4.995921611785889, + "sampling/sampling_logp_difference/mean": 0.1278304159641266, + "step": 68 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 492.0, + "completions/max_terminated_length": 492.0, + "completions/mean_length": 241.53515625, + "completions/mean_terminated_length": 241.53515625, + "completions/min_length": 89.0, + "completions/min_terminated_length": 89.0, + "entropy": 0.15711761824786663, + "epoch": 0.12212389380530973, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5749773994372718, + "learning_rate": 1e-06, + "loss": -0.005, + "num_tokens": 32008762.0, + "reward": 0.517578125, + "reward_std": 0.47587236762046814, + "rewards/execution_accuracy_EX/mean": 0.4921875, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.989099383354187, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.12306369096040726, + "step": 69 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 381.0, + "completions/max_terminated_length": 381.0, + "completions/mean_length": 200.6640625, + "completions/mean_terminated_length": 200.6640625, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.15642414800822735, + "epoch": 0.12389380530973451, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.41019204934910464, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 32363556.0, + "reward": 0.699414074420929, + "reward_std": 0.44268524646759033, + "rewards/execution_accuracy_EX/mean": 0.68359375, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9870081543922424, + "sampling/importance_sampling_ratio/min": 0.018361039459705353, + "sampling/sampling_logp_difference/max": 3.9975242614746094, + "sampling/sampling_logp_difference/mean": 0.12989136576652527, + "step": 70 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 570.0, + "completions/max_terminated_length": 570.0, + "completions/mean_length": 228.4140625, + "completions/mean_terminated_length": 228.4140625, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.153860317543149, + "epoch": 0.1256637168141593, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2613845000041792, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 32825038.0, + "reward": 0.49531251192092896, + "reward_std": 0.47499996423721313, + "rewards/execution_accuracy_EX/mean": 0.46875, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9857495427131653, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.1292147934436798, + "step": 71 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 404.0, + "completions/max_terminated_length": 404.0, + "completions/mean_length": 233.3125, + "completions/mean_terminated_length": 233.3125, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.17075590044260025, + "epoch": 0.12743362831858407, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.46115708269394196, + "learning_rate": 1e-06, + "loss": -0.0077, + "num_tokens": 33636990.0, + "reward": 0.6957031488418579, + "reward_std": 0.4441350996494293, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9881327152252197, + "sampling/importance_sampling_ratio/min": 0.011183848604559898, + "sampling/sampling_logp_difference/max": 4.493284702301025, + "sampling/sampling_logp_difference/mean": 0.13605903089046478, + "step": 72 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 220.0703125, + "completions/mean_terminated_length": 220.0703125, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.15813160501420498, + "epoch": 0.12920353982300886, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27069713662807277, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 34190176.0, + "reward": 0.550976574420929, + "reward_std": 0.4752182364463806, + "rewards/execution_accuracy_EX/mean": 0.52734375, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.986320972442627, + "sampling/importance_sampling_ratio/min": 0.01118975318968296, + "sampling/sampling_logp_difference/max": 4.4927568435668945, + "sampling/sampling_logp_difference/mean": 0.13204938173294067, + "step": 73 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 508.0, + "completions/max_terminated_length": 508.0, + "completions/mean_length": 252.2890625, + "completions/mean_terminated_length": 252.2890625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.18862844444811344, + "epoch": 0.13097345132743363, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2288468255994566, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 34728666.0, + "reward": 0.643750011920929, + "reward_std": 0.46081769466400146, + "rewards/execution_accuracy_EX/mean": 0.625, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.995506763458252, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.13722500205039978, + "step": 74 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 247.48046875, + "completions/mean_terminated_length": 247.48046875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.18717484176158905, + "epoch": 0.13274336283185842, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40398866495893176, + "learning_rate": 1e-06, + "loss": -0.0027, + "num_tokens": 35264965.0, + "reward": 0.5658203363418579, + "reward_std": 0.4741697609424591, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9933390021324158, + "sampling/importance_sampling_ratio/min": 0.02364342100918293, + "sampling/sampling_logp_difference/max": 3.7446703910827637, + "sampling/sampling_logp_difference/mean": 0.13772422075271606, + "step": 75 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 462.0, + "completions/max_terminated_length": 462.0, + "completions/mean_length": 249.34765625, + "completions/mean_terminated_length": 249.34765625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.168232761323452, + "epoch": 0.13451327433628318, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3183141585051623, + "learning_rate": 1e-06, + "loss": 0.0049, + "num_tokens": 35725918.0, + "reward": 0.502734363079071, + "reward_std": 0.47540730237960815, + "rewards/execution_accuracy_EX/mean": 0.4765625, + "rewards/execution_accuracy_EX/std": 0.5004287362098694, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9941282868385315, + "sampling/importance_sampling_ratio/min": 0.012834830209612846, + "sampling/sampling_logp_difference/max": 4.355592727661133, + "sampling/sampling_logp_difference/mean": 0.12626288831233978, + "step": 76 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 244.9140625, + "completions/mean_terminated_length": 244.9140625, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.17300679720938206, + "epoch": 0.13628318584070798, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2539974690397416, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 36173432.0, + "reward": 0.6437499523162842, + "reward_std": 0.46081769466400146, + "rewards/execution_accuracy_EX/mean": 0.625, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9919909834861755, + "sampling/importance_sampling_ratio/min": 0.017717216163873672, + "sampling/sampling_logp_difference/max": 4.0332183837890625, + "sampling/sampling_logp_difference/mean": 0.1292758584022522, + "step": 77 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 243.56640625, + "completions/mean_terminated_length": 243.56640625, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.1690126322209835, + "epoch": 0.13805309734513274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2874219749877832, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 36734777.0, + "reward": 0.3505859375, + "reward_std": 0.44268524646759033, + "rewards/execution_accuracy_EX/mean": 0.31640625, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9782031774520874, + "sampling/importance_sampling_ratio/mean": 0.9859957098960876, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.13325968384742737, + "step": 78 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 439.0, + "completions/max_terminated_length": 439.0, + "completions/mean_length": 254.6875, + "completions/mean_terminated_length": 254.6875, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.18058686703443527, + "epoch": 0.13982300884955753, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3546576154835264, + "learning_rate": 1e-06, + "loss": -0.0083, + "num_tokens": 37203609.0, + "reward": 0.6029297113418579, + "reward_std": 0.46948158740997314, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9901555776596069, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.13888263702392578, + "step": 79 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 541.0, + "completions/max_terminated_length": 541.0, + "completions/mean_length": 249.5078125, + "completions/mean_terminated_length": 249.5078125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.16634603776037693, + "epoch": 0.1415929203539823, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3452826158835721, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 37865579.0, + "reward": 0.5361328125, + "reward_std": 0.47579970955848694, + "rewards/execution_accuracy_EX/mean": 0.51171875, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9880659580230713, + "sampling/importance_sampling_ratio/min": 0.011136550456285477, + "sampling/sampling_logp_difference/max": 4.497522830963135, + "sampling/sampling_logp_difference/mean": 0.133858323097229, + "step": 80 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 566.0, + "completions/max_terminated_length": 566.0, + "completions/mean_length": 243.84765625, + "completions/mean_terminated_length": 243.84765625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.17802287265658379, + "epoch": 0.1433628318584071, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09885274663321054, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 38417524.0, + "reward": 0.5806640386581421, + "reward_std": 0.4726512134075165, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.991807222366333, + "sampling/importance_sampling_ratio/min": 0.014511375688016415, + "sampling/sampling_logp_difference/max": 4.232822418212891, + "sampling/sampling_logp_difference/mean": 0.13529834151268005, + "step": 81 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 231.14453125, + "completions/mean_terminated_length": 231.14453125, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.17939144186675549, + "epoch": 0.14513274336283186, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30764387537599147, + "learning_rate": 1e-06, + "loss": 0.0048, + "num_tokens": 38935769.0, + "reward": 0.443359375, + "reward_std": 0.46884801983833313, + "rewards/execution_accuracy_EX/mean": 0.4140625, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9884452819824219, + "sampling/importance_sampling_ratio/min": 0.014291773550212383, + "sampling/sampling_logp_difference/max": 4.248071193695068, + "sampling/sampling_logp_difference/mean": 0.1392521858215332, + "step": 82 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 225.58984375, + "completions/mean_terminated_length": 225.58984375, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.1662676576524973, + "epoch": 0.14690265486725665, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.159489195026553, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 39425600.0, + "reward": 0.576953113079071, + "reward_std": 0.47307515144348145, + "rewards/execution_accuracy_EX/mean": 0.5546875, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9720790386199951, + "sampling/importance_sampling_ratio/mean": 0.9862806797027588, + "sampling/importance_sampling_ratio/min": 0.023676205426454544, + "sampling/sampling_logp_difference/max": 3.7432847023010254, + "sampling/sampling_logp_difference/mean": 0.13640713691711426, + "step": 83 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 356.0, + "completions/max_terminated_length": 356.0, + "completions/mean_length": 246.4921875, + "completions/mean_terminated_length": 246.4921875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.16606100276112556, + "epoch": 0.1486725663716814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27365668606557403, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 40151502.0, + "reward": 0.6326172351837158, + "reward_std": 0.46355465054512024, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9730298519134521, + "sampling/importance_sampling_ratio/mean": 0.9869570732116699, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.13253940641880035, + "step": 84 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 443.0, + "completions/max_terminated_length": 443.0, + "completions/mean_length": 254.39453125, + "completions/mean_terminated_length": 254.39453125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.16300375014543533, + "epoch": 0.1504424778761062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39185776025031027, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 40474675.0, + "reward": 0.6957031488418579, + "reward_std": 0.4441350996494293, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9899999499320984, + "sampling/importance_sampling_ratio/min": 0.0053865727968513966, + "sampling/sampling_logp_difference/max": 5.223845958709717, + "sampling/sampling_logp_difference/mean": 0.13284769654273987, + "step": 85 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 242.703125, + "completions/mean_terminated_length": 242.703125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.1711716093122959, + "epoch": 0.15221238938053097, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3204005745160462, + "learning_rate": 1e-06, + "loss": 0.0063, + "num_tokens": 40898775.0, + "reward": 0.5621093511581421, + "reward_std": 0.47447580099105835, + "rewards/execution_accuracy_EX/mean": 0.5390625, + "rewards/execution_accuracy_EX/std": 0.4994482398033142, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9313511848449707, + "sampling/importance_sampling_ratio/mean": 0.9848341941833496, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.14207801222801208, + "step": 86 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 228.296875, + "completions/mean_terminated_length": 228.296875, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.16796389780938625, + "epoch": 0.15398230088495576, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20056636467959543, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 41385699.0, + "reward": 0.632617175579071, + "reward_std": 0.46355465054512024, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9863157868385315, + "sampling/importance_sampling_ratio/min": 0.014360584318637848, + "sampling/sampling_logp_difference/max": 4.243268013000488, + "sampling/sampling_logp_difference/mean": 0.1338033676147461, + "step": 87 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 259.87890625, + "completions/mean_terminated_length": 259.87890625, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.16970933973789215, + "epoch": 0.15575221238938053, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4007174339861282, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 41922164.0, + "reward": 0.5658203363418579, + "reward_std": 0.4741697609424591, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9893125295639038, + "sampling/importance_sampling_ratio/min": 0.015252375043928623, + "sampling/sampling_logp_difference/max": 4.183020114898682, + "sampling/sampling_logp_difference/mean": 0.13815277814865112, + "step": 88 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 567.0, + "completions/max_terminated_length": 567.0, + "completions/mean_length": 268.3828125, + "completions/mean_terminated_length": 268.3828125, + "completions/min_length": 104.0, + "completions/min_terminated_length": 104.0, + "entropy": 0.1632604207843542, + "epoch": 0.15752212389380532, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.270997162343305, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 42477190.0, + "reward": 0.5843750238418579, + "reward_std": 0.47219759225845337, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9690579175949097, + "sampling/importance_sampling_ratio/mean": 0.9899737238883972, + "sampling/importance_sampling_ratio/min": 0.0024888834450393915, + "sampling/sampling_logp_difference/max": 5.9959211349487305, + "sampling/sampling_logp_difference/mean": 0.1313299685716629, + "step": 89 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 466.0, + "completions/max_terminated_length": 466.0, + "completions/mean_length": 275.8828125, + "completions/mean_terminated_length": 275.8828125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.16058760695159435, + "epoch": 0.1592920353982301, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27833649905689195, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 42977768.0, + "reward": 0.4990234375, + "reward_std": 0.475218266248703, + "rewards/execution_accuracy_EX/mean": 0.47265625, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9903367757797241, + "sampling/importance_sampling_ratio/min": 0.008687051944434643, + "sampling/sampling_logp_difference/max": 4.745921611785889, + "sampling/sampling_logp_difference/mean": 0.12509432435035706, + "step": 90 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 247.38671875, + "completions/mean_terminated_length": 247.38671875, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.16586022078990936, + "epoch": 0.16106194690265488, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30564575311167513, + "learning_rate": 1e-06, + "loss": -0.018, + "num_tokens": 43354875.0, + "reward": 0.614062488079071, + "reward_std": 0.46748965978622437, + "rewards/execution_accuracy_EX/mean": 0.59375, + "rewards/execution_accuracy_EX/std": 0.49209436774253845, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9854615330696106, + "sampling/importance_sampling_ratio/min": 0.004103472921997309, + "sampling/sampling_logp_difference/max": 5.495921611785889, + "sampling/sampling_logp_difference/mean": 0.13805580139160156, + "step": 91 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 886.0, + "completions/max_terminated_length": 886.0, + "completions/mean_length": 306.890625, + "completions/mean_terminated_length": 306.890625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.16652163863182068, + "epoch": 0.16283185840707964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32796255150826914, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 43919183.0, + "reward": 0.5509765148162842, + "reward_std": 0.4752182066440582, + "rewards/execution_accuracy_EX/mean": 0.52734375, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9934643507003784, + "sampling/importance_sampling_ratio/min": 0.0143876438960433, + "sampling/sampling_logp_difference/max": 4.241385459899902, + "sampling/sampling_logp_difference/mean": 0.12311001121997833, + "step": 92 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 442.0, + "completions/max_terminated_length": 442.0, + "completions/mean_length": 233.2265625, + "completions/mean_terminated_length": 233.2265625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.16081063263118267, + "epoch": 0.16460176991150444, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3731038031078675, + "learning_rate": 1e-06, + "loss": 0.0079, + "num_tokens": 44322409.0, + "reward": 0.5546875, + "reward_std": 0.47499996423721313, + "rewards/execution_accuracy_EX/mean": 0.53125, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9858866930007935, + "sampling/importance_sampling_ratio/min": 0.008679230697453022, + "sampling/sampling_logp_difference/max": 4.746822357177734, + "sampling/sampling_logp_difference/mean": 0.13660240173339844, + "step": 93 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 276.6484375, + "completions/mean_terminated_length": 276.6484375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.1680069100111723, + "epoch": 0.1663716814159292, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2638305265804409, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 44854543.0, + "reward": 0.6400390863418579, + "reward_std": 0.4617617726325989, + "rewards/execution_accuracy_EX/mean": 0.62109375, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9871376752853394, + "sampling/importance_sampling_ratio/min": 0.011312469840049744, + "sampling/sampling_logp_difference/max": 4.481849670410156, + "sampling/sampling_logp_difference/mean": 0.1372930109500885, + "step": 94 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 575.0, + "completions/max_terminated_length": 575.0, + "completions/mean_length": 259.74609375, + "completions/mean_terminated_length": 259.74609375, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.15809837356209755, + "epoch": 0.168141592920354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4702417167635799, + "learning_rate": 1e-06, + "loss": 0.0171, + "num_tokens": 45445278.0, + "reward": 0.643750011920929, + "reward_std": 0.46081769466400146, + "rewards/execution_accuracy_EX/mean": 0.625, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9890620708465576, + "sampling/importance_sampling_ratio/min": 0.014322527684271336, + "sampling/sampling_logp_difference/max": 4.245921611785889, + "sampling/sampling_logp_difference/mean": 0.12532582879066467, + "step": 95 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 423.0, + "completions/max_terminated_length": 423.0, + "completions/mean_length": 279.64453125, + "completions/mean_terminated_length": 279.64453125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.17670874670147896, + "epoch": 0.16991150442477876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2779130915503557, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 45965299.0, + "reward": 0.591796875, + "reward_std": 0.4712011218070984, + "rewards/execution_accuracy_EX/mean": 0.5703125, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9900187253952026, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.135993093252182, + "step": 96 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 247.79296875, + "completions/mean_terminated_length": 247.79296875, + "completions/min_length": 108.0, + "completions/min_terminated_length": 108.0, + "entropy": 0.1574590727686882, + "epoch": 0.17168141592920355, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4356211480436373, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 46372766.0, + "reward": 0.699414074420929, + "reward_std": 0.44268524646759033, + "rewards/execution_accuracy_EX/mean": 0.68359375, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9880787134170532, + "sampling/importance_sampling_ratio/min": 0.014309640042483807, + "sampling/sampling_logp_difference/max": 4.246821880340576, + "sampling/sampling_logp_difference/mean": 0.12860193848609924, + "step": 97 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 280.41796875, + "completions/mean_terminated_length": 280.41796875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.16018960252404213, + "epoch": 0.17345132743362832, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23488044158580396, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 46908217.0, + "reward": 0.369140625, + "reward_std": 0.44958022236824036, + "rewards/execution_accuracy_EX/mean": 0.3359375, + "rewards/execution_accuracy_EX/std": 0.4732423722743988, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9887105226516724, + "sampling/importance_sampling_ratio/min": 0.011126094497740269, + "sampling/sampling_logp_difference/max": 4.498462200164795, + "sampling/sampling_logp_difference/mean": 0.1292063593864441, + "step": 98 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 265.7734375, + "completions/mean_terminated_length": 265.7734375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.1783987581729889, + "epoch": 0.1752212389380531, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.43843478273889336, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 47390831.0, + "reward": 0.614062488079071, + "reward_std": 0.46748965978622437, + "rewards/execution_accuracy_EX/mean": 0.59375, + "rewards/execution_accuracy_EX/std": 0.49209436774253845, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9901736974716187, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.1345309317111969, + "step": 99 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 591.0, + "completions/max_terminated_length": 591.0, + "completions/mean_length": 268.265625, + "completions/mean_terminated_length": 268.265625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.16882420890033245, + "epoch": 0.17699115044247787, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2611995951717259, + "learning_rate": 1e-06, + "loss": -0.0073, + "num_tokens": 47884451.0, + "reward": 0.6994140148162842, + "reward_std": 0.44268524646759033, + "rewards/execution_accuracy_EX/mean": 0.68359375, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9882525205612183, + "sampling/importance_sampling_ratio/min": 0.008661825209856033, + "sampling/sampling_logp_difference/max": 4.7488298416137695, + "sampling/sampling_logp_difference/mean": 0.13454368710517883, + "step": 100 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 467.0, + "completions/max_terminated_length": 467.0, + "completions/mean_length": 249.390625, + "completions/mean_terminated_length": 249.390625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.1629339773207903, + "epoch": 0.17876106194690267, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30815959071529064, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 48590135.0, + "reward": 0.5138671398162842, + "reward_std": 0.4757997393608093, + "rewards/execution_accuracy_EX/mean": 0.48828125, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9862339496612549, + "sampling/importance_sampling_ratio/min": 0.0143876438960433, + "sampling/sampling_logp_difference/max": 4.241385459899902, + "sampling/sampling_logp_difference/mean": 0.1340867280960083, + "step": 101 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 268.53515625, + "completions/mean_terminated_length": 268.53515625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.17842296697199345, + "epoch": 0.18053097345132743, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2630277587397829, + "learning_rate": 1e-06, + "loss": -0.0062, + "num_tokens": 49049536.0, + "reward": 0.6326172351837158, + "reward_std": 0.46355465054512024, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9884659647941589, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.14042013883590698, + "step": 102 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 662.0, + "completions/max_terminated_length": 662.0, + "completions/mean_length": 283.96875, + "completions/mean_terminated_length": 283.96875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.1600775383412838, + "epoch": 0.18230088495575222, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1793625261389528, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 49641960.0, + "reward": 0.6177734136581421, + "reward_std": 0.46676453948020935, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9879857897758484, + "sampling/importance_sampling_ratio/min": 0.011313592083752155, + "sampling/sampling_logp_difference/max": 4.48175048828125, + "sampling/sampling_logp_difference/mean": 0.12989619374275208, + "step": 103 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 420.0, + "completions/max_terminated_length": 420.0, + "completions/mean_length": 239.52734375, + "completions/mean_terminated_length": 239.52734375, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.16701813600957394, + "epoch": 0.184070796460177, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2907646272223414, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 50205615.0, + "reward": 0.651171863079071, + "reward_std": 0.45883333683013916, + "rewards/execution_accuracy_EX/mean": 0.6328125, + "rewards/execution_accuracy_EX/std": 0.48298248648643494, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9849092960357666, + "sampling/importance_sampling_ratio/min": 0.011142690666019917, + "sampling/sampling_logp_difference/max": 4.496971607208252, + "sampling/sampling_logp_difference/mean": 0.13860410451889038, + "step": 104 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 275.90625, + "completions/mean_terminated_length": 275.90625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.16702409833669662, + "epoch": 0.18584070796460178, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09613632547729432, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 50781831.0, + "reward": 0.6957030892372131, + "reward_std": 0.4441350996494293, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9908092021942139, + "sampling/importance_sampling_ratio/min": 0.014347114600241184, + "sampling/sampling_logp_difference/max": 4.244206428527832, + "sampling/sampling_logp_difference/mean": 0.1251516193151474, + "step": 105 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 428.0, + "completions/max_terminated_length": 428.0, + "completions/mean_length": 239.59375, + "completions/mean_terminated_length": 239.59375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.16419068723917007, + "epoch": 0.18761061946902655, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39530890701734583, + "learning_rate": 1e-06, + "loss": 0.0111, + "num_tokens": 51211711.0, + "reward": 0.680859386920929, + "reward_std": 0.44958025217056274, + "rewards/execution_accuracy_EX/mean": 0.6640625, + "rewards/execution_accuracy_EX/std": 0.4732423722743988, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9864935278892517, + "sampling/importance_sampling_ratio/min": 0.014291773550212383, + "sampling/sampling_logp_difference/max": 4.248071193695068, + "sampling/sampling_logp_difference/mean": 0.13334545493125916, + "step": 106 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 391.0, + "completions/max_terminated_length": 391.0, + "completions/mean_length": 243.6328125, + "completions/mean_terminated_length": 243.6328125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.1734424475580454, + "epoch": 0.18938053097345134, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30370856163159327, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 51704449.0, + "reward": 0.7884765863418579, + "reward_std": 0.3960021138191223, + "rewards/execution_accuracy_EX/mean": 0.77734375, + "rewards/execution_accuracy_EX/std": 0.41684433817863464, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9841630458831787, + "sampling/importance_sampling_ratio/min": 0.005370927508920431, + "sampling/sampling_logp_difference/max": 5.226754665374756, + "sampling/sampling_logp_difference/mean": 0.14041289687156677, + "step": 107 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 243.91796875, + "completions/mean_terminated_length": 243.91796875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.1818134058266878, + "epoch": 0.1911504424778761, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3495723097955485, + "learning_rate": 1e-06, + "loss": -0.0063, + "num_tokens": 52143356.0, + "reward": 0.669726550579071, + "reward_std": 0.4533010721206665, + "rewards/execution_accuracy_EX/mean": 0.65234375, + "rewards/execution_accuracy_EX/std": 0.4771590530872345, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9897716045379639, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.13901342451572418, + "step": 108 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 277.91015625, + "completions/mean_terminated_length": 277.91015625, + "completions/min_length": 100.0, + "completions/min_terminated_length": 100.0, + "entropy": 0.16938803531229496, + "epoch": 0.1929203539823009, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3644169906885502, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 52718741.0, + "reward": 0.6400390863418579, + "reward_std": 0.4617617428302765, + "rewards/execution_accuracy_EX/mean": 0.62109375, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.988355815410614, + "sampling/importance_sampling_ratio/min": 0.006754652131348848, + "sampling/sampling_logp_difference/max": 4.997523784637451, + "sampling/sampling_logp_difference/mean": 0.13667330145835876, + "step": 109 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 387.0, + "completions/max_terminated_length": 387.0, + "completions/mean_length": 227.5, + "completions/mean_terminated_length": 227.5, + "completions/min_length": 92.0, + "completions/min_terminated_length": 92.0, + "entropy": 0.16376989893615246, + "epoch": 0.19469026548672566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45963110454363654, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 53235269.0, + "reward": 0.7699218988418579, + "reward_std": 0.40778404474258423, + "rewards/execution_accuracy_EX/mean": 0.7578125, + "rewards/execution_accuracy_EX/std": 0.4292463958263397, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9892630577087402, + "sampling/importance_sampling_ratio/min": 0.008661825209856033, + "sampling/sampling_logp_difference/max": 4.7488298416137695, + "sampling/sampling_logp_difference/mean": 0.13272030651569366, + "step": 110 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 298.20703125, + "completions/mean_terminated_length": 298.20703125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.18415713869035244, + "epoch": 0.19646017699115045, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4609769686434164, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 53829322.0, + "reward": 0.502734363079071, + "reward_std": 0.47540730237960815, + "rewards/execution_accuracy_EX/mean": 0.4765625, + "rewards/execution_accuracy_EX/std": 0.5004287362098694, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9882371425628662, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.1410694718360901, + "step": 111 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 485.0, + "completions/max_terminated_length": 485.0, + "completions/mean_length": 261.19921875, + "completions/mean_terminated_length": 261.19921875, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.16982518509030342, + "epoch": 0.19823008849557522, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3012565908690922, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 54329517.0, + "reward": 0.5992187261581421, + "reward_std": 0.47008487582206726, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9867724776268005, + "sampling/importance_sampling_ratio/min": 0.009282294660806656, + "sampling/sampling_logp_difference/max": 4.6796464920043945, + "sampling/sampling_logp_difference/mean": 0.13735702633857727, + "step": 112 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 241.89453125, + "completions/mean_terminated_length": 241.89453125, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.1724399346858263, + "epoch": 0.2, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3421487665342455, + "learning_rate": 1e-06, + "loss": 0.01, + "num_tokens": 54818786.0, + "reward": 0.6957030892372131, + "reward_std": 0.44413506984710693, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9555773735046387, + "sampling/importance_sampling_ratio/mean": 0.988462507724762, + "sampling/importance_sampling_ratio/min": 0.014339153654873371, + "sampling/sampling_logp_difference/max": 4.2447614669799805, + "sampling/sampling_logp_difference/mean": 0.13636521995067596, + "step": 113 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 458.0, + "completions/max_terminated_length": 458.0, + "completions/mean_length": 259.01171875, + "completions/mean_terminated_length": 259.01171875, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.1605844870209694, + "epoch": 0.20176991150442478, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2657140773286703, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 55275397.0, + "reward": 0.48046875, + "reward_std": 0.4738343358039856, + "rewards/execution_accuracy_EX/mean": 0.453125, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896270036697388, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.12312790751457214, + "step": 114 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 478.0, + "completions/max_terminated_length": 478.0, + "completions/mean_length": 271.6015625, + "completions/mean_terminated_length": 271.6015625, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.17996527440845966, + "epoch": 0.20353982300884957, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3559506881222393, + "learning_rate": 1e-06, + "loss": -0.0155, + "num_tokens": 55794847.0, + "reward": 0.6029297113418579, + "reward_std": 0.46948158740997314, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9921287894248962, + "sampling/importance_sampling_ratio/min": 0.01839187555015087, + "sampling/sampling_logp_difference/max": 3.9958462715148926, + "sampling/sampling_logp_difference/mean": 0.1332182139158249, + "step": 115 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 486.0, + "completions/max_terminated_length": 486.0, + "completions/mean_length": 259.36328125, + "completions/mean_terminated_length": 259.36328125, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.17369364015758038, + "epoch": 0.20530973451327433, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32450382246623954, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 56206572.0, + "reward": 0.5843750238418579, + "reward_std": 0.47219759225845337, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9917175769805908, + "sampling/importance_sampling_ratio/mean": 0.9929153919219971, + "sampling/importance_sampling_ratio/min": 0.014437063597142696, + "sampling/sampling_logp_difference/max": 4.237956523895264, + "sampling/sampling_logp_difference/mean": 0.1287001669406891, + "step": 116 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.0, + "completions/max_terminated_length": 617.0, + "completions/mean_length": 262.88671875, + "completions/mean_terminated_length": 262.88671875, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.17147137969732285, + "epoch": 0.20707964601769913, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2168605423484229, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 56696175.0, + "reward": 0.6734374761581421, + "reward_std": 0.45209482312202454, + "rewards/execution_accuracy_EX/mean": 0.65625, + "rewards/execution_accuracy_EX/std": 0.47588926553726196, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896504282951355, + "sampling/importance_sampling_ratio/min": 0.011144374497234821, + "sampling/sampling_logp_difference/max": 4.496820449829102, + "sampling/sampling_logp_difference/mean": 0.1343401074409485, + "step": 117 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 500.0, + "completions/max_terminated_length": 500.0, + "completions/mean_length": 269.41796875, + "completions/mean_terminated_length": 269.41796875, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.19181371852755547, + "epoch": 0.2088495575221239, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2932413456666399, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 57192138.0, + "reward": 0.5101562738418579, + "reward_std": 0.4756980240345001, + "rewards/execution_accuracy_EX/mean": 0.484375, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9915359020233154, + "sampling/importance_sampling_ratio/min": 0.018446944653987885, + "sampling/sampling_logp_difference/max": 3.992856502532959, + "sampling/sampling_logp_difference/mean": 0.14277532696723938, + "step": 118 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 488.0, + "completions/max_terminated_length": 488.0, + "completions/mean_length": 256.64453125, + "completions/mean_terminated_length": 256.64453125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.1774155180901289, + "epoch": 0.21061946902654868, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12749049864679354, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 57596959.0, + "reward": 0.576953113079071, + "reward_std": 0.47307512164115906, + "rewards/execution_accuracy_EX/mean": 0.5546875, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9892342686653137, + "sampling/importance_sampling_ratio/min": 0.008668428286910057, + "sampling/sampling_logp_difference/max": 4.748067855834961, + "sampling/sampling_logp_difference/mean": 0.13478384912014008, + "step": 119 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 267.2890625, + "completions/mean_terminated_length": 267.2890625, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.18676412478089333, + "epoch": 0.21238938053097345, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36605313572657927, + "learning_rate": 1e-06, + "loss": -0.0062, + "num_tokens": 58133417.0, + "reward": 0.6251952648162842, + "reward_std": 0.4652217924594879, + "rewards/execution_accuracy_EX/mean": 0.60546875, + "rewards/execution_accuracy_EX/std": 0.48970720171928406, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9920075535774231, + "sampling/importance_sampling_ratio/min": 0.014339085668325424, + "sampling/sampling_logp_difference/max": 4.2447662353515625, + "sampling/sampling_logp_difference/mean": 0.13781046867370605, + "step": 120 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 582.0, + "completions/max_terminated_length": 582.0, + "completions/mean_length": 255.11328125, + "completions/mean_terminated_length": 255.11328125, + "completions/min_length": 101.0, + "completions/min_terminated_length": 101.0, + "entropy": 0.1788367796689272, + "epoch": 0.21415929203539824, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3330431759383045, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 58559350.0, + "reward": 0.6363281011581421, + "reward_std": 0.46267399191856384, + "rewards/execution_accuracy_EX/mean": 0.6171875, + "rewards/execution_accuracy_EX/std": 0.48702529072761536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9892991781234741, + "sampling/importance_sampling_ratio/min": 0.010825454257428646, + "sampling/sampling_logp_difference/max": 4.52585506439209, + "sampling/sampling_logp_difference/mean": 0.13784188032150269, + "step": 121 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 240.33984375, + "completions/mean_terminated_length": 240.33984375, + "completions/min_length": 98.0, + "completions/min_terminated_length": 98.0, + "entropy": 0.17649833858013153, + "epoch": 0.215929203539823, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15017924825176865, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 59040813.0, + "reward": 0.6474609375, + "reward_std": 0.45984160900115967, + "rewards/execution_accuracy_EX/mean": 0.62890625, + "rewards/execution_accuracy_EX/std": 0.48404383659362793, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9871264696121216, + "sampling/importance_sampling_ratio/min": 0.008697095327079296, + "sampling/sampling_logp_difference/max": 4.7447662353515625, + "sampling/sampling_logp_difference/mean": 0.13799268007278442, + "step": 122 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 271.26953125, + "completions/mean_terminated_length": 271.26953125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.18275177665054798, + "epoch": 0.2176991150442478, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3987088809267594, + "learning_rate": 1e-06, + "loss": -0.0059, + "num_tokens": 59434258.0, + "reward": 0.5658203363418579, + "reward_std": 0.4741697609424591, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9922703504562378, + "sampling/importance_sampling_ratio/min": 0.011370888911187649, + "sampling/sampling_logp_difference/max": 4.476698875427246, + "sampling/sampling_logp_difference/mean": 0.132550448179245, + "step": 123 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 237.4140625, + "completions/mean_terminated_length": 237.4140625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.17214470729231834, + "epoch": 0.21946902654867256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38100763019028927, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 60053116.0, + "reward": 0.47675779461860657, + "reward_std": 0.47346949577331543, + "rewards/execution_accuracy_EX/mean": 0.44921875, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9862518310546875, + "sampling/importance_sampling_ratio/min": 0.018390560522675514, + "sampling/sampling_logp_difference/max": 3.995917797088623, + "sampling/sampling_logp_difference/mean": 0.13303159177303314, + "step": 124 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 248.16796875, + "completions/mean_terminated_length": 248.16796875, + "completions/min_length": 164.0, + "completions/min_terminated_length": 164.0, + "entropy": 0.17250299267470837, + "epoch": 0.22123893805309736, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28478565039191006, + "learning_rate": 1e-06, + "loss": 0.0141, + "num_tokens": 60342135.0, + "reward": 0.7142578363418579, + "reward_std": 0.4365212619304657, + "rewards/execution_accuracy_EX/mean": 0.69921875, + "rewards/execution_accuracy_EX/std": 0.45949608087539673, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9911350607872009, + "sampling/importance_sampling_ratio/min": 0.023641232401132584, + "sampling/sampling_logp_difference/max": 3.744762897491455, + "sampling/sampling_logp_difference/mean": 0.13086634874343872, + "step": 125 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 263.34765625, + "completions/mean_terminated_length": 263.34765625, + "completions/min_length": 103.0, + "completions/min_terminated_length": 103.0, + "entropy": 0.17334261536598206, + "epoch": 0.22300884955752212, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3397032515251031, + "learning_rate": 1e-06, + "loss": 0.0089, + "num_tokens": 60843104.0, + "reward": 0.5435546636581421, + "reward_std": 0.4755672216415405, + "rewards/execution_accuracy_EX/mean": 0.51953125, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.969807744026184, + "sampling/importance_sampling_ratio/mean": 0.9911377429962158, + "sampling/importance_sampling_ratio/min": 0.01113718282431364, + "sampling/sampling_logp_difference/max": 4.497466087341309, + "sampling/sampling_logp_difference/mean": 0.13102008402347565, + "step": 126 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.0, + "completions/max_terminated_length": 623.0, + "completions/mean_length": 233.08203125, + "completions/mean_terminated_length": 233.08203125, + "completions/min_length": 109.0, + "completions/min_terminated_length": 109.0, + "entropy": 0.1644600834697485, + "epoch": 0.2247787610619469, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19006932124665715, + "learning_rate": 1e-06, + "loss": -0.0027, + "num_tokens": 61219861.0, + "reward": 0.6808593273162842, + "reward_std": 0.44958022236824036, + "rewards/execution_accuracy_EX/mean": 0.6640625, + "rewards/execution_accuracy_EX/std": 0.4732423722743988, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9873524308204651, + "sampling/importance_sampling_ratio/min": 0.014340453781187534, + "sampling/sampling_logp_difference/max": 4.244670867919922, + "sampling/sampling_logp_difference/mean": 0.1292169690132141, + "step": 127 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 243.5234375, + "completions/mean_terminated_length": 243.5234375, + "completions/min_length": 90.0, + "completions/min_terminated_length": 90.0, + "entropy": 0.1923501305282116, + "epoch": 0.22654867256637168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3107724202625511, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 61738171.0, + "reward": 0.5064452886581421, + "reward_std": 0.47556719183921814, + "rewards/execution_accuracy_EX/mean": 0.48046875, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9904196262359619, + "sampling/importance_sampling_ratio/min": 0.011125722900032997, + "sampling/sampling_logp_difference/max": 4.498495578765869, + "sampling/sampling_logp_difference/mean": 0.14383336901664734, + "step": 128 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 445.0, + "completions/max_terminated_length": 445.0, + "completions/mean_length": 253.171875, + "completions/mean_terminated_length": 253.171875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.17130203545093536, + "epoch": 0.22831858407079647, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24078201406903801, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 62106407.0, + "reward": 0.5658203363418579, + "reward_std": 0.4741697609424591, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9352301359176636, + "sampling/importance_sampling_ratio/mean": 0.9913920164108276, + "sampling/importance_sampling_ratio/min": 0.011312858201563358, + "sampling/sampling_logp_difference/max": 4.481815338134766, + "sampling/sampling_logp_difference/mean": 0.13263952732086182, + "step": 129 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 269.734375, + "completions/mean_terminated_length": 269.734375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.1678872276097536, + "epoch": 0.23008849557522124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42323639693398374, + "learning_rate": 1e-06, + "loss": 0.0095, + "num_tokens": 62519971.0, + "reward": 0.606640636920929, + "reward_std": 0.46884801983833313, + "rewards/execution_accuracy_EX/mean": 0.5859375, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896361827850342, + "sampling/importance_sampling_ratio/min": 0.0031865073833614588, + "sampling/sampling_logp_difference/max": 5.7488298416137695, + "sampling/sampling_logp_difference/mean": 0.12853194773197174, + "step": 130 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 444.0, + "completions/max_terminated_length": 444.0, + "completions/mean_length": 254.296875, + "completions/mean_terminated_length": 254.296875, + "completions/min_length": 119.0, + "completions/min_terminated_length": 119.0, + "entropy": 0.174601873382926, + "epoch": 0.23185840707964603, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4026612194940439, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 63299167.0, + "reward": 0.6214843988418579, + "reward_std": 0.46600866317749023, + "rewards/execution_accuracy_EX/mean": 0.6015625, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9881101250648499, + "sampling/importance_sampling_ratio/min": 0.008679230697453022, + "sampling/sampling_logp_difference/max": 4.746822357177734, + "sampling/sampling_logp_difference/mean": 0.13689565658569336, + "step": 131 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 525.0, + "completions/max_terminated_length": 525.0, + "completions/mean_length": 243.44921875, + "completions/mean_terminated_length": 243.44921875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.17167786695063114, + "epoch": 0.2336283185840708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3223429311177653, + "learning_rate": 1e-06, + "loss": 0.0173, + "num_tokens": 63750914.0, + "reward": 0.6957030892372131, + "reward_std": 0.44413506984710693, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9898079633712769, + "sampling/importance_sampling_ratio/min": 0.016445204615592957, + "sampling/sampling_logp_difference/max": 4.107721328735352, + "sampling/sampling_logp_difference/mean": 0.13387587666511536, + "step": 132 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 256.203125, + "completions/mean_terminated_length": 256.203125, + "completions/min_length": 111.0, + "completions/min_terminated_length": 111.0, + "entropy": 0.16661197133362293, + "epoch": 0.23539823008849559, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18602760671024346, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 64151302.0, + "reward": 0.6363281011581421, + "reward_std": 0.46267402172088623, + "rewards/execution_accuracy_EX/mean": 0.6171875, + "rewards/execution_accuracy_EX/std": 0.48702529072761536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9915856122970581, + "sampling/importance_sampling_ratio/min": 0.008679230697453022, + "sampling/sampling_logp_difference/max": 4.746822357177734, + "sampling/sampling_logp_difference/mean": 0.1272910237312317, + "step": 133 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 276.57421875, + "completions/mean_terminated_length": 276.57421875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.16904133558273315, + "epoch": 0.23716814159292035, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07880297011259281, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 64605833.0, + "reward": 0.6474609375, + "reward_std": 0.45984160900115967, + "rewards/execution_accuracy_EX/mean": 0.62890625, + "rewards/execution_accuracy_EX/std": 0.48404383659362793, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9918218851089478, + "sampling/importance_sampling_ratio/min": 0.01118436548858881, + "sampling/sampling_logp_difference/max": 4.49323844909668, + "sampling/sampling_logp_difference/mean": 0.13078564405441284, + "step": 134 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 272.41796875, + "completions/mean_terminated_length": 272.41796875, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.16213511303067207, + "epoch": 0.23893805309734514, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2582515824201466, + "learning_rate": 1e-06, + "loss": -0.0068, + "num_tokens": 64966548.0, + "reward": 0.651171863079071, + "reward_std": 0.45883336663246155, + "rewards/execution_accuracy_EX/mean": 0.6328125, + "rewards/execution_accuracy_EX/std": 0.48298248648643494, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9916924834251404, + "sampling/importance_sampling_ratio/min": 0.008679230697453022, + "sampling/sampling_logp_difference/max": 4.746822357177734, + "sampling/sampling_logp_difference/mean": 0.12518557906150818, + "step": 135 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 265.6015625, + "completions/mean_terminated_length": 265.6015625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.17535862512886524, + "epoch": 0.2407079646017699, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3891175004639357, + "learning_rate": 1e-06, + "loss": -0.0067, + "num_tokens": 65456558.0, + "reward": 0.40625, + "reward_std": 0.4608176648616791, + "rewards/execution_accuracy_EX/mean": 0.375, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9876947999000549, + "sampling/importance_sampling_ratio/min": 0.006748078390955925, + "sampling/sampling_logp_difference/max": 4.998497486114502, + "sampling/sampling_logp_difference/mean": 0.136422261595726, + "step": 136 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 468.0, + "completions/max_terminated_length": 468.0, + "completions/mean_length": 277.9375, + "completions/mean_terminated_length": 277.9375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.1768400277942419, + "epoch": 0.2424778761061947, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29718461934694185, + "learning_rate": 1e-06, + "loss": 0.0062, + "num_tokens": 65937470.0, + "reward": 0.550976574420929, + "reward_std": 0.4752182364463806, + "rewards/execution_accuracy_EX/mean": 0.52734375, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9903590083122253, + "sampling/importance_sampling_ratio/min": 0.004103484563529491, + "sampling/sampling_logp_difference/max": 5.4959187507629395, + "sampling/sampling_logp_difference/mean": 0.13497892022132874, + "step": 137 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 609.0, + "completions/max_terminated_length": 609.0, + "completions/mean_length": 283.0234375, + "completions/mean_terminated_length": 283.0234375, + "completions/min_length": 106.0, + "completions/min_terminated_length": 106.0, + "entropy": 0.18021181970834732, + "epoch": 0.24424778761061947, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3753146574870807, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 66447268.0, + "reward": 0.5658202767372131, + "reward_std": 0.4741697609424591, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9930406808853149, + "sampling/importance_sampling_ratio/min": 0.0183610487729311, + "sampling/sampling_logp_difference/max": 3.997523784637451, + "sampling/sampling_logp_difference/mean": 0.13213272392749786, + "step": 138 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 419.0, + "completions/max_terminated_length": 419.0, + "completions/mean_length": 273.5859375, + "completions/mean_terminated_length": 273.5859375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.17577782832086086, + "epoch": 0.24601769911504426, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32820941355287014, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 66945450.0, + "reward": 0.576953113079071, + "reward_std": 0.47307515144348145, + "rewards/execution_accuracy_EX/mean": 0.5546875, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9882457256317139, + "sampling/importance_sampling_ratio/min": 0.01118464209139347, + "sampling/sampling_logp_difference/max": 4.493213653564453, + "sampling/sampling_logp_difference/mean": 0.13495786488056183, + "step": 139 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 259.58203125, + "completions/mean_terminated_length": 259.58203125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.1591511070728302, + "epoch": 0.24778761061946902, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36036338433285736, + "learning_rate": 1e-06, + "loss": 0.0078, + "num_tokens": 67327455.0, + "reward": 0.7513672113418579, + "reward_std": 0.4184097945690155, + "rewards/execution_accuracy_EX/mean": 0.73828125, + "rewards/execution_accuracy_EX/std": 0.4404313564300537, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9880623817443848, + "sampling/importance_sampling_ratio/min": 0.0031929106917232275, + "sampling/sampling_logp_difference/max": 5.746822357177734, + "sampling/sampling_logp_difference/mean": 0.12579825520515442, + "step": 140 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 437.0, + "completions/max_terminated_length": 437.0, + "completions/mean_length": 256.45703125, + "completions/mean_terminated_length": 256.45703125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.15968062169849873, + "epoch": 0.24955752212389382, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06441388540722605, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 67878948.0, + "reward": 0.7587890625, + "reward_std": 0.4142923355102539, + "rewards/execution_accuracy_EX/mean": 0.74609375, + "rewards/execution_accuracy_EX/std": 0.4360972046852112, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9889984130859375, + "sampling/importance_sampling_ratio/min": 0.0148173151537776, + "sampling/sampling_logp_difference/max": 4.211958885192871, + "sampling/sampling_logp_difference/mean": 0.12597008049488068, + "step": 141 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 276.77734375, + "completions/mean_terminated_length": 276.77734375, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.1671825349330902, + "epoch": 0.2513274336283186, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34538557229592415, + "learning_rate": 1e-06, + "loss": 0.0205, + "num_tokens": 68440971.0, + "reward": 0.7105468511581421, + "reward_std": 0.43811774253845215, + "rewards/execution_accuracy_EX/mean": 0.6953125, + "rewards/execution_accuracy_EX/std": 0.4611765742301941, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.988913893699646, + "sampling/importance_sampling_ratio/min": 0.011154407635331154, + "sampling/sampling_logp_difference/max": 4.495920658111572, + "sampling/sampling_logp_difference/mean": 0.12981252372264862, + "step": 142 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 236.92578125, + "completions/mean_terminated_length": 236.92578125, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.16795746237039566, + "epoch": 0.25309734513274335, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2575532753923275, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 68819368.0, + "reward": 0.6177734136581421, + "reward_std": 0.46676456928253174, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9547626972198486, + "sampling/importance_sampling_ratio/mean": 0.98448246717453, + "sampling/importance_sampling_ratio/min": 0.014282699674367905, + "sampling/sampling_logp_difference/max": 4.248706340789795, + "sampling/sampling_logp_difference/mean": 0.13988810777664185, + "step": 143 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 453.0, + "completions/max_terminated_length": 453.0, + "completions/mean_length": 233.71875, + "completions/mean_terminated_length": 233.71875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.1615072637796402, + "epoch": 0.25486725663716814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2598507678217811, + "learning_rate": 1e-06, + "loss": 0.0063, + "num_tokens": 69187008.0, + "reward": 0.799609363079071, + "reward_std": 0.38833457231521606, + "rewards/execution_accuracy_EX/mean": 0.7890625, + "rewards/execution_accuracy_EX/std": 0.4087733030319214, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.98479163646698, + "sampling/importance_sampling_ratio/min": 0.018343178555369377, + "sampling/sampling_logp_difference/max": 3.998497486114502, + "sampling/sampling_logp_difference/mean": 0.13842977583408356, + "step": 144 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 892.0, + "completions/max_terminated_length": 892.0, + "completions/mean_length": 269.00390625, + "completions/mean_terminated_length": 269.00390625, + "completions/min_length": 110.0, + "completions/min_terminated_length": 110.0, + "entropy": 0.1655492577701807, + "epoch": 0.25663716814159293, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.328328829482272, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 69518417.0, + "reward": 0.743945300579071, + "reward_std": 0.42235618829727173, + "rewards/execution_accuracy_EX/mean": 0.73046875, + "rewards/execution_accuracy_EX/std": 0.44458550214767456, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9880072474479675, + "sampling/importance_sampling_ratio/min": 0.01430963259190321, + "sampling/sampling_logp_difference/max": 4.246822357177734, + "sampling/sampling_logp_difference/mean": 0.13215236365795135, + "step": 145 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 498.0, + "completions/max_terminated_length": 498.0, + "completions/mean_length": 245.9609375, + "completions/mean_terminated_length": 245.9609375, + "completions/min_length": 107.0, + "completions/min_terminated_length": 107.0, + "entropy": 0.15269946679472923, + "epoch": 0.2584070796460177, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32541737697058953, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 70031431.0, + "reward": 0.8330078125, + "reward_std": 0.36231058835983276, + "rewards/execution_accuracy_EX/mean": 0.82421875, + "rewards/execution_accuracy_EX/std": 0.3813795745372772, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9864132404327393, + "sampling/importance_sampling_ratio/min": 0.023737918585538864, + "sampling/sampling_logp_difference/max": 3.7406816482543945, + "sampling/sampling_logp_difference/mean": 0.1274140328168869, + "step": 146 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 425.0, + "completions/max_terminated_length": 425.0, + "completions/mean_length": 273.55078125, + "completions/mean_terminated_length": 273.55078125, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.16360680013895035, + "epoch": 0.26017699115044246, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2289894642119028, + "learning_rate": 1e-06, + "loss": -0.0083, + "num_tokens": 70396868.0, + "reward": 0.666015625, + "reward_std": 0.4544737637042999, + "rewards/execution_accuracy_EX/mean": 0.6484375, + "rewards/execution_accuracy_EX/std": 0.47839346528053284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9864295721054077, + "sampling/importance_sampling_ratio/min": 0.00015113348490558565, + "sampling/sampling_logp_difference/max": 8.797347068786621, + "sampling/sampling_logp_difference/mean": 0.13306456804275513, + "step": 147 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 287.44140625, + "completions/mean_terminated_length": 287.44140625, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.17404436320066452, + "epoch": 0.26194690265486725, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3945935746219545, + "learning_rate": 1e-06, + "loss": -0.0087, + "num_tokens": 70870485.0, + "reward": 0.6214843392372131, + "reward_std": 0.46600863337516785, + "rewards/execution_accuracy_EX/mean": 0.6015625, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9925150871276855, + "sampling/importance_sampling_ratio/min": 0.016433831304311752, + "sampling/sampling_logp_difference/max": 4.108413219451904, + "sampling/sampling_logp_difference/mean": 0.1313704252243042, + "step": 148 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 264.265625, + "completions/mean_terminated_length": 264.265625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.1564411912113428, + "epoch": 0.26371681415929205, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20620659707112282, + "learning_rate": 1e-06, + "loss": -0.0049, + "num_tokens": 71283801.0, + "reward": 0.595507800579071, + "reward_std": 0.47065800428390503, + "rewards/execution_accuracy_EX/mean": 0.57421875, + "rewards/execution_accuracy_EX/std": 0.49542948603630066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9856451749801636, + "sampling/importance_sampling_ratio/min": 0.0024849013425409794, + "sampling/sampling_logp_difference/max": 5.997522354125977, + "sampling/sampling_logp_difference/mean": 0.13136112689971924, + "step": 149 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 267.58984375, + "completions/mean_terminated_length": 267.58984375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.15700889751315117, + "epoch": 0.26548672566371684, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13476004624860974, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 71625792.0, + "reward": 0.6363281011581421, + "reward_std": 0.46267399191856384, + "rewards/execution_accuracy_EX/mean": 0.6171875, + "rewards/execution_accuracy_EX/std": 0.48702529072761536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9884984493255615, + "sampling/importance_sampling_ratio/mean": 0.9865537881851196, + "sampling/importance_sampling_ratio/min": 0.01430963259190321, + "sampling/sampling_logp_difference/max": 4.246822357177734, + "sampling/sampling_logp_difference/mean": 0.13188934326171875, + "step": 150 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 433.0, + "completions/max_terminated_length": 433.0, + "completions/mean_length": 244.03515625, + "completions/mean_terminated_length": 244.03515625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.16816776245832443, + "epoch": 0.2672566371681416, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2702862154438436, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 72173017.0, + "reward": 0.6697266101837158, + "reward_std": 0.4533010721206665, + "rewards/execution_accuracy_EX/mean": 0.65234375, + "rewards/execution_accuracy_EX/std": 0.4771590530872345, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9873220920562744, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.13630010187625885, + "step": 151 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 489.0, + "completions/max_terminated_length": 489.0, + "completions/mean_length": 256.57421875, + "completions/mean_terminated_length": 256.57421875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.16193786077201366, + "epoch": 0.26902654867256637, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3475885697485575, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 72721228.0, + "reward": 0.62890625, + "reward_std": 0.46440389752388, + "rewards/execution_accuracy_EX/mean": 0.609375, + "rewards/execution_accuracy_EX/std": 0.48884621262550354, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9524896144866943, + "sampling/importance_sampling_ratio/mean": 0.9891369938850403, + "sampling/importance_sampling_ratio/min": 0.011183853261172771, + "sampling/sampling_logp_difference/max": 4.493284225463867, + "sampling/sampling_logp_difference/mean": 0.1292227804660797, + "step": 152 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 537.0, + "completions/max_terminated_length": 537.0, + "completions/mean_length": 282.72265625, + "completions/mean_terminated_length": 282.72265625, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.16410168260335922, + "epoch": 0.27079646017699116, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2165331236348191, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 73422693.0, + "reward": 0.6066405773162842, + "reward_std": 0.46884801983833313, + "rewards/execution_accuracy_EX/mean": 0.5859375, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9891854524612427, + "sampling/importance_sampling_ratio/min": 0.008661878295242786, + "sampling/sampling_logp_difference/max": 4.748823642730713, + "sampling/sampling_logp_difference/mean": 0.13202348351478577, + "step": 153 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 311.43359375, + "completions/mean_terminated_length": 311.43359375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.16593420505523682, + "epoch": 0.27256637168141595, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3299510582742183, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 74046020.0, + "reward": 0.680859386920929, + "reward_std": 0.44958025217056274, + "rewards/execution_accuracy_EX/mean": 0.6640625, + "rewards/execution_accuracy_EX/std": 0.4732423722743988, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9876483678817749, + "sampling/importance_sampling_ratio/min": 0.018390489742159843, + "sampling/sampling_logp_difference/max": 3.9959216117858887, + "sampling/sampling_logp_difference/mean": 0.1317213922739029, + "step": 154 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 569.0, + "completions/max_terminated_length": 569.0, + "completions/mean_length": 268.265625, + "completions/mean_terminated_length": 268.265625, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.17522499524056911, + "epoch": 0.2743362831858407, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12994389119765543, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 74565416.0, + "reward": 0.5138671398162842, + "reward_std": 0.4757997393608093, + "rewards/execution_accuracy_EX/mean": 0.48828125, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9848809242248535, + "sampling/importance_sampling_ratio/min": 0.0183610487729311, + "sampling/sampling_logp_difference/max": 3.997523784637451, + "sampling/sampling_logp_difference/mean": 0.1431981474161148, + "step": 155 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 287.70703125, + "completions/mean_terminated_length": 287.70703125, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.177983570843935, + "epoch": 0.2761061946902655, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2242206823726338, + "learning_rate": 1e-06, + "loss": -0.0046, + "num_tokens": 75214829.0, + "reward": 0.6845703125, + "reward_std": 0.44827142357826233, + "rewards/execution_accuracy_EX/mean": 0.66796875, + "rewards/execution_accuracy_EX/std": 0.4718646705150604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9876867532730103, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.1371629387140274, + "step": 156 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 532.0, + "completions/max_terminated_length": 532.0, + "completions/mean_length": 298.37109375, + "completions/mean_terminated_length": 298.37109375, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.18536010943353176, + "epoch": 0.2778761061946903, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3347005196171514, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 75730348.0, + "reward": 0.5287109613418579, + "reward_std": 0.47591593861579895, + "rewards/execution_accuracy_EX/mean": 0.50390625, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9898300170898438, + "sampling/importance_sampling_ratio/min": 0.011232499033212662, + "sampling/sampling_logp_difference/max": 4.488944053649902, + "sampling/sampling_logp_difference/mean": 0.141252338886261, + "step": 157 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 515.0, + "completions/max_terminated_length": 515.0, + "completions/mean_length": 274.41015625, + "completions/mean_terminated_length": 274.41015625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.1710018888115883, + "epoch": 0.27964601769911507, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2685527765456495, + "learning_rate": 1e-06, + "loss": -0.0047, + "num_tokens": 76249333.0, + "reward": 0.703125, + "reward_std": 0.4411993622779846, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.989202618598938, + "sampling/importance_sampling_ratio/min": 0.0067833466455340385, + "sampling/sampling_logp_difference/max": 4.993284702301025, + "sampling/sampling_logp_difference/mean": 0.13519498705863953, + "step": 158 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 619.0, + "completions/max_terminated_length": 619.0, + "completions/mean_length": 282.72265625, + "completions/mean_terminated_length": 282.72265625, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.16912869177758694, + "epoch": 0.2814159292035398, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27739637030277553, + "learning_rate": 1e-06, + "loss": 0.0047, + "num_tokens": 76804670.0, + "reward": 0.7921874523162842, + "reward_std": 0.393498033285141, + "rewards/execution_accuracy_EX/mean": 0.78125, + "rewards/execution_accuracy_EX/std": 0.41420844197273254, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9852451682090759, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.13660919666290283, + "step": 159 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 735.0, + "completions/max_terminated_length": 735.0, + "completions/mean_length": 292.8046875, + "completions/mean_terminated_length": 292.8046875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.1675608493387699, + "epoch": 0.2831858407079646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13822705311395195, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 77297004.0, + "reward": 0.517578125, + "reward_std": 0.47587236762046814, + "rewards/execution_accuracy_EX/mean": 0.4921875, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9882014989852905, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.13478179275989532, + "step": 160 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 823.0, + "completions/max_terminated_length": 823.0, + "completions/mean_length": 321.77734375, + "completions/mean_terminated_length": 321.77734375, + "completions/min_length": 196.0, + "completions/min_terminated_length": 196.0, + "entropy": 0.18110738322138786, + "epoch": 0.2849557522123894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30001543636542144, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 77913779.0, + "reward": 0.6326172351837158, + "reward_std": 0.46355465054512024, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9877785444259644, + "sampling/importance_sampling_ratio/min": 0.011312620714306831, + "sampling/sampling_logp_difference/max": 4.481836318969727, + "sampling/sampling_logp_difference/mean": 0.13775095343589783, + "step": 161 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 881.0, + "completions/max_terminated_length": 881.0, + "completions/mean_length": 319.28515625, + "completions/mean_terminated_length": 319.28515625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.17902282439172268, + "epoch": 0.2867256637168142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28813047849773493, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 78299196.0, + "reward": 0.5435546636581421, + "reward_std": 0.4755672216415405, + "rewards/execution_accuracy_EX/mean": 0.51953125, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9866389036178589, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.14283457398414612, + "step": 162 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 675.0, + "completions/max_terminated_length": 675.0, + "completions/mean_length": 317.91796875, + "completions/mean_terminated_length": 317.91796875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.19017246179282665, + "epoch": 0.2884955752212389, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3032423743448608, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 78834887.0, + "reward": 0.595507800579071, + "reward_std": 0.4706580340862274, + "rewards/execution_accuracy_EX/mean": 0.57421875, + "rewards/execution_accuracy_EX/std": 0.49542948603630066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9934765696525574, + "sampling/importance_sampling_ratio/min": 0.017504168674349785, + "sampling/sampling_logp_difference/max": 4.045316219329834, + "sampling/sampling_logp_difference/mean": 0.13656078279018402, + "step": 163 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 802.0, + "completions/max_terminated_length": 802.0, + "completions/mean_length": 298.24609375, + "completions/mean_terminated_length": 298.24609375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.17399132438004017, + "epoch": 0.2902654867256637, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2715622490557467, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 79380054.0, + "reward": 0.6585937142372131, + "reward_std": 0.4567192792892456, + "rewards/execution_accuracy_EX/mean": 0.640625, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9891414642333984, + "sampling/importance_sampling_ratio/min": 0.008668584749102592, + "sampling/sampling_logp_difference/max": 4.748049736022949, + "sampling/sampling_logp_difference/mean": 0.13266518712043762, + "step": 164 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 472.0, + "completions/max_terminated_length": 472.0, + "completions/mean_length": 255.90625, + "completions/mean_terminated_length": 255.90625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.16244931519031525, + "epoch": 0.2920353982300885, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14016964939870763, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 79895086.0, + "reward": 0.7699218988418579, + "reward_std": 0.4077840745449066, + "rewards/execution_accuracy_EX/mean": 0.7578125, + "rewards/execution_accuracy_EX/std": 0.4292463958263397, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9870266914367676, + "sampling/importance_sampling_ratio/min": 0.011138302274048328, + "sampling/sampling_logp_difference/max": 4.497365474700928, + "sampling/sampling_logp_difference/mean": 0.12949419021606445, + "step": 165 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 673.0, + "completions/max_terminated_length": 673.0, + "completions/mean_length": 285.9609375, + "completions/mean_terminated_length": 285.9609375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.19027838110923767, + "epoch": 0.2938053097345133, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30834547888307096, + "learning_rate": 1e-06, + "loss": 0.0218, + "num_tokens": 80428116.0, + "reward": 0.699414074420929, + "reward_std": 0.4426852762699127, + "rewards/execution_accuracy_EX/mean": 0.68359375, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9892711639404297, + "sampling/importance_sampling_ratio/min": 0.011172729544341564, + "sampling/sampling_logp_difference/max": 4.494279384613037, + "sampling/sampling_logp_difference/mean": 0.14073659479618073, + "step": 166 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 309.46875, + "completions/mean_terminated_length": 309.46875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.20458186231553555, + "epoch": 0.29557522123893804, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27987359383856286, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 80961324.0, + "reward": 0.4136718809604645, + "reward_std": 0.46267399191856384, + "rewards/execution_accuracy_EX/mean": 0.3828125, + "rewards/execution_accuracy_EX/std": 0.48702529072761536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9867178201675415, + "sampling/importance_sampling_ratio/min": 0.014291773550212383, + "sampling/sampling_logp_difference/max": 4.248071193695068, + "sampling/sampling_logp_difference/mean": 0.15584495663642883, + "step": 167 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 277.1640625, + "completions/mean_terminated_length": 277.1640625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.17713839747011662, + "epoch": 0.2973451327433628, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2985499406165058, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 81407158.0, + "reward": 0.632617175579071, + "reward_std": 0.46355465054512024, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9890908002853394, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.13786716759204865, + "step": 168 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 297.62109375, + "completions/mean_terminated_length": 297.62109375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.17823628522455692, + "epoch": 0.2991150442477876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14719772917799118, + "learning_rate": 1e-06, + "loss": 0.0009, + "num_tokens": 81898245.0, + "reward": 0.4619140625, + "reward_std": 0.4717142879962921, + "rewards/execution_accuracy_EX/mean": 0.43359375, + "rewards/execution_accuracy_EX/std": 0.4965413510799408, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.98805832862854, + "sampling/importance_sampling_ratio/min": 0.008699355646967888, + "sampling/sampling_logp_difference/max": 4.744506359100342, + "sampling/sampling_logp_difference/mean": 0.1376134157180786, + "step": 169 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 290.34375, + "completions/mean_terminated_length": 290.34375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.18065193854272366, + "epoch": 0.3008849557522124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45248351948725557, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 82287421.0, + "reward": 0.7328125238418579, + "reward_std": 0.4279654324054718, + "rewards/execution_accuracy_EX/mean": 0.71875, + "rewards/execution_accuracy_EX/std": 0.45048993825912476, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9880305528640747, + "sampling/importance_sampling_ratio/min": 0.006766550708562136, + "sampling/sampling_logp_difference/max": 4.995763778686523, + "sampling/sampling_logp_difference/mean": 0.13845156133174896, + "step": 170 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 638.0, + "completions/max_terminated_length": 638.0, + "completions/mean_length": 305.90625, + "completions/mean_terminated_length": 305.90625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.19709263555705547, + "epoch": 0.30265486725663715, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3640781475914555, + "learning_rate": 1e-06, + "loss": 0.0195, + "num_tokens": 82663061.0, + "reward": 0.7216796875, + "reward_std": 0.4332149624824524, + "rewards/execution_accuracy_EX/mean": 0.70703125, + "rewards/execution_accuracy_EX/std": 0.45601576566696167, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9894841313362122, + "sampling/importance_sampling_ratio/min": 0.008661825209856033, + "sampling/sampling_logp_difference/max": 4.7488298416137695, + "sampling/sampling_logp_difference/mean": 0.14712056517601013, + "step": 171 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1188.0, + "completions/max_terminated_length": 1188.0, + "completions/mean_length": 307.359375, + "completions/mean_terminated_length": 307.359375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.17272526398301125, + "epoch": 0.30442477876106194, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4270821242863743, + "learning_rate": 1e-06, + "loss": 0.0233, + "num_tokens": 83177265.0, + "reward": 0.8404296636581421, + "reward_std": 0.3558422923088074, + "rewards/execution_accuracy_EX/mean": 0.83203125, + "rewards/execution_accuracy_EX/std": 0.3745708465576172, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9897280335426331, + "sampling/importance_sampling_ratio/min": 0.014291773550212383, + "sampling/sampling_logp_difference/max": 4.248071193695068, + "sampling/sampling_logp_difference/mean": 0.1292266994714737, + "step": 172 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 300.59765625, + "completions/mean_terminated_length": 300.59765625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.18985807336866856, + "epoch": 0.30619469026548674, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2663472458377981, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 83831338.0, + "reward": 0.651171863079071, + "reward_std": 0.45883333683013916, + "rewards/execution_accuracy_EX/mean": 0.6328125, + "rewards/execution_accuracy_EX/std": 0.48298248648643494, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9870967268943787, + "sampling/importance_sampling_ratio/min": 0.023641685023903847, + "sampling/sampling_logp_difference/max": 3.744743824005127, + "sampling/sampling_logp_difference/mean": 0.14291070401668549, + "step": 173 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 307.375, + "completions/mean_terminated_length": 307.375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.18669777922332287, + "epoch": 0.30796460176991153, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20000621607629301, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 84321114.0, + "reward": 0.8033202886581421, + "reward_std": 0.3856732249259949, + "rewards/execution_accuracy_EX/mean": 0.79296875, + "rewards/execution_accuracy_EX/std": 0.40597182512283325, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9896946549415588, + "sampling/importance_sampling_ratio/min": 0.014280935749411583, + "sampling/sampling_logp_difference/max": 4.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.1368214637041092, + "step": 174 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 622.0, + "completions/max_terminated_length": 622.0, + "completions/mean_length": 320.828125, + "completions/mean_terminated_length": 320.828125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.18010174669325352, + "epoch": 0.30973451327433627, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.248299608727396, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 84835598.0, + "reward": 0.6845703125, + "reward_std": 0.44827142357826233, + "rewards/execution_accuracy_EX/mean": 0.66796875, + "rewards/execution_accuracy_EX/std": 0.4718646705150604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.988377571105957, + "sampling/importance_sampling_ratio/min": 0.014339106157422066, + "sampling/sampling_logp_difference/max": 4.244764804840088, + "sampling/sampling_logp_difference/mean": 0.1342880129814148, + "step": 175 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 325.0625, + "completions/mean_terminated_length": 325.0625, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.20411342196166515, + "epoch": 0.31150442477876106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29122051272085764, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 85382494.0, + "reward": 0.591796875, + "reward_std": 0.471201092004776, + "rewards/execution_accuracy_EX/mean": 0.5703125, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9897286891937256, + "sampling/importance_sampling_ratio/min": 0.01123241800814867, + "sampling/sampling_logp_difference/max": 4.488951206207275, + "sampling/sampling_logp_difference/mean": 0.1492016464471817, + "step": 176 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 882.0, + "completions/max_terminated_length": 882.0, + "completions/mean_length": 332.25390625, + "completions/mean_terminated_length": 332.25390625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.19850089587271214, + "epoch": 0.31327433628318585, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42684242218641294, + "learning_rate": 1e-06, + "loss": -0.0164, + "num_tokens": 86069583.0, + "reward": 0.5249999761581421, + "reward_std": 0.47593045234680176, + "rewards/execution_accuracy_EX/mean": 0.5, + "rewards/execution_accuracy_EX/std": 0.5009794235229492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9907925128936768, + "sampling/importance_sampling_ratio/min": 0.00017096343799494207, + "sampling/sampling_logp_difference/max": 8.674060821533203, + "sampling/sampling_logp_difference/mean": 0.14654508233070374, + "step": 177 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 320.79296875, + "completions/mean_terminated_length": 320.79296875, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.19999371655285358, + "epoch": 0.31504424778761064, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14002915813151406, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 86605770.0, + "reward": 0.632617175579071, + "reward_std": 0.46355465054512024, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9907433986663818, + "sampling/importance_sampling_ratio/min": 0.004132173955440521, + "sampling/sampling_logp_difference/max": 5.488951683044434, + "sampling/sampling_logp_difference/mean": 0.14724189043045044, + "step": 178 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 705.0, + "completions/max_terminated_length": 705.0, + "completions/mean_length": 335.09375, + "completions/mean_terminated_length": 335.09375, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.1902732029557228, + "epoch": 0.3168141592920354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21232421123556744, + "learning_rate": 1e-06, + "loss": -0.0085, + "num_tokens": 87193362.0, + "reward": 0.550976574420929, + "reward_std": 0.4752182364463806, + "rewards/execution_accuracy_EX/mean": 0.52734375, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9927315711975098, + "sampling/importance_sampling_ratio/min": 0.0025241784751415253, + "sampling/sampling_logp_difference/max": 5.981839656829834, + "sampling/sampling_logp_difference/mean": 0.13706091046333313, + "step": 179 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 624.0, + "completions/max_terminated_length": 624.0, + "completions/mean_length": 281.734375, + "completions/mean_terminated_length": 281.734375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.1805352121591568, + "epoch": 0.3185840707964602, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2798660271881851, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 87866670.0, + "reward": 0.7142578363418579, + "reward_std": 0.4365212917327881, + "rewards/execution_accuracy_EX/mean": 0.69921875, + "rewards/execution_accuracy_EX/std": 0.45949608087539673, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9869506359100342, + "sampling/importance_sampling_ratio/min": 0.018529091030359268, + "sampling/sampling_logp_difference/max": 3.9884133338928223, + "sampling/sampling_logp_difference/mean": 0.1430865377187729, + "step": 180 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 273.640625, + "completions/mean_terminated_length": 273.640625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.17408888787031174, + "epoch": 0.32035398230088497, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24584557137817806, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 88212738.0, + "reward": 0.550976574420929, + "reward_std": 0.4752182364463806, + "rewards/execution_accuracy_EX/mean": 0.52734375, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.916461706161499, + "sampling/importance_sampling_ratio/mean": 0.9846568703651428, + "sampling/importance_sampling_ratio/min": 0.008661825209856033, + "sampling/sampling_logp_difference/max": 4.7488298416137695, + "sampling/sampling_logp_difference/mean": 0.14325553178787231, + "step": 181 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 779.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 324.453125, + "completions/mean_terminated_length": 324.453125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.18714475817978382, + "epoch": 0.32212389380530976, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25074309060021294, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 88849622.0, + "reward": 0.666015625, + "reward_std": 0.4544737935066223, + "rewards/execution_accuracy_EX/mean": 0.6484375, + "rewards/execution_accuracy_EX/std": 0.47839346528053284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9900903701782227, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.13839155435562134, + "step": 182 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 497.0, + "completions/max_terminated_length": 497.0, + "completions/mean_length": 295.3515625, + "completions/mean_terminated_length": 295.3515625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.17940758354961872, + "epoch": 0.3238938053097345, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3033041423804513, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 89208944.0, + "reward": 0.8070312738418579, + "reward_std": 0.38295724987983704, + "rewards/execution_accuracy_EX/mean": 0.796875, + "rewards/execution_accuracy_EX/std": 0.40311288833618164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9876235723495483, + "sampling/importance_sampling_ratio/min": 0.011136556044220924, + "sampling/sampling_logp_difference/max": 4.497522354125977, + "sampling/sampling_logp_difference/mean": 0.140605166554451, + "step": 183 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 750.0, + "completions/max_terminated_length": 750.0, + "completions/mean_length": 349.61328125, + "completions/mean_terminated_length": 349.61328125, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.18564481288194656, + "epoch": 0.3256637168141593, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13190575739360055, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 89806845.0, + "reward": 0.48417967557907104, + "reward_std": 0.4741697609424591, + "rewards/execution_accuracy_EX/mean": 0.45703125, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9901508092880249, + "sampling/importance_sampling_ratio/min": 0.008705098181962967, + "sampling/sampling_logp_difference/max": 4.743846416473389, + "sampling/sampling_logp_difference/mean": 0.13562259078025818, + "step": 184 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2023.0, + "completions/max_terminated_length": 2023.0, + "completions/mean_length": 327.1015625, + "completions/mean_terminated_length": 327.1015625, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.18473662436008453, + "epoch": 0.3274336283185841, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17540688305886065, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 90277463.0, + "reward": 0.6103515625, + "reward_std": 0.46818408370018005, + "rewards/execution_accuracy_EX/mean": 0.58984375, + "rewards/execution_accuracy_EX/std": 0.49282538890838623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9885948300361633, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.13864898681640625, + "step": 185 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 620.0, + "completions/max_terminated_length": 620.0, + "completions/mean_length": 321.01953125, + "completions/mean_terminated_length": 321.01953125, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.1843369174748659, + "epoch": 0.3292035398230089, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25954323312459404, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 90832780.0, + "reward": 0.5101562738418579, + "reward_std": 0.4756980240345001, + "rewards/execution_accuracy_EX/mean": 0.484375, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9888851642608643, + "sampling/importance_sampling_ratio/min": 0.008679230697453022, + "sampling/sampling_logp_difference/max": 4.746822357177734, + "sampling/sampling_logp_difference/mean": 0.13978828489780426, + "step": 186 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 315.40234375, + "completions/mean_terminated_length": 315.40234375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.18486985377967358, + "epoch": 0.3309734513274336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4176601204881519, + "learning_rate": 1e-06, + "loss": -0.0059, + "num_tokens": 91302851.0, + "reward": 0.5101562738418579, + "reward_std": 0.4756980240345001, + "rewards/execution_accuracy_EX/mean": 0.484375, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9875400066375732, + "sampling/importance_sampling_ratio/min": 0.014315781183540821, + "sampling/sampling_logp_difference/max": 4.246392726898193, + "sampling/sampling_logp_difference/mean": 0.14399956166744232, + "step": 187 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 279.3984375, + "completions/mean_terminated_length": 279.3984375, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.17375775426626205, + "epoch": 0.3327433628318584, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3199652743452319, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 92033257.0, + "reward": 0.7291015386581421, + "reward_std": 0.4297545552253723, + "rewards/execution_accuracy_EX/mean": 0.71484375, + "rewards/execution_accuracy_EX/std": 0.4523732364177704, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9862713813781738, + "sampling/importance_sampling_ratio/min": 0.014322636649012566, + "sampling/sampling_logp_difference/max": 4.245913982391357, + "sampling/sampling_logp_difference/mean": 0.13989365100860596, + "step": 188 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 316.79296875, + "completions/mean_terminated_length": 316.79296875, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.18547409027814865, + "epoch": 0.3345132743362832, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25144295439565795, + "learning_rate": 1e-06, + "loss": 0.0048, + "num_tokens": 92600740.0, + "reward": 0.6214843392372131, + "reward_std": 0.46600863337516785, + "rewards/execution_accuracy_EX/mean": 0.6015625, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.988899290561676, + "sampling/importance_sampling_ratio/min": 0.005253662820905447, + "sampling/sampling_logp_difference/max": 5.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.14020931720733643, + "step": 189 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 318.97265625, + "completions/mean_terminated_length": 318.97265625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.1898004561662674, + "epoch": 0.336283185840708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23477650025521635, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 92979277.0, + "reward": 0.4990234375, + "reward_std": 0.4752182364463806, + "rewards/execution_accuracy_EX/mean": 0.47265625, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9899483919143677, + "sampling/importance_sampling_ratio/min": 0.011142679490149021, + "sampling/sampling_logp_difference/max": 4.496972560882568, + "sampling/sampling_logp_difference/mean": 0.13974857330322266, + "step": 190 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.0, + "completions/max_terminated_length": 623.0, + "completions/mean_length": 332.5703125, + "completions/mean_terminated_length": 332.5703125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "entropy": 0.1874419655650854, + "epoch": 0.3380530973451327, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3249122649226148, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 93469487.0, + "reward": 0.40253907442092896, + "reward_std": 0.4598415791988373, + "rewards/execution_accuracy_EX/mean": 0.37109375, + "rewards/execution_accuracy_EX/std": 0.48404383659362793, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9894042015075684, + "sampling/importance_sampling_ratio/min": 0.01430966705083847, + "sampling/sampling_logp_difference/max": 4.246819972991943, + "sampling/sampling_logp_difference/mean": 0.1404990702867508, + "step": 191 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 298.58203125, + "completions/mean_terminated_length": 298.58203125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.17528344877064228, + "epoch": 0.3398230088495575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3052954277647955, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 93908388.0, + "reward": 0.532421886920929, + "reward_std": 0.47587236762046814, + "rewards/execution_accuracy_EX/mean": 0.5078125, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9889025688171387, + "sampling/importance_sampling_ratio/min": 0.018411943688988686, + "sampling/sampling_logp_difference/max": 3.994755744934082, + "sampling/sampling_logp_difference/mean": 0.1326378434896469, + "step": 192 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 303.515625, + "completions/mean_terminated_length": 303.515625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.18384411185979843, + "epoch": 0.3415929203539823, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21229100105606544, + "learning_rate": 1e-06, + "loss": 0.0036, + "num_tokens": 94443736.0, + "reward": 0.5249999761581421, + "reward_std": 0.47593042254447937, + "rewards/execution_accuracy_EX/mean": 0.5, + "rewards/execution_accuracy_EX/std": 0.5009794235229492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9866931438446045, + "sampling/importance_sampling_ratio/min": 0.006754668429493904, + "sampling/sampling_logp_difference/max": 4.99752140045166, + "sampling/sampling_logp_difference/mean": 0.14097186923027039, + "step": 193 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 502.0, + "completions/max_terminated_length": 502.0, + "completions/mean_length": 293.19921875, + "completions/mean_terminated_length": 293.19921875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.17807519063353539, + "epoch": 0.3433628318584071, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22216409705678244, + "learning_rate": 1e-06, + "loss": 0.0067, + "num_tokens": 94991131.0, + "reward": 0.8255859613418579, + "reward_std": 0.36851534247398376, + "rewards/execution_accuracy_EX/mean": 0.81640625, + "rewards/execution_accuracy_EX/std": 0.387910932302475, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9888792037963867, + "sampling/importance_sampling_ratio/min": 0.014794101007282734, + "sampling/sampling_logp_difference/max": 4.213526725769043, + "sampling/sampling_logp_difference/mean": 0.1355728954076767, + "step": 194 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 309.8984375, + "completions/mean_terminated_length": 309.8984375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.18850590474903584, + "epoch": 0.34513274336283184, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31659579865543813, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 95424417.0, + "reward": 0.588085949420929, + "reward_std": 0.4717142879962921, + "rewards/execution_accuracy_EX/mean": 0.56640625, + "rewards/execution_accuracy_EX/std": 0.4965413510799408, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9890519380569458, + "sampling/importance_sampling_ratio/min": 0.009062101133167744, + "sampling/sampling_logp_difference/max": 4.7036542892456055, + "sampling/sampling_logp_difference/mean": 0.1400558054447174, + "step": 195 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 788.0, + "completions/max_terminated_length": 788.0, + "completions/mean_length": 304.72265625, + "completions/mean_terminated_length": 304.72265625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.1978746559470892, + "epoch": 0.34690265486725663, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19081941531426047, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 95891546.0, + "reward": 0.5732421875, + "reward_std": 0.47346949577331543, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9867796301841736, + "sampling/importance_sampling_ratio/min": 0.014291773550212383, + "sampling/sampling_logp_difference/max": 4.248071193695068, + "sampling/sampling_logp_difference/mean": 0.14825952053070068, + "step": 196 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 646.0, + "completions/max_terminated_length": 646.0, + "completions/mean_length": 331.51953125, + "completions/mean_terminated_length": 331.51953125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.18945630080997944, + "epoch": 0.3486725663716814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2720554752858141, + "learning_rate": 1e-06, + "loss": 0.0063, + "num_tokens": 96555071.0, + "reward": 0.49531248211860657, + "reward_std": 0.47499996423721313, + "rewards/execution_accuracy_EX/mean": 0.46875, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9908668994903564, + "sampling/importance_sampling_ratio/min": 0.01839049905538559, + "sampling/sampling_logp_difference/max": 3.9959211349487305, + "sampling/sampling_logp_difference/mean": 0.1396811604499817, + "step": 197 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 539.0, + "completions/max_terminated_length": 539.0, + "completions/mean_length": 299.14453125, + "completions/mean_terminated_length": 299.14453125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.16827655397355556, + "epoch": 0.3504424778761062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15794168883137946, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 97219860.0, + "reward": 0.569531261920929, + "reward_std": 0.473834365606308, + "rewards/execution_accuracy_EX/mean": 0.546875, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9841498136520386, + "sampling/importance_sampling_ratio/min": 0.011208107694983482, + "sampling/sampling_logp_difference/max": 4.49111795425415, + "sampling/sampling_logp_difference/mean": 0.13843360543251038, + "step": 198 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 315.87890625, + "completions/mean_terminated_length": 315.87890625, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.19387279450893402, + "epoch": 0.35221238938053095, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3254534248620124, + "learning_rate": 1e-06, + "loss": -0.0079, + "num_tokens": 97785973.0, + "reward": 0.5064452886581421, + "reward_std": 0.4755672216415405, + "rewards/execution_accuracy_EX/mean": 0.48046875, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.989777147769928, + "sampling/importance_sampling_ratio/min": 0.006762932054698467, + "sampling/sampling_logp_difference/max": 4.996298789978027, + "sampling/sampling_logp_difference/mean": 0.14550894498825073, + "step": 199 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 529.0, + "completions/max_terminated_length": 529.0, + "completions/mean_length": 308.40234375, + "completions/mean_terminated_length": 308.40234375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.17666900530457497, + "epoch": 0.35398230088495575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23838401560311998, + "learning_rate": 1e-06, + "loss": 0.0072, + "num_tokens": 98297340.0, + "reward": 0.666015625, + "reward_std": 0.4544737637042999, + "rewards/execution_accuracy_EX/mean": 0.6484375, + "rewards/execution_accuracy_EX/std": 0.47839346528053284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9885034561157227, + "sampling/importance_sampling_ratio/min": 0.005275059957057238, + "sampling/sampling_logp_difference/max": 5.244765281677246, + "sampling/sampling_logp_difference/mean": 0.13679823279380798, + "step": 200 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 293.91796875, + "completions/mean_terminated_length": 293.91796875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.17220030166208744, + "epoch": 0.35575221238938054, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2635345589288893, + "learning_rate": 1e-06, + "loss": 0.0137, + "num_tokens": 98771959.0, + "reward": 0.6177734136581421, + "reward_std": 0.46676450967788696, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9877549409866333, + "sampling/importance_sampling_ratio/min": 0.011232408694922924, + "sampling/sampling_logp_difference/max": 4.488952159881592, + "sampling/sampling_logp_difference/mean": 0.13670043647289276, + "step": 201 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 512.0, + "completions/max_terminated_length": 512.0, + "completions/mean_length": 284.97265625, + "completions/mean_terminated_length": 284.97265625, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.1810233872383833, + "epoch": 0.35752212389380533, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27506573550641483, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 99245024.0, + "reward": 0.6623046398162842, + "reward_std": 0.4556131064891815, + "rewards/execution_accuracy_EX/mean": 0.64453125, + "rewards/execution_accuracy_EX/std": 0.4795927405357361, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9841594099998474, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.14774857461452484, + "step": 202 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 727.0, + "completions/max_terminated_length": 727.0, + "completions/mean_length": 290.62109375, + "completions/mean_terminated_length": 290.62109375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.16862306743860245, + "epoch": 0.35929203539823007, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22912455934549136, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 99713983.0, + "reward": 0.62890625, + "reward_std": 0.46440389752388, + "rewards/execution_accuracy_EX/mean": 0.609375, + "rewards/execution_accuracy_EX/std": 0.48884621262550354, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.956079363822937, + "sampling/importance_sampling_ratio/mean": 0.9820801019668579, + "sampling/importance_sampling_ratio/min": 0.014339372515678406, + "sampling/sampling_logp_difference/max": 4.244746208190918, + "sampling/sampling_logp_difference/mean": 0.14202579855918884, + "step": 203 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 283.74609375, + "completions/mean_terminated_length": 283.74609375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.17279032245278358, + "epoch": 0.36106194690265486, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2651987711798711, + "learning_rate": 1e-06, + "loss": 0.0044, + "num_tokens": 100210862.0, + "reward": 0.7105468511581421, + "reward_std": 0.43811774253845215, + "rewards/execution_accuracy_EX/mean": 0.6953125, + "rewards/execution_accuracy_EX/std": 0.4611765742301941, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9854060411453247, + "sampling/importance_sampling_ratio/min": 0.011183853261172771, + "sampling/sampling_logp_difference/max": 4.493284225463867, + "sampling/sampling_logp_difference/mean": 0.1415708065032959, + "step": 204 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 286.84375, + "completions/mean_terminated_length": 286.84375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.1592701580375433, + "epoch": 0.36283185840707965, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3158506886213264, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 100524550.0, + "reward": 0.7810547351837158, + "reward_std": 0.40085992217063904, + "rewards/execution_accuracy_EX/mean": 0.76953125, + "rewards/execution_accuracy_EX/std": 0.4219578504562378, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9852101802825928, + "sampling/importance_sampling_ratio/min": 0.014291773550212383, + "sampling/sampling_logp_difference/max": 4.248071193695068, + "sampling/sampling_logp_difference/mean": 0.13460731506347656, + "step": 205 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 721.0, + "completions/max_terminated_length": 721.0, + "completions/mean_length": 305.68359375, + "completions/mean_terminated_length": 305.68359375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.16670113801956177, + "epoch": 0.36460176991150445, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25008339032931437, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 100935333.0, + "reward": 0.6474609375, + "reward_std": 0.4598415791988373, + "rewards/execution_accuracy_EX/mean": 0.62890625, + "rewards/execution_accuracy_EX/std": 0.48404383659362793, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9877009391784668, + "sampling/importance_sampling_ratio/min": 0.008668432012200356, + "sampling/sampling_logp_difference/max": 4.748067378997803, + "sampling/sampling_logp_difference/mean": 0.13257518410682678, + "step": 206 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 280.5390625, + "completions/mean_terminated_length": 280.5390625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.16295744851231575, + "epoch": 0.3663716814159292, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1806119419910666, + "learning_rate": 1e-06, + "loss": -0.0038, + "num_tokens": 101529167.0, + "reward": 0.5806640386581421, + "reward_std": 0.4726512134075165, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.982230544090271, + "sampling/importance_sampling_ratio/min": 0.011154789477586746, + "sampling/sampling_logp_difference/max": 4.495886325836182, + "sampling/sampling_logp_difference/mean": 0.13882498443126678, + "step": 207 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 297.73046875, + "completions/mean_terminated_length": 297.73046875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.15727944858372211, + "epoch": 0.368141592920354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22748012679787208, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 101903258.0, + "reward": 0.5064452886581421, + "reward_std": 0.47556719183921814, + "rewards/execution_accuracy_EX/mean": 0.48046875, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9867826700210571, + "sampling/importance_sampling_ratio/min": 0.011182126589119434, + "sampling/sampling_logp_difference/max": 4.493438720703125, + "sampling/sampling_logp_difference/mean": 0.12806008756160736, + "step": 208 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 580.0, + "completions/max_terminated_length": 580.0, + "completions/mean_length": 295.12890625, + "completions/mean_terminated_length": 295.12890625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.16728811338543892, + "epoch": 0.36991150442477877, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36207057041573587, + "learning_rate": 1e-06, + "loss": 0.0088, + "num_tokens": 102396491.0, + "reward": 0.5546875, + "reward_std": 0.4749999940395355, + "rewards/execution_accuracy_EX/mean": 0.53125, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9869174957275391, + "sampling/importance_sampling_ratio/min": 1.6849011444719508e-05, + "sampling/sampling_logp_difference/max": 10.991218566894531, + "sampling/sampling_logp_difference/mean": 0.13170340657234192, + "step": 209 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 311.76953125, + "completions/mean_terminated_length": 311.76953125, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.16339107044041157, + "epoch": 0.37168141592920356, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24713288062062294, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 102701504.0, + "reward": 0.5992187261581421, + "reward_std": 0.47008490562438965, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9879264831542969, + "sampling/importance_sampling_ratio/min": 0.008668403141200542, + "sampling/sampling_logp_difference/max": 4.74807071685791, + "sampling/sampling_logp_difference/mean": 0.13112956285476685, + "step": 210 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 740.0, + "completions/max_terminated_length": 740.0, + "completions/mean_length": 309.16015625, + "completions/mean_terminated_length": 309.16015625, + "completions/min_length": 118.0, + "completions/min_terminated_length": 118.0, + "entropy": 0.1704132743179798, + "epoch": 0.3734513274336283, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20578594883163423, + "learning_rate": 1e-06, + "loss": 0.0022, + "num_tokens": 103171033.0, + "reward": 0.6400390863418579, + "reward_std": 0.4617617428302765, + "rewards/execution_accuracy_EX/mean": 0.62109375, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9840466976165771, + "sampling/importance_sampling_ratio/min": 0.011193334124982357, + "sampling/sampling_logp_difference/max": 4.49243688583374, + "sampling/sampling_logp_difference/mean": 0.13917119801044464, + "step": 211 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 494.0, + "completions/max_terminated_length": 494.0, + "completions/mean_length": 293.46484375, + "completions/mean_terminated_length": 293.46484375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.16136296652257442, + "epoch": 0.3752212389380531, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37329204569315916, + "learning_rate": 1e-06, + "loss": -0.0038, + "num_tokens": 103767616.0, + "reward": 0.699414074420929, + "reward_std": 0.4426852762699127, + "rewards/execution_accuracy_EX/mean": 0.68359375, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9859615564346313, + "sampling/importance_sampling_ratio/min": 0.005257649812847376, + "sampling/sampling_logp_difference/max": 5.248071193695068, + "sampling/sampling_logp_difference/mean": 0.1347169727087021, + "step": 212 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 293.265625, + "completions/mean_terminated_length": 293.265625, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.1644913423806429, + "epoch": 0.3769911504424779, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2170555057496567, + "learning_rate": 1e-06, + "loss": -0.0055, + "num_tokens": 104458020.0, + "reward": 0.36542969942092896, + "reward_std": 0.44827139377593994, + "rewards/execution_accuracy_EX/mean": 0.33203125, + "rewards/execution_accuracy_EX/std": 0.4718646705150604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9585728645324707, + "sampling/importance_sampling_ratio/mean": 0.9880084991455078, + "sampling/importance_sampling_ratio/min": 0.01430963259190321, + "sampling/sampling_logp_difference/max": 4.246822357177734, + "sampling/sampling_logp_difference/mean": 0.1330907642841339, + "step": 213 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 519.0, + "completions/max_terminated_length": 519.0, + "completions/mean_length": 306.6484375, + "completions/mean_terminated_length": 306.6484375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.15919774770736694, + "epoch": 0.3787610619469027, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30691417661315207, + "learning_rate": 1e-06, + "loss": -0.0017, + "num_tokens": 104781370.0, + "reward": 0.606640636920929, + "reward_std": 0.46884801983833313, + "rewards/execution_accuracy_EX/mean": 0.5859375, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9755750894546509, + "sampling/importance_sampling_ratio/mean": 0.9810018539428711, + "sampling/importance_sampling_ratio/min": 0.01113677304238081, + "sampling/sampling_logp_difference/max": 4.49750280380249, + "sampling/sampling_logp_difference/mean": 0.1426813304424286, + "step": 214 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 496.0, + "completions/max_terminated_length": 496.0, + "completions/mean_length": 292.5, + "completions/mean_terminated_length": 292.5, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "entropy": 0.15499872714281082, + "epoch": 0.3805309734513274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19549322050060405, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 105198698.0, + "reward": 0.643750011920929, + "reward_std": 0.46081769466400146, + "rewards/execution_accuracy_EX/mean": 0.625, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.982332170009613, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.1391640603542328, + "step": 215 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 288.47265625, + "completions/mean_terminated_length": 288.47265625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.15948248468339443, + "epoch": 0.3823008849557522, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3898832423180779, + "learning_rate": 1e-06, + "loss": -0.0117, + "num_tokens": 105588195.0, + "reward": 0.5658203363418579, + "reward_std": 0.4741697609424591, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9816546440124512, + "sampling/importance_sampling_ratio/min": 0.01118500530719757, + "sampling/sampling_logp_difference/max": 4.493181228637695, + "sampling/sampling_logp_difference/mean": 0.13936170935630798, + "step": 216 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 588.0, + "completions/max_terminated_length": 588.0, + "completions/mean_length": 300.71484375, + "completions/mean_terminated_length": 300.71484375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.14708868972957134, + "epoch": 0.384070796460177, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21553840642650957, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 106114490.0, + "reward": 0.725390613079071, + "reward_std": 0.4315042495727539, + "rewards/execution_accuracy_EX/mean": 0.7109375, + "rewards/execution_accuracy_EX/std": 0.45421501994132996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9822921752929688, + "sampling/importance_sampling_ratio/min": 0.01114028412848711, + "sampling/sampling_logp_difference/max": 4.497187614440918, + "sampling/sampling_logp_difference/mean": 0.1331436038017273, + "step": 217 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 568.0, + "completions/max_terminated_length": 568.0, + "completions/mean_length": 315.27734375, + "completions/mean_terminated_length": 315.27734375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.1578066125512123, + "epoch": 0.3858407079646018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42582165404524236, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 106655121.0, + "reward": 0.6177734136581421, + "reward_std": 0.46676453948020935, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9849480390548706, + "sampling/importance_sampling_ratio/min": 0.011137150228023529, + "sampling/sampling_logp_difference/max": 4.497468948364258, + "sampling/sampling_logp_difference/mean": 0.1338350921869278, + "step": 218 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 479.0, + "completions/max_terminated_length": 479.0, + "completions/mean_length": 291.64453125, + "completions/mean_terminated_length": 291.64453125, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.17081346735358238, + "epoch": 0.38761061946902653, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25850475824639435, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 107048630.0, + "reward": 0.569531261920929, + "reward_std": 0.4738343358039856, + "rewards/execution_accuracy_EX/mean": 0.546875, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9837124943733215, + "sampling/importance_sampling_ratio/min": 0.005275054834783077, + "sampling/sampling_logp_difference/max": 5.2447662353515625, + "sampling/sampling_logp_difference/mean": 0.14238256216049194, + "step": 219 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 323.19921875, + "completions/mean_terminated_length": 323.19921875, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.15954815410077572, + "epoch": 0.3893805309734513, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2405263523095734, + "learning_rate": 1e-06, + "loss": 0.0056, + "num_tokens": 107513225.0, + "reward": 0.6585937738418579, + "reward_std": 0.4567192792892456, + "rewards/execution_accuracy_EX/mean": 0.640625, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9837608337402344, + "sampling/importance_sampling_ratio/min": 0.001958009321242571, + "sampling/sampling_logp_difference/max": 6.2358269691467285, + "sampling/sampling_logp_difference/mean": 0.14068523049354553, + "step": 220 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 516.0, + "completions/max_terminated_length": 516.0, + "completions/mean_length": 312.58984375, + "completions/mean_terminated_length": 312.58984375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.161805747076869, + "epoch": 0.3911504424778761, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2756924935552442, + "learning_rate": 1e-06, + "loss": 0.0129, + "num_tokens": 107975808.0, + "reward": 0.5472656488418579, + "reward_std": 0.47540730237960815, + "rewards/execution_accuracy_EX/mean": 0.5234375, + "rewards/execution_accuracy_EX/std": 0.5004287362098694, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9832560420036316, + "sampling/importance_sampling_ratio/min": 0.008837749250233173, + "sampling/sampling_logp_difference/max": 4.728723049163818, + "sampling/sampling_logp_difference/mean": 0.1391669511795044, + "step": 221 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 295.43359375, + "completions/mean_terminated_length": 295.43359375, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.15584200993180275, + "epoch": 0.3929203539823009, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30114187415305094, + "learning_rate": 1e-06, + "loss": -0.003, + "num_tokens": 108476511.0, + "reward": 0.5732421875, + "reward_std": 0.47346949577331543, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9799416661262512, + "sampling/importance_sampling_ratio/min": 0.005254166200757027, + "sampling/sampling_logp_difference/max": 5.248733997344971, + "sampling/sampling_logp_difference/mean": 0.14466512203216553, + "step": 222 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 644.0, + "completions/max_terminated_length": 644.0, + "completions/mean_length": 319.4296875, + "completions/mean_terminated_length": 319.4296875, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.15634971857070923, + "epoch": 0.39469026548672564, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.201304991818416, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 109067853.0, + "reward": 0.4990234375, + "reward_std": 0.4752182364463806, + "rewards/execution_accuracy_EX/mean": 0.47265625, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9853391647338867, + "sampling/importance_sampling_ratio/min": 0.008726605214178562, + "sampling/sampling_logp_difference/max": 4.7413787841796875, + "sampling/sampling_logp_difference/mean": 0.13357853889465332, + "step": 223 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.0, + "completions/max_terminated_length": 752.0, + "completions/mean_length": 327.59375, + "completions/mean_terminated_length": 327.59375, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.16013413295149803, + "epoch": 0.39646017699115044, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3197331562916165, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 109578021.0, + "reward": 0.5138671398162842, + "reward_std": 0.4757997393608093, + "rewards/execution_accuracy_EX/mean": 0.48828125, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9839286804199219, + "sampling/importance_sampling_ratio/min": 0.014280935749411583, + "sampling/sampling_logp_difference/max": 4.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.1407632827758789, + "step": 224 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 692.0, + "completions/max_terminated_length": 692.0, + "completions/mean_length": 343.03125, + "completions/mean_terminated_length": 343.03125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.1597651895135641, + "epoch": 0.39823008849557523, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2844288120097009, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 109993165.0, + "reward": 0.5361328125, + "reward_std": 0.4757997393608093, + "rewards/execution_accuracy_EX/mean": 0.51171875, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9862627387046814, + "sampling/importance_sampling_ratio/min": 0.0032341775950044394, + "sampling/sampling_logp_difference/max": 5.733980655670166, + "sampling/sampling_logp_difference/mean": 0.13292866945266724, + "step": 225 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 311.453125, + "completions/mean_terminated_length": 311.453125, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.16206582821905613, + "epoch": 0.4, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28634636089839877, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 110330561.0, + "reward": 0.443359375, + "reward_std": 0.46884801983833313, + "rewards/execution_accuracy_EX/mean": 0.4140625, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9361305236816406, + "sampling/importance_sampling_ratio/mean": 0.981826663017273, + "sampling/importance_sampling_ratio/min": 0.011188104748725891, + "sampling/sampling_logp_difference/max": 4.492904186248779, + "sampling/sampling_logp_difference/mean": 0.14049078524112701, + "step": 226 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 357.38671875, + "completions/mean_terminated_length": 357.38671875, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.17608636245131493, + "epoch": 0.40176991150442476, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.395427305686896, + "learning_rate": 1e-06, + "loss": -0.0079, + "num_tokens": 110757860.0, + "reward": 0.6177734732627869, + "reward_std": 0.46676450967788696, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9860670566558838, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.1404687464237213, + "step": 227 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 315.37890625, + "completions/mean_terminated_length": 315.37890625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.16146675869822502, + "epoch": 0.40353982300884955, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2997694980144929, + "learning_rate": 1e-06, + "loss": -0.0102, + "num_tokens": 111470133.0, + "reward": 0.6623046398162842, + "reward_std": 0.4556131064891815, + "rewards/execution_accuracy_EX/mean": 0.64453125, + "rewards/execution_accuracy_EX/std": 0.4795927405357361, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9435782432556152, + "sampling/importance_sampling_ratio/mean": 0.9826586246490479, + "sampling/importance_sampling_ratio/min": 0.0015113846166059375, + "sampling/sampling_logp_difference/max": 6.494729042053223, + "sampling/sampling_logp_difference/mean": 0.14155453443527222, + "step": 228 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 597.0, + "completions/max_terminated_length": 597.0, + "completions/mean_length": 327.3828125, + "completions/mean_terminated_length": 327.3828125, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "entropy": 0.17297273315489292, + "epoch": 0.40530973451327434, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20821698597680982, + "learning_rate": 1e-06, + "loss": -0.0047, + "num_tokens": 111948439.0, + "reward": 0.6623046398162842, + "reward_std": 0.4556131064891815, + "rewards/execution_accuracy_EX/mean": 0.64453125, + "rewards/execution_accuracy_EX/std": 0.4795927405357361, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9845682382583618, + "sampling/importance_sampling_ratio/min": 0.0067481910809874535, + "sampling/sampling_logp_difference/max": 4.998480796813965, + "sampling/sampling_logp_difference/mean": 0.14390261471271515, + "step": 229 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 666.0, + "completions/max_terminated_length": 666.0, + "completions/mean_length": 347.421875, + "completions/mean_terminated_length": 347.421875, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.17069001123309135, + "epoch": 0.40707964601769914, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28389833200173975, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 112647955.0, + "reward": 0.6548827886581421, + "reward_std": 0.45779263973236084, + "rewards/execution_accuracy_EX/mean": 0.63671875, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.985714316368103, + "sampling/importance_sampling_ratio/min": 0.005253662820905447, + "sampling/sampling_logp_difference/max": 5.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.1376417577266693, + "step": 230 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 552.0, + "completions/max_terminated_length": 552.0, + "completions/mean_length": 329.04296875, + "completions/mean_terminated_length": 329.04296875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.1747161727398634, + "epoch": 0.4088495575221239, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3246365694686947, + "learning_rate": 1e-06, + "loss": 0.003, + "num_tokens": 113204430.0, + "reward": 0.773632824420929, + "reward_std": 0.40552324056625366, + "rewards/execution_accuracy_EX/mean": 0.76171875, + "rewards/execution_accuracy_EX/std": 0.4268665909767151, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9791052341461182, + "sampling/importance_sampling_ratio/mean": 0.9838868975639343, + "sampling/importance_sampling_ratio/min": 0.004096913617104292, + "sampling/sampling_logp_difference/max": 5.49752140045166, + "sampling/sampling_logp_difference/mean": 0.14427624642848969, + "step": 231 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 661.0, + "completions/max_terminated_length": 661.0, + "completions/mean_length": 336.01953125, + "completions/mean_terminated_length": 336.01953125, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.18153868429362774, + "epoch": 0.41061946902654867, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15723601636185833, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 113646803.0, + "reward": 0.6214843392372131, + "reward_std": 0.46600863337516785, + "rewards/execution_accuracy_EX/mean": 0.6015625, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9836591482162476, + "sampling/importance_sampling_ratio/min": 0.018361039459705353, + "sampling/sampling_logp_difference/max": 3.9975242614746094, + "sampling/sampling_logp_difference/mean": 0.1475595384836197, + "step": 232 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 594.0, + "completions/max_terminated_length": 594.0, + "completions/mean_length": 327.80859375, + "completions/mean_terminated_length": 327.80859375, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.18472612835466862, + "epoch": 0.41238938053097346, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24229994102963986, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 114200626.0, + "reward": 0.517578125, + "reward_std": 0.47587236762046814, + "rewards/execution_accuracy_EX/mean": 0.4921875, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9879395365715027, + "sampling/importance_sampling_ratio/min": 0.011183853261172771, + "sampling/sampling_logp_difference/max": 4.493284225463867, + "sampling/sampling_logp_difference/mean": 0.14690710604190826, + "step": 233 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 884.0, + "completions/max_terminated_length": 884.0, + "completions/mean_length": 350.03515625, + "completions/mean_terminated_length": 350.03515625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "entropy": 0.19433023780584335, + "epoch": 0.41415929203539825, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39571142053642266, + "learning_rate": 1e-06, + "loss": 0.0063, + "num_tokens": 114684171.0, + "reward": 0.5101562738418579, + "reward_std": 0.4756980240345001, + "rewards/execution_accuracy_EX/mean": 0.484375, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9859143495559692, + "sampling/importance_sampling_ratio/min": 0.011154402047395706, + "sampling/sampling_logp_difference/max": 4.4959211349487305, + "sampling/sampling_logp_difference/mean": 0.1544232964515686, + "step": 234 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 690.0, + "completions/max_terminated_length": 690.0, + "completions/mean_length": 320.578125, + "completions/mean_terminated_length": 320.578125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.18371405452489853, + "epoch": 0.415929203539823, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.41757357450018373, + "learning_rate": 1e-06, + "loss": 0.0085, + "num_tokens": 115064591.0, + "reward": 0.7365233898162842, + "reward_std": 0.426136314868927, + "rewards/execution_accuracy_EX/mean": 0.72265625, + "rewards/execution_accuracy_EX/std": 0.4485645890235901, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9850724935531616, + "sampling/importance_sampling_ratio/min": 0.014291773550212383, + "sampling/sampling_logp_difference/max": 4.248071193695068, + "sampling/sampling_logp_difference/mean": 0.1499912142753601, + "step": 235 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 583.0, + "completions/max_terminated_length": 583.0, + "completions/mean_length": 332.62890625, + "completions/mean_terminated_length": 332.62890625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "entropy": 0.17574547603726387, + "epoch": 0.4176991150442478, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23931763668179823, + "learning_rate": 1e-06, + "loss": -0.0041, + "num_tokens": 115538416.0, + "reward": 0.44707033038139343, + "reward_std": 0.46948155760765076, + "rewards/execution_accuracy_EX/mean": 0.41796875, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9719640016555786, + "sampling/importance_sampling_ratio/mean": 0.9866256713867188, + "sampling/importance_sampling_ratio/min": 0.011183848604559898, + "sampling/sampling_logp_difference/max": 4.493284702301025, + "sampling/sampling_logp_difference/mean": 0.13938941061496735, + "step": 236 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 540.0, + "completions/max_terminated_length": 540.0, + "completions/mean_length": 311.1875, + "completions/mean_terminated_length": 311.1875, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.17596963420510292, + "epoch": 0.4194690265486726, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27855657077561136, + "learning_rate": 1e-06, + "loss": -0.0008, + "num_tokens": 116191104.0, + "reward": 0.591796875, + "reward_std": 0.471201092004776, + "rewards/execution_accuracy_EX/mean": 0.5703125, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9817546606063843, + "sampling/importance_sampling_ratio/min": 0.008679230697453022, + "sampling/sampling_logp_difference/max": 4.746822357177734, + "sampling/sampling_logp_difference/mean": 0.15246939659118652, + "step": 237 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 602.0, + "completions/max_terminated_length": 602.0, + "completions/mean_length": 333.82421875, + "completions/mean_terminated_length": 333.82421875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.17818690091371536, + "epoch": 0.42123893805309737, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22771774066606904, + "learning_rate": 1e-06, + "loss": 0.0062, + "num_tokens": 116660739.0, + "reward": 0.5732421875, + "reward_std": 0.47346949577331543, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9877098798751831, + "sampling/importance_sampling_ratio/min": 0.011156401596963406, + "sampling/sampling_logp_difference/max": 4.495741844177246, + "sampling/sampling_logp_difference/mean": 0.14342039823532104, + "step": 238 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 351.83984375, + "completions/mean_terminated_length": 351.83984375, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "entropy": 0.18319394811987877, + "epoch": 0.4230088495575221, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22092969991800032, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 117055402.0, + "reward": 0.4730468690395355, + "reward_std": 0.47307512164115906, + "rewards/execution_accuracy_EX/mean": 0.4453125, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9876081943511963, + "sampling/importance_sampling_ratio/min": 0.01430963259190321, + "sampling/sampling_logp_difference/max": 4.246822357177734, + "sampling/sampling_logp_difference/mean": 0.1455935686826706, + "step": 239 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 543.0, + "completions/max_terminated_length": 543.0, + "completions/mean_length": 287.22265625, + "completions/mean_terminated_length": 287.22265625, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.182206267490983, + "epoch": 0.4247787610619469, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3294145446674225, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 117603347.0, + "reward": 0.5695312023162842, + "reward_std": 0.4738343358039856, + "rewards/execution_accuracy_EX/mean": 0.546875, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9850952625274658, + "sampling/importance_sampling_ratio/min": 0.011199885047972202, + "sampling/sampling_logp_difference/max": 4.491851806640625, + "sampling/sampling_logp_difference/mean": 0.14532500505447388, + "step": 240 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 725.0, + "completions/max_terminated_length": 725.0, + "completions/mean_length": 341.5859375, + "completions/mean_terminated_length": 341.5859375, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.17782233096659184, + "epoch": 0.4265486725663717, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38340099039298753, + "learning_rate": 1e-06, + "loss": 0.0125, + "num_tokens": 118058649.0, + "reward": 0.6029296517372131, + "reward_std": 0.46948158740997314, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9879711866378784, + "sampling/importance_sampling_ratio/min": 0.011154710315167904, + "sampling/sampling_logp_difference/max": 4.495893478393555, + "sampling/sampling_logp_difference/mean": 0.13813140988349915, + "step": 241 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 557.0, + "completions/max_terminated_length": 557.0, + "completions/mean_length": 297.38671875, + "completions/mean_terminated_length": 297.38671875, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.1814308688044548, + "epoch": 0.4283185840707965, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31709294912088215, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 118582044.0, + "reward": 0.6771484017372131, + "reward_std": 0.4508545994758606, + "rewards/execution_accuracy_EX/mean": 0.66015625, + "rewards/execution_accuracy_EX/std": 0.47458380460739136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9851583242416382, + "sampling/importance_sampling_ratio/min": 0.010942353866994381, + "sampling/sampling_logp_difference/max": 4.5151143074035645, + "sampling/sampling_logp_difference/mean": 0.14765186607837677, + "step": 242 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 845.0, + "completions/max_terminated_length": 845.0, + "completions/mean_length": 333.078125, + "completions/mean_terminated_length": 333.078125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.19774733111262321, + "epoch": 0.4300884955752212, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36169428919513547, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 119040768.0, + "reward": 0.6177734136581421, + "reward_std": 0.46676453948020935, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9877321124076843, + "sampling/importance_sampling_ratio/min": 0.003735928563401103, + "sampling/sampling_logp_difference/max": 5.58975887298584, + "sampling/sampling_logp_difference/mean": 0.1538747102022171, + "step": 243 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 629.0, + "completions/max_terminated_length": 629.0, + "completions/mean_length": 316.59765625, + "completions/mean_terminated_length": 316.59765625, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.18043517507612705, + "epoch": 0.431858407079646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17978239581415803, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 119574377.0, + "reward": 0.6363281011581421, + "reward_std": 0.46267399191856384, + "rewards/execution_accuracy_EX/mean": 0.6171875, + "rewards/execution_accuracy_EX/std": 0.48702529072761536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9867154359817505, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.14469178020954132, + "step": 244 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 505.0, + "completions/max_terminated_length": 505.0, + "completions/mean_length": 322.3125, + "completions/mean_terminated_length": 322.3125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.18301732279360294, + "epoch": 0.4336283185840708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2973983629660076, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 120106969.0, + "reward": 0.6177734136581421, + "reward_std": 0.46676453948020935, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9860332012176514, + "sampling/importance_sampling_ratio/min": 0.006767266895622015, + "sampling/sampling_logp_difference/max": 4.995657920837402, + "sampling/sampling_logp_difference/mean": 0.14499324560165405, + "step": 245 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 495.0, + "completions/max_terminated_length": 495.0, + "completions/mean_length": 285.00390625, + "completions/mean_terminated_length": 285.00390625, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.16773789562284946, + "epoch": 0.4353982300884956, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25450630742592706, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 120518362.0, + "reward": 0.740234375, + "reward_std": 0.4242667853832245, + "rewards/execution_accuracy_EX/mean": 0.7265625, + "rewards/execution_accuracy_EX/std": 0.446596622467041, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9257066249847412, + "sampling/importance_sampling_ratio/mean": 0.9835507869720459, + "sampling/importance_sampling_ratio/min": 0.014291773550212383, + "sampling/sampling_logp_difference/max": 4.248071193695068, + "sampling/sampling_logp_difference/mean": 0.14187446236610413, + "step": 246 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 321.71484375, + "completions/mean_terminated_length": 321.71484375, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.179510110989213, + "epoch": 0.43716814159292033, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21790185822960076, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 121005025.0, + "reward": 0.49160152673721313, + "reward_std": 0.4747525155544281, + "rewards/execution_accuracy_EX/mean": 0.46484375, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9858831167221069, + "sampling/importance_sampling_ratio/min": 0.01842484436929226, + "sampling/sampling_logp_difference/max": 3.9940552711486816, + "sampling/sampling_logp_difference/mean": 0.14202405512332916, + "step": 247 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 550.0, + "completions/max_terminated_length": 550.0, + "completions/mean_length": 322.28515625, + "completions/mean_terminated_length": 322.28515625, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.15929356217384338, + "epoch": 0.4389380530973451, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16521585929931884, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 121496202.0, + "reward": 0.595507800579071, + "reward_std": 0.47065800428390503, + "rewards/execution_accuracy_EX/mean": 0.57421875, + "rewards/execution_accuracy_EX/std": 0.49542948603630066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9857891798019409, + "sampling/importance_sampling_ratio/min": 0.011142482981085777, + "sampling/sampling_logp_difference/max": 4.496990203857422, + "sampling/sampling_logp_difference/mean": 0.12880417704582214, + "step": 248 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 630.0, + "completions/max_terminated_length": 630.0, + "completions/mean_length": 353.7578125, + "completions/mean_terminated_length": 353.7578125, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.1773077417165041, + "epoch": 0.4407079646017699, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19366559964761546, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 122071564.0, + "reward": 0.40625, + "reward_std": 0.4608176648616791, + "rewards/execution_accuracy_EX/mean": 0.375, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9875608682632446, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.13819408416748047, + "step": 249 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 289.19140625, + "completions/mean_terminated_length": 289.19140625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.1673753820359707, + "epoch": 0.4424778761061947, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38442505347754724, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 122564893.0, + "reward": 0.6697266101837158, + "reward_std": 0.4533011019229889, + "rewards/execution_accuracy_EX/mean": 0.65234375, + "rewards/execution_accuracy_EX/std": 0.4771590530872345, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9876739978790283, + "sampling/importance_sampling_ratio/mean": 0.9846241474151611, + "sampling/importance_sampling_ratio/min": 0.014339085668325424, + "sampling/sampling_logp_difference/max": 4.2447662353515625, + "sampling/sampling_logp_difference/mean": 0.13869065046310425, + "step": 250 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 577.0, + "completions/max_terminated_length": 577.0, + "completions/mean_length": 304.8046875, + "completions/mean_terminated_length": 304.8046875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.16497358679771423, + "epoch": 0.44424778761061945, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27246430304229335, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 123116107.0, + "reward": 0.7216796875, + "reward_std": 0.4332149624824524, + "rewards/execution_accuracy_EX/mean": 0.70703125, + "rewards/execution_accuracy_EX/std": 0.45601576566696167, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9840646982192993, + "sampling/importance_sampling_ratio/min": 0.004092916380614042, + "sampling/sampling_logp_difference/max": 5.498497486114502, + "sampling/sampling_logp_difference/mean": 0.1388365626335144, + "step": 251 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 722.0, + "completions/max_terminated_length": 722.0, + "completions/mean_length": 312.69140625, + "completions/mean_terminated_length": 312.69140625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.1695473212748766, + "epoch": 0.44601769911504424, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3348062185289668, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 123649148.0, + "reward": 0.6251952648162842, + "reward_std": 0.4652217924594879, + "rewards/execution_accuracy_EX/mean": 0.60546875, + "rewards/execution_accuracy_EX/std": 0.48970720171928406, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9839234352111816, + "sampling/importance_sampling_ratio/min": 0.01440563052892685, + "sampling/sampling_logp_difference/max": 4.24013614654541, + "sampling/sampling_logp_difference/mean": 0.13801856338977814, + "step": 252 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 739.0, + "completions/max_terminated_length": 739.0, + "completions/mean_length": 334.0390625, + "completions/mean_terminated_length": 334.0390625, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.16055668145418167, + "epoch": 0.44778761061946903, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16926424042417285, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 123986230.0, + "reward": 0.625195324420929, + "reward_std": 0.4652218222618103, + "rewards/execution_accuracy_EX/mean": 0.60546875, + "rewards/execution_accuracy_EX/std": 0.48970720171928406, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.984278678894043, + "sampling/importance_sampling_ratio/min": 0.011232423596084118, + "sampling/sampling_logp_difference/max": 4.488950729370117, + "sampling/sampling_logp_difference/mean": 0.1337374448776245, + "step": 253 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 307.45703125, + "completions/mean_terminated_length": 307.45703125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.16157903522253036, + "epoch": 0.4495575221238938, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26307459078243706, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 124512651.0, + "reward": 0.6771484613418579, + "reward_std": 0.450854629278183, + "rewards/execution_accuracy_EX/mean": 0.66015625, + "rewards/execution_accuracy_EX/std": 0.47458380460739136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9835008382797241, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.13886550068855286, + "step": 254 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 460.0, + "completions/max_terminated_length": 460.0, + "completions/mean_length": 301.69140625, + "completions/mean_terminated_length": 301.69140625, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.14815949089825153, + "epoch": 0.45132743362831856, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13015455893654557, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 124877356.0, + "reward": 0.5101562738418579, + "reward_std": 0.4756980240345001, + "rewards/execution_accuracy_EX/mean": 0.484375, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.983404278755188, + "sampling/importance_sampling_ratio/min": 0.006765482947230339, + "sampling/sampling_logp_difference/max": 4.995921611785889, + "sampling/sampling_logp_difference/mean": 0.1309133768081665, + "step": 255 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 824.0, + "completions/max_terminated_length": 824.0, + "completions/mean_length": 322.87109375, + "completions/mean_terminated_length": 322.87109375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.16214213706552982, + "epoch": 0.45309734513274336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32262561716499366, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 125303627.0, + "reward": 0.5064452886581421, + "reward_std": 0.4755672216415405, + "rewards/execution_accuracy_EX/mean": 0.48046875, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9820327162742615, + "sampling/importance_sampling_ratio/min": 0.01214428897947073, + "sampling/sampling_logp_difference/max": 4.410896301269531, + "sampling/sampling_logp_difference/mean": 0.14292088150978088, + "step": 256 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 694.0, + "completions/max_terminated_length": 694.0, + "completions/mean_length": 300.06640625, + "completions/mean_terminated_length": 300.06640625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.1590729933232069, + "epoch": 0.45486725663716815, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2181450860924923, + "learning_rate": 1e-06, + "loss": 0.0035, + "num_tokens": 125907052.0, + "reward": 0.7253906726837158, + "reward_std": 0.4315042495727539, + "rewards/execution_accuracy_EX/mean": 0.7109375, + "rewards/execution_accuracy_EX/std": 0.45421501994132996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9819579124450684, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.13893938064575195, + "step": 257 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 323.55859375, + "completions/mean_terminated_length": 323.55859375, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.15674197487533092, + "epoch": 0.45663716814159294, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3799954308113515, + "learning_rate": 1e-06, + "loss": 0.0087, + "num_tokens": 126344987.0, + "reward": 0.606640636920929, + "reward_std": 0.46884801983833313, + "rewards/execution_accuracy_EX/mean": 0.5859375, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9868935942649841, + "sampling/importance_sampling_ratio/min": 0.008657840080559254, + "sampling/sampling_logp_difference/max": 4.7492899894714355, + "sampling/sampling_logp_difference/mean": 0.13239502906799316, + "step": 258 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 490.0, + "completions/max_terminated_length": 490.0, + "completions/mean_length": 315.70703125, + "completions/mean_terminated_length": 315.70703125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.15376710519194603, + "epoch": 0.4584070796460177, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08317564004118783, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 126751984.0, + "reward": 0.6400390267372131, + "reward_std": 0.4617617428302765, + "rewards/execution_accuracy_EX/mean": 0.62109375, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9747483730316162, + "sampling/importance_sampling_ratio/mean": 0.9853553175926208, + "sampling/importance_sampling_ratio/min": 0.006758376490324736, + "sampling/sampling_logp_difference/max": 4.996972560882568, + "sampling/sampling_logp_difference/mean": 0.1312606930732727, + "step": 259 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 720.0, + "completions/max_terminated_length": 720.0, + "completions/mean_length": 361.0859375, + "completions/mean_terminated_length": 361.0859375, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "entropy": 0.17142193764448166, + "epoch": 0.46017699115044247, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39186474780294517, + "learning_rate": 1e-06, + "loss": 0.002, + "num_tokens": 127317318.0, + "reward": 0.5621093511581421, + "reward_std": 0.47447583079338074, + "rewards/execution_accuracy_EX/mean": 0.5390625, + "rewards/execution_accuracy_EX/std": 0.4994482398033142, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9859663248062134, + "sampling/importance_sampling_ratio/min": 0.008855233900249004, + "sampling/sampling_logp_difference/max": 4.726746559143066, + "sampling/sampling_logp_difference/mean": 0.139002725481987, + "step": 260 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 556.0, + "completions/max_terminated_length": 556.0, + "completions/mean_length": 319.04296875, + "completions/mean_terminated_length": 319.04296875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.15111440606415272, + "epoch": 0.46194690265486726, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1436068113243909, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 127721953.0, + "reward": 0.7513672113418579, + "reward_std": 0.4184097647666931, + "rewards/execution_accuracy_EX/mean": 0.73828125, + "rewards/execution_accuracy_EX/std": 0.4404313564300537, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9850014448165894, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.13237908482551575, + "step": 261 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.0, + "completions/max_terminated_length": 623.0, + "completions/mean_length": 313.9921875, + "completions/mean_terminated_length": 313.9921875, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.1543640997260809, + "epoch": 0.46371681415929206, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30275091627194683, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 128086383.0, + "reward": 0.755078136920929, + "reward_std": 0.41637277603149414, + "rewards/execution_accuracy_EX/mean": 0.7421875, + "rewards/execution_accuracy_EX/std": 0.4382871091365814, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9859907627105713, + "sampling/importance_sampling_ratio/mean": 0.9828374981880188, + "sampling/importance_sampling_ratio/min": 0.005257649812847376, + "sampling/sampling_logp_difference/max": 5.248071193695068, + "sampling/sampling_logp_difference/mean": 0.1326378881931305, + "step": 262 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 782.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 335.09375, + "completions/mean_terminated_length": 335.09375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.16963398829102516, + "epoch": 0.4654867256637168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18390525599939167, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 128510423.0, + "reward": 0.532421886920929, + "reward_std": 0.47587236762046814, + "rewards/execution_accuracy_EX/mean": 0.5078125, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9863378405570984, + "sampling/importance_sampling_ratio/min": 0.014339092187583447, + "sampling/sampling_logp_difference/max": 4.244765758514404, + "sampling/sampling_logp_difference/mean": 0.14008203148841858, + "step": 263 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 449.0, + "completions/max_terminated_length": 449.0, + "completions/mean_length": 274.6484375, + "completions/mean_terminated_length": 274.6484375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "entropy": 0.15089268796145916, + "epoch": 0.4672566371681416, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2774874574360264, + "learning_rate": 1e-06, + "loss": 0.0025, + "num_tokens": 129029341.0, + "reward": 0.7328125238418579, + "reward_std": 0.4279654622077942, + "rewards/execution_accuracy_EX/mean": 0.71875, + "rewards/execution_accuracy_EX/std": 0.45048993825912476, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9828546047210693, + "sampling/importance_sampling_ratio/min": 0.01623484492301941, + "sampling/sampling_logp_difference/max": 4.120595455169678, + "sampling/sampling_logp_difference/mean": 0.13830263912677765, + "step": 264 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 336.5625, + "completions/mean_terminated_length": 336.5625, + "completions/min_length": 201.0, + "completions/min_terminated_length": 201.0, + "entropy": 0.15643581748008728, + "epoch": 0.4690265486725664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26534418305967483, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 129427373.0, + "reward": 0.7291015386581421, + "reward_std": 0.4297545552253723, + "rewards/execution_accuracy_EX/mean": 0.71484375, + "rewards/execution_accuracy_EX/std": 0.4523732364177704, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9860031008720398, + "sampling/importance_sampling_ratio/min": 0.004114307928830385, + "sampling/sampling_logp_difference/max": 5.493284702301025, + "sampling/sampling_logp_difference/mean": 0.13032007217407227, + "step": 265 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 635.0, + "completions/max_terminated_length": 635.0, + "completions/mean_length": 349.44140625, + "completions/mean_terminated_length": 349.44140625, + "completions/min_length": 216.0, + "completions/min_terminated_length": 216.0, + "entropy": 0.1617734730243683, + "epoch": 0.47079646017699117, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30356793750029853, + "learning_rate": 1e-06, + "loss": 0.0127, + "num_tokens": 129892142.0, + "reward": 0.666015625, + "reward_std": 0.4544737935066223, + "rewards/execution_accuracy_EX/mean": 0.6484375, + "rewards/execution_accuracy_EX/std": 0.47839346528053284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9848423004150391, + "sampling/importance_sampling_ratio/min": 0.01123324315994978, + "sampling/sampling_logp_difference/max": 4.488877773284912, + "sampling/sampling_logp_difference/mean": 0.13442406058311462, + "step": 266 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 357.18359375, + "completions/mean_terminated_length": 357.18359375, + "completions/min_length": 241.0, + "completions/min_terminated_length": 241.0, + "entropy": 0.1611873358488083, + "epoch": 0.4725663716814159, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28912781415223693, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 130367613.0, + "reward": 0.6400390863418579, + "reward_std": 0.4617617428302765, + "rewards/execution_accuracy_EX/mean": 0.62109375, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.961456060409546, + "sampling/importance_sampling_ratio/mean": 0.9876816272735596, + "sampling/importance_sampling_ratio/min": 0.011342701502144337, + "sampling/sampling_logp_difference/max": 4.479180812835693, + "sampling/sampling_logp_difference/mean": 0.13081085681915283, + "step": 267 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 709.0, + "completions/max_terminated_length": 709.0, + "completions/mean_length": 335.24609375, + "completions/mean_terminated_length": 335.24609375, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "entropy": 0.15762670896947384, + "epoch": 0.4743362831858407, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 130714908.0, + "reward": 0.5843749642372131, + "reward_std": 0.47219759225845337, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9845083951950073, + "sampling/importance_sampling_ratio/min": 0.003295636037364602, + "sampling/sampling_logp_difference/max": 5.715156078338623, + "sampling/sampling_logp_difference/mean": 0.1356452852487564, + "step": 268 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 526.0, + "completions/max_terminated_length": 526.0, + "completions/mean_length": 318.171875, + "completions/mean_terminated_length": 318.171875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.16317665576934814, + "epoch": 0.4761061946902655, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21386895021090507, + "learning_rate": 1e-06, + "loss": 0.0013, + "num_tokens": 131188024.0, + "reward": 0.6400390863418579, + "reward_std": 0.4617617428302765, + "rewards/execution_accuracy_EX/mean": 0.62109375, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9177110195159912, + "sampling/importance_sampling_ratio/mean": 0.9831749200820923, + "sampling/importance_sampling_ratio/min": 0.0028142144437879324, + "sampling/sampling_logp_difference/max": 5.873072147369385, + "sampling/sampling_logp_difference/mean": 0.14006644487380981, + "step": 269 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 600.0, + "completions/max_terminated_length": 600.0, + "completions/mean_length": 318.9921875, + "completions/mean_terminated_length": 318.9921875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.15535980463027954, + "epoch": 0.4778761061946903, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42667060502469817, + "learning_rate": 1e-06, + "loss": 0.0048, + "num_tokens": 131626358.0, + "reward": 0.7105468511581421, + "reward_std": 0.43811774253845215, + "rewards/execution_accuracy_EX/mean": 0.6953125, + "rewards/execution_accuracy_EX/std": 0.4611765742301941, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9823930263519287, + "sampling/importance_sampling_ratio/min": 0.01123306155204773, + "sampling/sampling_logp_difference/max": 4.488893985748291, + "sampling/sampling_logp_difference/mean": 0.1382257342338562, + "step": 270 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 677.0, + "completions/max_terminated_length": 677.0, + "completions/mean_length": 330.00390625, + "completions/mean_terminated_length": 330.00390625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.15658987313508987, + "epoch": 0.479646017699115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0727885528061531, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 132231719.0, + "reward": 0.5843750238418579, + "reward_std": 0.47219759225845337, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9706207513809204, + "sampling/importance_sampling_ratio/mean": 0.9849157333374023, + "sampling/importance_sampling_ratio/min": 0.018390489742159843, + "sampling/sampling_logp_difference/max": 3.9959216117858887, + "sampling/sampling_logp_difference/mean": 0.13399212062358856, + "step": 271 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 745.0, + "completions/max_terminated_length": 745.0, + "completions/mean_length": 329.05859375, + "completions/mean_terminated_length": 329.05859375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.17054855078458786, + "epoch": 0.4814159292035398, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2497833425558903, + "learning_rate": 1e-06, + "loss": -0.0048, + "num_tokens": 132795046.0, + "reward": 0.5101562142372131, + "reward_std": 0.4756980240345001, + "rewards/execution_accuracy_EX/mean": 0.484375, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9843830466270447, + "sampling/importance_sampling_ratio/min": 0.014309653080999851, + "sampling/sampling_logp_difference/max": 4.24682092666626, + "sampling/sampling_logp_difference/mean": 0.1444835662841797, + "step": 272 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 599.0, + "completions/max_terminated_length": 599.0, + "completions/mean_length": 363.8359375, + "completions/mean_terminated_length": 363.8359375, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.16747299954295158, + "epoch": 0.4831858407079646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26470996479696285, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 133427404.0, + "reward": 0.5732421875, + "reward_std": 0.47346949577331543, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9838159084320068, + "sampling/importance_sampling_ratio/min": 0.01124486792832613, + "sampling/sampling_logp_difference/max": 4.4878435134887695, + "sampling/sampling_logp_difference/mean": 0.1417458951473236, + "step": 273 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 631.0, + "completions/max_terminated_length": 631.0, + "completions/mean_length": 325.0390625, + "completions/mean_terminated_length": 325.0390625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.1463457401841879, + "epoch": 0.4849557522123894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11071843423636837, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 133968694.0, + "reward": 0.6957031488418579, + "reward_std": 0.44413506984710693, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9844371676445007, + "sampling/importance_sampling_ratio/min": 0.014294745400547981, + "sampling/sampling_logp_difference/max": 4.247863292694092, + "sampling/sampling_logp_difference/mean": 0.1296854466199875, + "step": 274 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 711.0, + "completions/max_terminated_length": 711.0, + "completions/mean_length": 311.1796875, + "completions/mean_terminated_length": 311.1796875, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.14501554891467094, + "epoch": 0.48672566371681414, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12567377006957567, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 134473476.0, + "reward": 0.6363281011581421, + "reward_std": 0.46267399191856384, + "rewards/execution_accuracy_EX/mean": 0.6171875, + "rewards/execution_accuracy_EX/std": 0.48702529072761536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9830794334411621, + "sampling/importance_sampling_ratio/min": 0.014313235878944397, + "sampling/sampling_logp_difference/max": 4.246570587158203, + "sampling/sampling_logp_difference/mean": 0.1320023238658905, + "step": 275 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 547.0, + "completions/max_terminated_length": 547.0, + "completions/mean_length": 321.14453125, + "completions/mean_terminated_length": 321.14453125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.14876681938767433, + "epoch": 0.48849557522123893, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20503261543219914, + "learning_rate": 1e-06, + "loss": -0.0025, + "num_tokens": 134942761.0, + "reward": 0.5732421875, + "reward_std": 0.47346949577331543, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9824311137199402, + "sampling/importance_sampling_ratio/min": 0.0143876438960433, + "sampling/sampling_logp_difference/max": 4.241385459899902, + "sampling/sampling_logp_difference/mean": 0.13451460003852844, + "step": 276 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 480.0, + "completions/max_terminated_length": 480.0, + "completions/mean_length": 300.83203125, + "completions/mean_terminated_length": 300.83203125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.14324429631233215, + "epoch": 0.4902654867256637, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22762235222323923, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 135427998.0, + "reward": 0.740234375, + "reward_std": 0.4242667853832245, + "rewards/execution_accuracy_EX/mean": 0.7265625, + "rewards/execution_accuracy_EX/std": 0.446596622467041, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.978734016418457, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.14036285877227783, + "step": 277 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 621.0, + "completions/max_terminated_length": 621.0, + "completions/mean_length": 339.51953125, + "completions/mean_terminated_length": 339.51953125, + "completions/min_length": 242.0, + "completions/min_terminated_length": 242.0, + "entropy": 0.13792935758829117, + "epoch": 0.4920353982300885, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22668747679870122, + "learning_rate": 1e-06, + "loss": -0.0026, + "num_tokens": 136001715.0, + "reward": 0.6771484613418579, + "reward_std": 0.450854629278183, + "rewards/execution_accuracy_EX/mean": 0.66015625, + "rewards/execution_accuracy_EX/std": 0.47458380460739136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9820734262466431, + "sampling/importance_sampling_ratio/min": 0.008748387917876244, + "sampling/sampling_logp_difference/max": 4.738885879516602, + "sampling/sampling_logp_difference/mean": 0.12939125299453735, + "step": 278 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 312.015625, + "completions/mean_terminated_length": 312.015625, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.1376123521476984, + "epoch": 0.49380530973451325, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2882416970979156, + "learning_rate": 1e-06, + "loss": -0.0083, + "num_tokens": 136525559.0, + "reward": 0.6808593273162842, + "reward_std": 0.44958025217056274, + "rewards/execution_accuracy_EX/mean": 0.6640625, + "rewards/execution_accuracy_EX/std": 0.4732423722743988, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9782224297523499, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.13522344827651978, + "step": 279 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 839.0, + "completions/max_terminated_length": 839.0, + "completions/mean_length": 338.38671875, + "completions/mean_terminated_length": 338.38671875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.1472346018999815, + "epoch": 0.49557522123893805, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32968065198006247, + "learning_rate": 1e-06, + "loss": 0.0157, + "num_tokens": 137101626.0, + "reward": 0.740234375, + "reward_std": 0.4242667853832245, + "rewards/execution_accuracy_EX/mean": 0.7265625, + "rewards/execution_accuracy_EX/std": 0.446596622467041, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.984591007232666, + "sampling/importance_sampling_ratio/min": 0.011125885881483555, + "sampling/sampling_logp_difference/max": 4.498480796813965, + "sampling/sampling_logp_difference/mean": 0.13329678773880005, + "step": 280 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 461.0, + "completions/max_terminated_length": 461.0, + "completions/mean_length": 288.8046875, + "completions/mean_terminated_length": 288.8046875, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.13036941178143024, + "epoch": 0.49734513274336284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2993157126682758, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 137503208.0, + "reward": 0.725390613079071, + "reward_std": 0.4315042495727539, + "rewards/execution_accuracy_EX/mean": 0.7109375, + "rewards/execution_accuracy_EX/std": 0.45421501994132996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9811557531356812, + "sampling/importance_sampling_ratio/min": 0.008661936968564987, + "sampling/sampling_logp_difference/max": 4.748816967010498, + "sampling/sampling_logp_difference/mean": 0.12963128089904785, + "step": 281 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 308.6953125, + "completions/mean_terminated_length": 308.6953125, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.13289682008326054, + "epoch": 0.49911504424778763, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 137919930.0, + "reward": 0.703125, + "reward_std": 0.44119933247566223, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9788047075271606, + "sampling/importance_sampling_ratio/min": 0.01113810669630766, + "sampling/sampling_logp_difference/max": 4.497383117675781, + "sampling/sampling_logp_difference/mean": 0.1310933232307434, + "step": 282 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 625.0, + "completions/max_terminated_length": 625.0, + "completions/mean_length": 334.78125, + "completions/mean_terminated_length": 334.78125, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.1308655822649598, + "epoch": 0.5008849557522124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28872882366535085, + "learning_rate": 1e-06, + "loss": -0.004, + "num_tokens": 138393138.0, + "reward": 0.5249999761581421, + "reward_std": 0.47593045234680176, + "rewards/execution_accuracy_EX/mean": 0.5, + "rewards/execution_accuracy_EX/std": 0.5009794235229492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9815727472305298, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.12588366866111755, + "step": 283 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 553.0, + "completions/max_terminated_length": 553.0, + "completions/mean_length": 339.453125, + "completions/mean_terminated_length": 339.453125, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.1365262884646654, + "epoch": 0.5026548672566372, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22924419395278106, + "learning_rate": 1e-06, + "loss": 0.0032, + "num_tokens": 138840390.0, + "reward": 0.6103515625, + "reward_std": 0.46818408370018005, + "rewards/execution_accuracy_EX/mean": 0.58984375, + "rewards/execution_accuracy_EX/std": 0.49282538890838623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9821449518203735, + "sampling/importance_sampling_ratio/min": 0.014339229092001915, + "sampling/sampling_logp_difference/max": 4.24475622177124, + "sampling/sampling_logp_difference/mean": 0.1286645233631134, + "step": 284 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 280.16796875, + "completions/mean_terminated_length": 280.16796875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.12094041053205729, + "epoch": 0.504424778761062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 139351585.0, + "reward": 0.762499988079071, + "reward_std": 0.41216787695884705, + "rewards/execution_accuracy_EX/mean": 0.75, + "rewards/execution_accuracy_EX/std": 0.4338609278202057, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9793006181716919, + "sampling/importance_sampling_ratio/min": 0.011343728750944138, + "sampling/sampling_logp_difference/max": 4.479090213775635, + "sampling/sampling_logp_difference/mean": 0.12439154088497162, + "step": 285 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 546.0, + "completions/max_terminated_length": 546.0, + "completions/mean_length": 297.28125, + "completions/mean_terminated_length": 297.28125, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.12189241219311953, + "epoch": 0.5061946902654867, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06708248641559404, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 139885689.0, + "reward": 0.5806640386581421, + "reward_std": 0.4726512134075165, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9511847496032715, + "sampling/importance_sampling_ratio/mean": 0.9776397943496704, + "sampling/importance_sampling_ratio/min": 0.011178990826010704, + "sampling/sampling_logp_difference/max": 4.493719100952148, + "sampling/sampling_logp_difference/mean": 0.12825796008110046, + "step": 286 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 608.0, + "completions/max_terminated_length": 608.0, + "completions/mean_length": 333.3125, + "completions/mean_terminated_length": 333.3125, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.13882357813417912, + "epoch": 0.5079646017699115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11393450014408663, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 140461433.0, + "reward": 0.5843750238418579, + "reward_std": 0.47219759225845337, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9792859554290771, + "sampling/importance_sampling_ratio/min": 0.011136699467897415, + "sampling/sampling_logp_difference/max": 4.497509479522705, + "sampling/sampling_logp_difference/mean": 0.1329355239868164, + "step": 287 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 451.0, + "completions/max_terminated_length": 451.0, + "completions/mean_length": 304.63671875, + "completions/mean_terminated_length": 304.63671875, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.11912460718303919, + "epoch": 0.5097345132743363, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24897134072012178, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 140816876.0, + "reward": 0.5658203363418579, + "reward_std": 0.4741697609424591, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.996685266494751, + "sampling/importance_sampling_ratio/mean": 0.9798433780670166, + "sampling/importance_sampling_ratio/min": 0.011232483200728893, + "sampling/sampling_logp_difference/max": 4.488945484161377, + "sampling/sampling_logp_difference/mean": 0.12297806143760681, + "step": 288 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 562.0, + "completions/max_terminated_length": 562.0, + "completions/mean_length": 297.28515625, + "completions/mean_terminated_length": 297.28515625, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.13027150463312864, + "epoch": 0.511504424778761, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34148070514391965, + "learning_rate": 1e-06, + "loss": -0.0068, + "num_tokens": 141478917.0, + "reward": 0.7736327648162842, + "reward_std": 0.40552324056625366, + "rewards/execution_accuracy_EX/mean": 0.76171875, + "rewards/execution_accuracy_EX/std": 0.4268665909767151, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9046118259429932, + "sampling/importance_sampling_ratio/mean": 0.9805927276611328, + "sampling/importance_sampling_ratio/min": 0.00674409931525588, + "sampling/sampling_logp_difference/max": 4.999087333679199, + "sampling/sampling_logp_difference/mean": 0.1281258761882782, + "step": 289 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 640.0, + "completions/max_terminated_length": 640.0, + "completions/mean_length": 347.13671875, + "completions/mean_terminated_length": 347.13671875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.1376639176160097, + "epoch": 0.5132743362831859, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18460042307419683, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 142237640.0, + "reward": 0.5138671398162842, + "reward_std": 0.47579970955848694, + "rewards/execution_accuracy_EX/mean": 0.48828125, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9834855794906616, + "sampling/importance_sampling_ratio/min": 0.0007139044464565814, + "sampling/sampling_logp_difference/max": 7.2447614669799805, + "sampling/sampling_logp_difference/mean": 0.12558966875076294, + "step": 290 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 530.0, + "completions/max_terminated_length": 530.0, + "completions/mean_length": 316.39453125, + "completions/mean_terminated_length": 316.39453125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.14952068030834198, + "epoch": 0.5150442477876106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4095413319425253, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 143017469.0, + "reward": 0.40625, + "reward_std": 0.4608176648616791, + "rewards/execution_accuracy_EX/mean": 0.375, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9810730218887329, + "sampling/importance_sampling_ratio/min": 0.0004330030642449856, + "sampling/sampling_logp_difference/max": 7.744765758514404, + "sampling/sampling_logp_difference/mean": 0.1375647485256195, + "step": 291 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 702.0, + "completions/max_terminated_length": 702.0, + "completions/mean_length": 318.98828125, + "completions/mean_terminated_length": 318.98828125, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.1301272250711918, + "epoch": 0.5168141592920354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29732860154924123, + "learning_rate": 1e-06, + "loss": -0.0053, + "num_tokens": 143526138.0, + "reward": 0.762499988079071, + "reward_std": 0.41216787695884705, + "rewards/execution_accuracy_EX/mean": 0.75, + "rewards/execution_accuracy_EX/std": 0.4338609278202057, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9799952507019043, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.12830430269241333, + "step": 292 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 524.0, + "completions/max_terminated_length": 524.0, + "completions/mean_length": 297.74609375, + "completions/mean_terminated_length": 297.74609375, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.13298062421381474, + "epoch": 0.5185840707964602, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2787990158226181, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 144205865.0, + "reward": 0.7365233898162842, + "reward_std": 0.4261363446712494, + "rewards/execution_accuracy_EX/mean": 0.72265625, + "rewards/execution_accuracy_EX/std": 0.4485645890235901, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9447996616363525, + "sampling/importance_sampling_ratio/mean": 0.980080783367157, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.13017098605632782, + "step": 293 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 549.0, + "completions/max_terminated_length": 549.0, + "completions/mean_length": 324.515625, + "completions/mean_terminated_length": 324.515625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.145736213773489, + "epoch": 0.5203539823008849, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22014542190198488, + "learning_rate": 1e-06, + "loss": 0.0017, + "num_tokens": 144691277.0, + "reward": 0.48417967557907104, + "reward_std": 0.4741697609424591, + "rewards/execution_accuracy_EX/mean": 0.45703125, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9811964631080627, + "sampling/importance_sampling_ratio/min": 0.008673182688653469, + "sampling/sampling_logp_difference/max": 4.747519493103027, + "sampling/sampling_logp_difference/mean": 0.1368313431739807, + "step": 294 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 431.0, + "completions/max_terminated_length": 431.0, + "completions/mean_length": 278.1796875, + "completions/mean_terminated_length": 278.1796875, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.11698152963072062, + "epoch": 0.5221238938053098, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31372558641082093, + "learning_rate": 1e-06, + "loss": -0.0026, + "num_tokens": 145100715.0, + "reward": 0.5695312023162842, + "reward_std": 0.4738343358039856, + "rewards/execution_accuracy_EX/mean": 0.546875, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.978211522102356, + "sampling/importance_sampling_ratio/min": 0.006767896935343742, + "sampling/sampling_logp_difference/max": 4.995564937591553, + "sampling/sampling_logp_difference/mean": 0.12563587725162506, + "step": 295 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 617.0, + "completions/max_terminated_length": 617.0, + "completions/mean_length": 303.96875, + "completions/mean_terminated_length": 303.96875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.12474698945879936, + "epoch": 0.5238938053097345, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2729089845891089, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 145483427.0, + "reward": 0.6029296517372131, + "reward_std": 0.46948155760765076, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9778717756271362, + "sampling/importance_sampling_ratio/min": 0.01152029074728489, + "sampling/sampling_logp_difference/max": 4.4636454582214355, + "sampling/sampling_logp_difference/mean": 0.1309039443731308, + "step": 296 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 285.16796875, + "completions/mean_terminated_length": 285.16796875, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.1238878509029746, + "epoch": 0.5256637168141592, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2991484627446501, + "learning_rate": 1e-06, + "loss": 0.0155, + "num_tokens": 146028926.0, + "reward": 0.632617175579071, + "reward_std": 0.46355465054512024, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9792416095733643, + "sampling/importance_sampling_ratio/min": 0.00682168360799551, + "sampling/sampling_logp_difference/max": 4.987648963928223, + "sampling/sampling_logp_difference/mean": 0.1251850724220276, + "step": 297 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 657.0, + "completions/max_terminated_length": 657.0, + "completions/mean_length": 273.83984375, + "completions/mean_terminated_length": 273.83984375, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.12272048555314541, + "epoch": 0.5274336283185841, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24746918048771818, + "learning_rate": 1e-06, + "loss": 0.0026, + "num_tokens": 146340501.0, + "reward": 0.42851561307907104, + "reward_std": 0.46600863337516785, + "rewards/execution_accuracy_EX/mean": 0.3984375, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7686686515808105, + "sampling/importance_sampling_ratio/mean": 0.9759113788604736, + "sampling/importance_sampling_ratio/min": 0.008663984946906567, + "sampling/sampling_logp_difference/max": 4.748580455780029, + "sampling/sampling_logp_difference/mean": 0.13672521710395813, + "step": 298 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 523.0, + "completions/max_terminated_length": 523.0, + "completions/mean_length": 291.1484375, + "completions/mean_terminated_length": 291.1484375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.12394737359136343, + "epoch": 0.5292035398230088, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23592129131052397, + "learning_rate": 1e-06, + "loss": 0.0033, + "num_tokens": 146827131.0, + "reward": 0.539843738079071, + "reward_std": 0.4756980240345001, + "rewards/execution_accuracy_EX/mean": 0.515625, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9780433177947998, + "sampling/importance_sampling_ratio/min": 0.011163590475916862, + "sampling/sampling_logp_difference/max": 4.495097637176514, + "sampling/sampling_logp_difference/mean": 0.13204070925712585, + "step": 299 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 507.0, + "completions/max_terminated_length": 507.0, + "completions/mean_length": 283.06640625, + "completions/mean_terminated_length": 283.06640625, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.10977283492684364, + "epoch": 0.5309734513274337, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39569112200650275, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 147205932.0, + "reward": 0.6214843988418579, + "reward_std": 0.46600863337516785, + "rewards/execution_accuracy_EX/mean": 0.6015625, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9754796028137207, + "sampling/importance_sampling_ratio/min": 0.004210295621305704, + "sampling/sampling_logp_difference/max": 5.470222473144531, + "sampling/sampling_logp_difference/mean": 0.12663070857524872, + "step": 300 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 612.0, + "completions/max_terminated_length": 612.0, + "completions/mean_length": 293.30078125, + "completions/mean_terminated_length": 293.30078125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.13259944692254066, + "epoch": 0.5327433628318584, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23623998480101524, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 147868361.0, + "reward": 0.5806640386581421, + "reward_std": 0.4726512134075165, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9826499223709106, + "sampling/importance_sampling_ratio/min": 0.009238926693797112, + "sampling/sampling_logp_difference/max": 4.684329509735107, + "sampling/sampling_logp_difference/mean": 0.12703940272331238, + "step": 301 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 421.0, + "completions/max_terminated_length": 421.0, + "completions/mean_length": 283.91796875, + "completions/mean_terminated_length": 283.91796875, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.1126785958185792, + "epoch": 0.5345132743362832, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25116616262053426, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 148352836.0, + "reward": 0.614062488079071, + "reward_std": 0.46748965978622437, + "rewards/execution_accuracy_EX/mean": 0.59375, + "rewards/execution_accuracy_EX/std": 0.49209436774253845, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9756138920783997, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.12942509353160858, + "step": 302 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 491.0, + "completions/max_terminated_length": 491.0, + "completions/mean_length": 291.55859375, + "completions/mean_terminated_length": 291.55859375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.12431715428829193, + "epoch": 0.536283185840708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3050077209973281, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 148898403.0, + "reward": 0.558398425579071, + "reward_std": 0.4747525453567505, + "rewards/execution_accuracy_EX/mean": 0.53515625, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9719961881637573, + "sampling/importance_sampling_ratio/mean": 0.9764449596405029, + "sampling/importance_sampling_ratio/min": 0.0024954539258033037, + "sampling/sampling_logp_difference/max": 5.993284702301025, + "sampling/sampling_logp_difference/mean": 0.13283288478851318, + "step": 303 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 446.0, + "completions/max_terminated_length": 446.0, + "completions/mean_length": 287.00390625, + "completions/mean_terminated_length": 287.00390625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "entropy": 0.1052580252289772, + "epoch": 0.5380530973451327, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2822301487141552, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 149474948.0, + "reward": 0.7365233898162842, + "reward_std": 0.426136314868927, + "rewards/execution_accuracy_EX/mean": 0.72265625, + "rewards/execution_accuracy_EX/std": 0.4485645890235901, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9817387461662292, + "sampling/importance_sampling_ratio/min": 0.0031865073833614588, + "sampling/sampling_logp_difference/max": 5.7488298416137695, + "sampling/sampling_logp_difference/mean": 0.11145463585853577, + "step": 304 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 376.0, + "completions/max_terminated_length": 376.0, + "completions/mean_length": 273.0703125, + "completions/mean_terminated_length": 273.0703125, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.0955903958529234, + "epoch": 0.5398230088495575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.335108241926158, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 149883126.0, + "reward": 0.6734374761581421, + "reward_std": 0.45209479331970215, + "rewards/execution_accuracy_EX/mean": 0.65625, + "rewards/execution_accuracy_EX/std": 0.47588926553726196, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8455147743225098, + "sampling/importance_sampling_ratio/mean": 0.9789060354232788, + "sampling/importance_sampling_ratio/min": 0.004095620010048151, + "sampling/sampling_logp_difference/max": 5.497837066650391, + "sampling/sampling_logp_difference/mean": 0.11343404650688171, + "step": 305 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 576.0, + "completions/max_terminated_length": 576.0, + "completions/mean_length": 263.23828125, + "completions/mean_terminated_length": 263.23828125, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.10447422694414854, + "epoch": 0.5415929203539823, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3523130921969951, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 150463939.0, + "reward": 0.7587890625, + "reward_std": 0.4142923355102539, + "rewards/execution_accuracy_EX/mean": 0.74609375, + "rewards/execution_accuracy_EX/std": 0.4360972046852112, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9138906002044678, + "sampling/importance_sampling_ratio/mean": 0.9792771935462952, + "sampling/importance_sampling_ratio/min": 0.004135684575885534, + "sampling/sampling_logp_difference/max": 5.488102436065674, + "sampling/sampling_logp_difference/mean": 0.11896039545536041, + "step": 306 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 616.0, + "completions/max_terminated_length": 616.0, + "completions/mean_length": 282.62890625, + "completions/mean_terminated_length": 282.62890625, + "completions/min_length": 177.0, + "completions/min_terminated_length": 177.0, + "entropy": 0.11410106346011162, + "epoch": 0.5433628318584071, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3785351339284642, + "learning_rate": 1e-06, + "loss": -0.0069, + "num_tokens": 151005332.0, + "reward": 0.77734375, + "reward_std": 0.4032154679298401, + "rewards/execution_accuracy_EX/mean": 0.765625, + "rewards/execution_accuracy_EX/std": 0.42443734407424927, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9782164096832275, + "sampling/importance_sampling_ratio/min": 0.008668584749102592, + "sampling/sampling_logp_difference/max": 4.748049736022949, + "sampling/sampling_logp_difference/mean": 0.12442274391651154, + "step": 307 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 487.0, + "completions/max_terminated_length": 487.0, + "completions/mean_length": 302.93359375, + "completions/mean_terminated_length": 302.93359375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.10900202672928572, + "epoch": 0.5451327433628319, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2965077105083676, + "learning_rate": 1e-06, + "loss": -0.0026, + "num_tokens": 151567075.0, + "reward": 0.7476562261581421, + "reward_std": 0.4204040467739105, + "rewards/execution_accuracy_EX/mean": 0.734375, + "rewards/execution_accuracy_EX/std": 0.4425306022167206, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9635331630706787, + "sampling/importance_sampling_ratio/mean": 0.9799326658248901, + "sampling/importance_sampling_ratio/min": 0.013507682830095291, + "sampling/sampling_logp_difference/max": 4.304496765136719, + "sampling/sampling_logp_difference/mean": 0.1177440732717514, + "step": 308 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 648.0, + "completions/max_terminated_length": 648.0, + "completions/mean_length": 297.703125, + "completions/mean_terminated_length": 297.703125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.10871629603207111, + "epoch": 0.5469026548672566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1966184489579502, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 151994919.0, + "reward": 0.5101562142372131, + "reward_std": 0.47569799423217773, + "rewards/execution_accuracy_EX/mean": 0.484375, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9745786190032959, + "sampling/importance_sampling_ratio/min": 0.003188925562426448, + "sampling/sampling_logp_difference/max": 5.748071193695068, + "sampling/sampling_logp_difference/mean": 0.1306045800447464, + "step": 309 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 278.81640625, + "completions/mean_terminated_length": 278.81640625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.09989846311509609, + "epoch": 0.5486725663716814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28743303121757885, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 152490808.0, + "reward": 0.632617175579071, + "reward_std": 0.46355465054512024, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9809783697128296, + "sampling/importance_sampling_ratio/mean": 0.9790998697280884, + "sampling/importance_sampling_ratio/min": 0.008679677732288837, + "sampling/sampling_logp_difference/max": 4.746770858764648, + "sampling/sampling_logp_difference/mean": 0.11842896044254303, + "step": 310 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 448.0, + "completions/max_terminated_length": 448.0, + "completions/mean_length": 264.90234375, + "completions/mean_terminated_length": 264.90234375, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.09828067570924759, + "epoch": 0.5504424778761062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17912085939098438, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 152934223.0, + "reward": 0.6808593273162842, + "reward_std": 0.44958025217056274, + "rewards/execution_accuracy_EX/mean": 0.6640625, + "rewards/execution_accuracy_EX/std": 0.4732423722743988, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8792515993118286, + "sampling/importance_sampling_ratio/mean": 0.9768507480621338, + "sampling/importance_sampling_ratio/min": 0.00525788776576519, + "sampling/sampling_logp_difference/max": 5.248025894165039, + "sampling/sampling_logp_difference/mean": 0.12117663770914078, + "step": 311 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 545.0, + "completions/max_terminated_length": 545.0, + "completions/mean_length": 272.1953125, + "completions/mean_terminated_length": 272.1953125, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.09398173447698355, + "epoch": 0.552212389380531, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.254011635424865, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 153340865.0, + "reward": 0.7142578363418579, + "reward_std": 0.4365212619304657, + "rewards/execution_accuracy_EX/mean": 0.69921875, + "rewards/execution_accuracy_EX/std": 0.45949608087539673, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8599828481674194, + "sampling/importance_sampling_ratio/mean": 0.9765916466712952, + "sampling/importance_sampling_ratio/min": 0.006748223211616278, + "sampling/sampling_logp_difference/max": 4.998476028442383, + "sampling/sampling_logp_difference/mean": 0.11689199507236481, + "step": 312 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 263.62890625, + "completions/mean_terminated_length": 263.62890625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.09625769220292568, + "epoch": 0.5539823008849557, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20811602609326846, + "learning_rate": 1e-06, + "loss": -0.0044, + "num_tokens": 153646226.0, + "reward": 0.6771484613418579, + "reward_std": 0.450854629278183, + "rewards/execution_accuracy_EX/mean": 0.66015625, + "rewards/execution_accuracy_EX/std": 0.47458380460739136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8367109298706055, + "sampling/importance_sampling_ratio/mean": 0.9744427800178528, + "sampling/importance_sampling_ratio/min": 0.0052512455731630325, + "sampling/sampling_logp_difference/max": 5.2492899894714355, + "sampling/sampling_logp_difference/mean": 0.12279993295669556, + "step": 313 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 685.0, + "completions/max_terminated_length": 685.0, + "completions/mean_length": 258.38671875, + "completions/mean_terminated_length": 258.38671875, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.09994737897068262, + "epoch": 0.5557522123893806, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31314542087485775, + "learning_rate": 1e-06, + "loss": -0.0074, + "num_tokens": 154093365.0, + "reward": 0.5806640386581421, + "reward_std": 0.4726512134075165, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9763374328613281, + "sampling/importance_sampling_ratio/min": 0.005260583944618702, + "sampling/sampling_logp_difference/max": 5.247513294219971, + "sampling/sampling_logp_difference/mean": 0.1211942732334137, + "step": 314 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 464.0, + "completions/max_terminated_length": 464.0, + "completions/mean_length": 240.60546875, + "completions/mean_terminated_length": 240.60546875, + "completions/min_length": 142.0, + "completions/min_terminated_length": 142.0, + "entropy": 0.09025467094033957, + "epoch": 0.5575221238938053, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31219309775486603, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 154605888.0, + "reward": 0.6734374761581421, + "reward_std": 0.45209482312202454, + "rewards/execution_accuracy_EX/mean": 0.65625, + "rewards/execution_accuracy_EX/std": 0.47588926553726196, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8792815208435059, + "sampling/importance_sampling_ratio/mean": 0.9757258892059326, + "sampling/importance_sampling_ratio/min": 0.0035351368132978678, + "sampling/sampling_logp_difference/max": 5.645003318786621, + "sampling/sampling_logp_difference/mean": 0.12007880210876465, + "step": 315 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 509.0, + "completions/max_terminated_length": 509.0, + "completions/mean_length": 250.88671875, + "completions/mean_terminated_length": 250.88671875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.10382918268442154, + "epoch": 0.5592920353982301, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4815627276678539, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 155197251.0, + "reward": 0.6734374761581421, + "reward_std": 0.45209482312202454, + "rewards/execution_accuracy_EX/mean": 0.65625, + "rewards/execution_accuracy_EX/std": 0.47588926553726196, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9763296246528625, + "sampling/importance_sampling_ratio/min": 0.011155710555613041, + "sampling/sampling_logp_difference/max": 4.4958038330078125, + "sampling/sampling_logp_difference/mean": 0.12063725292682648, + "step": 316 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 245.35546875, + "completions/mean_terminated_length": 245.35546875, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.09619526844471693, + "epoch": 0.5610619469026549, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4402110009806778, + "learning_rate": 1e-06, + "loss": 0.0071, + "num_tokens": 155720206.0, + "reward": 0.688281238079071, + "reward_std": 0.44692784547805786, + "rewards/execution_accuracy_EX/mean": 0.671875, + "rewards/execution_accuracy_EX/std": 0.47045037150382996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9741077423095703, + "sampling/importance_sampling_ratio/min": 0.004092916380614042, + "sampling/sampling_logp_difference/max": 5.498497486114502, + "sampling/sampling_logp_difference/mean": 0.125055730342865, + "step": 317 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 564.0, + "completions/max_terminated_length": 564.0, + "completions/mean_length": 250.2265625, + "completions/mean_terminated_length": 250.2265625, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.10002064984291792, + "epoch": 0.5628318584070796, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2587463217461931, + "learning_rate": 1e-06, + "loss": 0.0018, + "num_tokens": 156195000.0, + "reward": 0.5843749642372131, + "reward_std": 0.47219762206077576, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.97621750831604, + "sampling/importance_sampling_ratio/min": 0.011119124479591846, + "sampling/sampling_logp_difference/max": 4.499088764190674, + "sampling/sampling_logp_difference/mean": 0.11931677907705307, + "step": 318 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 555.0, + "completions/max_terminated_length": 555.0, + "completions/mean_length": 269.61328125, + "completions/mean_terminated_length": 269.61328125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.10486160963773727, + "epoch": 0.5646017699115045, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2857999286716648, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 156651493.0, + "reward": 0.4916015565395355, + "reward_std": 0.4747525453567505, + "rewards/execution_accuracy_EX/mean": 0.46484375, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9782221913337708, + "sampling/importance_sampling_ratio/min": 0.004096901509910822, + "sampling/sampling_logp_difference/max": 5.497524261474609, + "sampling/sampling_logp_difference/mean": 0.11965882033109665, + "step": 319 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 382.0, + "completions/max_terminated_length": 382.0, + "completions/mean_length": 219.46484375, + "completions/mean_terminated_length": 219.46484375, + "completions/min_length": 105.0, + "completions/min_terminated_length": 105.0, + "entropy": 0.07706360146403313, + "epoch": 0.5663716814159292, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7846440050613834, + "learning_rate": 1e-06, + "loss": -0.0144, + "num_tokens": 157122444.0, + "reward": 0.9332031011581421, + "reward_std": 0.243365079164505, + "rewards/execution_accuracy_EX/mean": 0.9296875, + "rewards/execution_accuracy_EX/std": 0.2561737895011902, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8161041736602783, + "sampling/importance_sampling_ratio/mean": 0.9782350063323975, + "sampling/importance_sampling_ratio/min": 0.006755828391760588, + "sampling/sampling_logp_difference/max": 4.997349739074707, + "sampling/sampling_logp_difference/mean": 0.10431890189647675, + "step": 320 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 560.0, + "completions/max_terminated_length": 560.0, + "completions/mean_length": 270.3203125, + "completions/mean_terminated_length": 270.3203125, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.09842854365706444, + "epoch": 0.5681415929203539, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25287181076132914, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 157775678.0, + "reward": 0.49531251192092896, + "reward_std": 0.47499996423721313, + "rewards/execution_accuracy_EX/mean": 0.46875, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9794289469718933, + "sampling/importance_sampling_ratio/min": 0.011159423738718033, + "sampling/sampling_logp_difference/max": 4.495471000671387, + "sampling/sampling_logp_difference/mean": 0.10976652801036835, + "step": 321 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 406.0, + "completions/max_terminated_length": 406.0, + "completions/mean_length": 239.75390625, + "completions/mean_terminated_length": 239.75390625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.0821539806202054, + "epoch": 0.5699115044247788, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5025369058197589, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 158274831.0, + "reward": 0.591796875, + "reward_std": 0.471201092004776, + "rewards/execution_accuracy_EX/mean": 0.5703125, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7514076232910156, + "sampling/importance_sampling_ratio/mean": 0.9740892648696899, + "sampling/importance_sampling_ratio/min": 0.008661828935146332, + "sampling/sampling_logp_difference/max": 4.748829364776611, + "sampling/sampling_logp_difference/mean": 0.1122419685125351, + "step": 322 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 589.0, + "completions/max_terminated_length": 589.0, + "completions/mean_length": 250.7109375, + "completions/mean_terminated_length": 250.7109375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.10371797811239958, + "epoch": 0.5716814159292035, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.45759429321599143, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 158743541.0, + "reward": 0.706835925579071, + "reward_std": 0.43967700004577637, + "rewards/execution_accuracy_EX/mean": 0.69140625, + "rewards/execution_accuracy_EX/std": 0.46281787753105164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9778295755386353, + "sampling/importance_sampling_ratio/min": 0.007033617235720158, + "sampling/sampling_logp_difference/max": 4.957054138183594, + "sampling/sampling_logp_difference/mean": 0.12171629071235657, + "step": 323 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 729.0, + "completions/max_terminated_length": 729.0, + "completions/mean_length": 259.421875, + "completions/mean_terminated_length": 259.421875, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.10069741401821375, + "epoch": 0.5734513274336284, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42692721403591716, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 159162193.0, + "reward": 0.7291015386581421, + "reward_std": 0.4297545552253723, + "rewards/execution_accuracy_EX/mean": 0.71484375, + "rewards/execution_accuracy_EX/std": 0.4523732364177704, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8864747285842896, + "sampling/importance_sampling_ratio/mean": 0.9758027791976929, + "sampling/importance_sampling_ratio/min": 0.004090497270226479, + "sampling/sampling_logp_difference/max": 5.499088764190674, + "sampling/sampling_logp_difference/mean": 0.12244510650634766, + "step": 324 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 585.0, + "completions/max_terminated_length": 585.0, + "completions/mean_length": 244.5234375, + "completions/mean_terminated_length": 244.5234375, + "completions/min_length": 134.0, + "completions/min_terminated_length": 134.0, + "entropy": 0.09591101668775082, + "epoch": 0.5752212389380531, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 159625319.0, + "reward": 0.703125, + "reward_std": 0.44119933247566223, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9343760013580322, + "sampling/importance_sampling_ratio/mean": 0.9767451286315918, + "sampling/importance_sampling_ratio/min": 0.006755660753697157, + "sampling/sampling_logp_difference/max": 4.997374534606934, + "sampling/sampling_logp_difference/mean": 0.12048230320215225, + "step": 325 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 627.0, + "completions/max_terminated_length": 627.0, + "completions/mean_length": 260.671875, + "completions/mean_terminated_length": 260.671875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.09766383562237024, + "epoch": 0.5769911504424778, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3238322691384502, + "learning_rate": 1e-06, + "loss": 0.0119, + "num_tokens": 160079603.0, + "reward": 0.539843738079071, + "reward_std": 0.4756980240345001, + "rewards/execution_accuracy_EX/mean": 0.515625, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8316380977630615, + "sampling/importance_sampling_ratio/mean": 0.9780088663101196, + "sampling/importance_sampling_ratio/min": 0.004358318634331226, + "sampling/sampling_logp_difference/max": 5.4356689453125, + "sampling/sampling_logp_difference/mean": 0.11697864532470703, + "step": 326 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 601.0, + "completions/max_terminated_length": 601.0, + "completions/mean_length": 260.49609375, + "completions/mean_terminated_length": 260.49609375, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.09727056697010994, + "epoch": 0.5787610619469027, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12152999501259097, + "learning_rate": 1e-06, + "loss": 0.0006, + "num_tokens": 160609698.0, + "reward": 0.4136718511581421, + "reward_std": 0.46267399191856384, + "rewards/execution_accuracy_EX/mean": 0.3828125, + "rewards/execution_accuracy_EX/std": 0.48702529072761536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9785568714141846, + "sampling/importance_sampling_ratio/min": 0.008679252117872238, + "sampling/sampling_logp_difference/max": 4.746819972991943, + "sampling/sampling_logp_difference/mean": 0.11541437357664108, + "step": 327 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 513.0, + "completions/max_terminated_length": 513.0, + "completions/mean_length": 267.3046875, + "completions/mean_terminated_length": 267.3046875, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.09658203925937414, + "epoch": 0.5805309734513274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.16257796850582978, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 161201104.0, + "reward": 0.4544922113418579, + "reward_std": 0.47065800428390503, + "rewards/execution_accuracy_EX/mean": 0.42578125, + "rewards/execution_accuracy_EX/std": 0.49542948603630066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9462175369262695, + "sampling/importance_sampling_ratio/mean": 0.9753714799880981, + "sampling/importance_sampling_ratio/min": 0.008661828935146332, + "sampling/sampling_logp_difference/max": 4.748829364776611, + "sampling/sampling_logp_difference/mean": 0.12179063260555267, + "step": 328 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 430.0, + "completions/max_terminated_length": 430.0, + "completions/mean_length": 231.80078125, + "completions/mean_terminated_length": 231.80078125, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.09262214880436659, + "epoch": 0.5823008849557522, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29201357395420724, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 161737357.0, + "reward": 0.814453125, + "reward_std": 0.37735676765441895, + "rewards/execution_accuracy_EX/mean": 0.8046875, + "rewards/execution_accuracy_EX/std": 0.39721766114234924, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9784231185913086, + "sampling/importance_sampling_ratio/min": 0.006784507539123297, + "sampling/sampling_logp_difference/max": 4.9931135177612305, + "sampling/sampling_logp_difference/mean": 0.11321832984685898, + "step": 329 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 481.0, + "completions/max_terminated_length": 481.0, + "completions/mean_length": 245.95703125, + "completions/mean_terminated_length": 245.95703125, + "completions/min_length": 113.0, + "completions/min_terminated_length": 113.0, + "entropy": 0.09585994947701693, + "epoch": 0.584070796460177, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2578667322795693, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 162234770.0, + "reward": 0.6214843392372131, + "reward_std": 0.46600863337516785, + "rewards/execution_accuracy_EX/mean": 0.6015625, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8975902795791626, + "sampling/importance_sampling_ratio/mean": 0.9785220623016357, + "sampling/importance_sampling_ratio/min": 0.008152726106345654, + "sampling/sampling_logp_difference/max": 4.809402942657471, + "sampling/sampling_logp_difference/mean": 0.11667103320360184, + "step": 330 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 242.01171875, + "completions/mean_terminated_length": 242.01171875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.09702995885163546, + "epoch": 0.5858407079646017, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5710229349636504, + "learning_rate": 1e-06, + "loss": 0.0107, + "num_tokens": 162718165.0, + "reward": 0.6845703125, + "reward_std": 0.44827139377593994, + "rewards/execution_accuracy_EX/mean": 0.66796875, + "rewards/execution_accuracy_EX/std": 0.4718646705150604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9797040224075317, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.1162787675857544, + "step": 331 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 611.0, + "completions/max_terminated_length": 611.0, + "completions/mean_length": 250.1640625, + "completions/mean_terminated_length": 250.1640625, + "completions/min_length": 151.0, + "completions/min_terminated_length": 151.0, + "entropy": 0.1122655738145113, + "epoch": 0.5876106194690266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4722082937173414, + "learning_rate": 1e-06, + "loss": -0.0079, + "num_tokens": 163255727.0, + "reward": 0.6326172351837158, + "reward_std": 0.46355465054512024, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.922654390335083, + "sampling/importance_sampling_ratio/mean": 0.9778604507446289, + "sampling/importance_sampling_ratio/min": 0.014302237890660763, + "sampling/sampling_logp_difference/max": 4.247339248657227, + "sampling/sampling_logp_difference/mean": 0.12552200257778168, + "step": 332 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 463.0, + "completions/max_terminated_length": 463.0, + "completions/mean_length": 268.4375, + "completions/mean_terminated_length": 268.4375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.10480065550655127, + "epoch": 0.5893805309734513, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2888702807425869, + "learning_rate": 1e-06, + "loss": -0.0016, + "num_tokens": 163635503.0, + "reward": 0.42851561307907104, + "reward_std": 0.46600863337516785, + "rewards/execution_accuracy_EX/mean": 0.3984375, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8170230388641357, + "sampling/importance_sampling_ratio/mean": 0.9762803912162781, + "sampling/importance_sampling_ratio/min": 0.006748165003955364, + "sampling/sampling_logp_difference/max": 4.9984846115112305, + "sampling/sampling_logp_difference/mean": 0.12657693028450012, + "step": 333 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1035.0, + "completions/max_terminated_length": 1035.0, + "completions/mean_length": 312.25, + "completions/mean_terminated_length": 312.25, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.12133604194968939, + "epoch": 0.5911504424778761, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37753305590241776, + "learning_rate": 1e-06, + "loss": -0.0337, + "num_tokens": 164132943.0, + "reward": 0.5843750238418579, + "reward_std": 0.47219759225845337, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9800047874450684, + "sampling/importance_sampling_ratio/min": 0.005264220293611288, + "sampling/sampling_logp_difference/max": 5.246822357177734, + "sampling/sampling_logp_difference/mean": 0.12678708136081696, + "step": 334 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 643.0, + "completions/max_terminated_length": 643.0, + "completions/mean_length": 280.16796875, + "completions/mean_terminated_length": 280.16796875, + "completions/min_length": 174.0, + "completions/min_terminated_length": 174.0, + "entropy": 0.12003406789153814, + "epoch": 0.5929203539823009, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19497734889866827, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 164540906.0, + "reward": 0.6585937142372131, + "reward_std": 0.456719309091568, + "rewards/execution_accuracy_EX/mean": 0.640625, + "rewards/execution_accuracy_EX/std": 0.4807571768760681, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.866982102394104, + "sampling/importance_sampling_ratio/mean": 0.9788284301757812, + "sampling/importance_sampling_ratio/min": 0.002553603844717145, + "sampling/sampling_logp_difference/max": 5.970249652862549, + "sampling/sampling_logp_difference/mean": 0.12541426718235016, + "step": 335 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 768.0, + "completions/max_terminated_length": 768.0, + "completions/mean_length": 265.65234375, + "completions/mean_terminated_length": 265.65234375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.11629681382328272, + "epoch": 0.5946902654867257, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19946792412467465, + "learning_rate": 1e-06, + "loss": 0.0029, + "num_tokens": 165054241.0, + "reward": 0.6957030892372131, + "reward_std": 0.44413506984710693, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9283543825149536, + "sampling/importance_sampling_ratio/mean": 0.9771718978881836, + "sampling/importance_sampling_ratio/min": 0.008706767112016678, + "sampling/sampling_logp_difference/max": 4.743654727935791, + "sampling/sampling_logp_difference/mean": 0.12751495838165283, + "step": 336 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 587.0, + "completions/max_terminated_length": 587.0, + "completions/mean_length": 270.5703125, + "completions/mean_terminated_length": 270.5703125, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.11773484852164984, + "epoch": 0.5964601769911504, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2508809026152861, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 165478163.0, + "reward": 0.6623046398162842, + "reward_std": 0.4556131064891815, + "rewards/execution_accuracy_EX/mean": 0.64453125, + "rewards/execution_accuracy_EX/std": 0.4795927405357361, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9786386489868164, + "sampling/importance_sampling_ratio/min": 0.014295632019639015, + "sampling/sampling_logp_difference/max": 4.247801303863525, + "sampling/sampling_logp_difference/mean": 0.12480713427066803, + "step": 337 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 504.0, + "completions/max_terminated_length": 504.0, + "completions/mean_length": 266.6875, + "completions/mean_terminated_length": 266.6875, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.10925499815493822, + "epoch": 0.5982300884955752, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 165908579.0, + "reward": 0.703125, + "reward_std": 0.44119933247566223, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9785383343696594, + "sampling/importance_sampling_ratio/min": 0.011943034827709198, + "sampling/sampling_logp_difference/max": 4.42760705947876, + "sampling/sampling_logp_difference/mean": 0.12010498344898224, + "step": 338 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 438.0, + "completions/max_terminated_length": 438.0, + "completions/mean_length": 289.578125, + "completions/mean_terminated_length": 289.578125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.12177309859544039, + "epoch": 0.6, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1742291895418494, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 166535463.0, + "reward": 0.6882811784744263, + "reward_std": 0.44692784547805786, + "rewards/execution_accuracy_EX/mean": 0.671875, + "rewards/execution_accuracy_EX/std": 0.47045037150382996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8843876123428345, + "sampling/importance_sampling_ratio/mean": 0.9795670509338379, + "sampling/importance_sampling_ratio/min": 0.008726546540856361, + "sampling/sampling_logp_difference/max": 4.741385459899902, + "sampling/sampling_logp_difference/mean": 0.12084391713142395, + "step": 339 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 436.0, + "completions/max_terminated_length": 436.0, + "completions/mean_length": 270.84375, + "completions/mean_terminated_length": 270.84375, + "completions/min_length": 161.0, + "completions/min_terminated_length": 161.0, + "entropy": 0.12302173301577568, + "epoch": 0.6017699115044248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2681129890989471, + "learning_rate": 1e-06, + "loss": 0.0037, + "num_tokens": 167112255.0, + "reward": 0.46562498807907104, + "reward_std": 0.47219759225845337, + "rewards/execution_accuracy_EX/mean": 0.4375, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.822189450263977, + "sampling/importance_sampling_ratio/mean": 0.9797269105911255, + "sampling/importance_sampling_ratio/min": 0.005811800714582205, + "sampling/sampling_logp_difference/max": 5.147864818572998, + "sampling/sampling_logp_difference/mean": 0.12662267684936523, + "step": 340 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 586.0, + "completions/max_terminated_length": 586.0, + "completions/mean_length": 270.91015625, + "completions/mean_terminated_length": 270.91015625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.12622435204684734, + "epoch": 0.6035398230088496, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15821104933695412, + "learning_rate": 1e-06, + "loss": -0.0014, + "num_tokens": 167711560.0, + "reward": 0.7587890625, + "reward_std": 0.4142923057079315, + "rewards/execution_accuracy_EX/mean": 0.74609375, + "rewards/execution_accuracy_EX/std": 0.4360972046852112, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9329793453216553, + "sampling/importance_sampling_ratio/mean": 0.9792646169662476, + "sampling/importance_sampling_ratio/min": 0.006748078390955925, + "sampling/sampling_logp_difference/max": 4.998497486114502, + "sampling/sampling_logp_difference/mean": 0.1252666562795639, + "step": 341 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 579.0, + "completions/max_terminated_length": 579.0, + "completions/mean_length": 285.59375, + "completions/mean_terminated_length": 285.59375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.13098933827131987, + "epoch": 0.6053097345132743, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3805116896194998, + "learning_rate": 1e-06, + "loss": 0.0102, + "num_tokens": 168161728.0, + "reward": 0.717968761920929, + "reward_std": 0.4348871409893036, + "rewards/execution_accuracy_EX/mean": 0.703125, + "rewards/execution_accuracy_EX/std": 0.45777595043182373, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8983339071273804, + "sampling/importance_sampling_ratio/mean": 0.9839341640472412, + "sampling/importance_sampling_ratio/min": 0.014309640042483807, + "sampling/sampling_logp_difference/max": 4.246821880340576, + "sampling/sampling_logp_difference/mean": 0.12266640365123749, + "step": 342 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 459.0, + "completions/max_terminated_length": 459.0, + "completions/mean_length": 311.59765625, + "completions/mean_terminated_length": 311.59765625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.13412226364016533, + "epoch": 0.6070796460176991, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44996187058916765, + "learning_rate": 1e-06, + "loss": 0.0074, + "num_tokens": 168695081.0, + "reward": 0.6177734136581421, + "reward_std": 0.46676450967788696, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9826523065567017, + "sampling/importance_sampling_ratio/min": 0.008697489276528358, + "sampling/sampling_logp_difference/max": 4.744720935821533, + "sampling/sampling_logp_difference/mean": 0.12485301494598389, + "step": 343 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 518.0, + "completions/max_terminated_length": 518.0, + "completions/mean_length": 309.85546875, + "completions/mean_terminated_length": 309.85546875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.14732183329761028, + "epoch": 0.6088495575221239, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.38312070738900167, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 169420772.0, + "reward": 0.521289050579071, + "reward_std": 0.47591593861579895, + "rewards/execution_accuracy_EX/mean": 0.49609375, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9817765951156616, + "sampling/importance_sampling_ratio/min": 0.014309640042483807, + "sampling/sampling_logp_difference/max": 4.246821880340576, + "sampling/sampling_logp_difference/mean": 0.1348007321357727, + "step": 344 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 942.0, + "completions/max_terminated_length": 942.0, + "completions/mean_length": 309.87890625, + "completions/mean_terminated_length": 309.87890625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.14372923597693443, + "epoch": 0.6106194690265486, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3621342816710659, + "learning_rate": 1e-06, + "loss": 0.0241, + "num_tokens": 169841093.0, + "reward": 0.6957031488418579, + "reward_std": 0.4441350996494293, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9832894206047058, + "sampling/importance_sampling_ratio/min": 0.008700923062860966, + "sampling/sampling_logp_difference/max": 4.744326114654541, + "sampling/sampling_logp_difference/mean": 0.13119953870773315, + "step": 345 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 752.0, + "completions/max_terminated_length": 752.0, + "completions/mean_length": 299.38671875, + "completions/mean_terminated_length": 299.38671875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.14318578131496906, + "epoch": 0.6123893805309735, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24299852408912867, + "learning_rate": 1e-06, + "loss": 0.0023, + "num_tokens": 170237352.0, + "reward": 0.792187511920929, + "reward_std": 0.393498033285141, + "rewards/execution_accuracy_EX/mean": 0.78125, + "rewards/execution_accuracy_EX/std": 0.41420844197273254, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.928519606590271, + "sampling/importance_sampling_ratio/mean": 0.9804319143295288, + "sampling/importance_sampling_ratio/min": 0.006759620737284422, + "sampling/sampling_logp_difference/max": 4.996788501739502, + "sampling/sampling_logp_difference/mean": 0.1348951756954193, + "step": 346 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 385.0, + "completions/max_terminated_length": 385.0, + "completions/mean_length": 272.375, + "completions/mean_terminated_length": 272.375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.13611670583486557, + "epoch": 0.6141592920353982, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17150394795151383, + "learning_rate": 1e-06, + "loss": -0.0028, + "num_tokens": 170787464.0, + "reward": 0.7142578363418579, + "reward_std": 0.4365212619304657, + "rewards/execution_accuracy_EX/mean": 0.69921875, + "rewards/execution_accuracy_EX/std": 0.45949608087539673, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9802919626235962, + "sampling/importance_sampling_ratio/min": 0.011360770091414452, + "sampling/sampling_logp_difference/max": 4.477589130401611, + "sampling/sampling_logp_difference/mean": 0.1316777467727661, + "step": 347 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 782.0, + "completions/max_terminated_length": 782.0, + "completions/mean_length": 326.8984375, + "completions/mean_terminated_length": 326.8984375, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "entropy": 0.16614997386932373, + "epoch": 0.6159292035398231, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2131704702543648, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 171351518.0, + "reward": 0.3060546815395355, + "reward_std": 0.42235618829727173, + "rewards/execution_accuracy_EX/mean": 0.26953125, + "rewards/execution_accuracy_EX/std": 0.44458550214767456, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9815507531166077, + "sampling/importance_sampling_ratio/min": 0.011136953718960285, + "sampling/sampling_logp_difference/max": 4.497486591339111, + "sampling/sampling_logp_difference/mean": 0.14674508571624756, + "step": 348 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 475.0, + "completions/max_terminated_length": 475.0, + "completions/mean_length": 303.4609375, + "completions/mean_terminated_length": 303.4609375, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.14835299737751484, + "epoch": 0.6176991150442478, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29762721413951043, + "learning_rate": 1e-06, + "loss": -0.0051, + "num_tokens": 171970196.0, + "reward": 0.4693359136581421, + "reward_std": 0.4726511836051941, + "rewards/execution_accuracy_EX/mean": 0.44140625, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9478397369384766, + "sampling/importance_sampling_ratio/mean": 0.9828253984451294, + "sampling/importance_sampling_ratio/min": 0.0022946451790630817, + "sampling/sampling_logp_difference/max": 6.077177047729492, + "sampling/sampling_logp_difference/mean": 0.13308770954608917, + "step": 349 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 440.0, + "completions/max_terminated_length": 440.0, + "completions/mean_length": 299.88671875, + "completions/mean_terminated_length": 299.88671875, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.15259709022939205, + "epoch": 0.6194690265486725, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3279121660327367, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 172545751.0, + "reward": 0.591796875, + "reward_std": 0.471201092004776, + "rewards/execution_accuracy_EX/mean": 0.5703125, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9815083742141724, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.13767719268798828, + "step": 350 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 456.0, + "completions/max_terminated_length": 456.0, + "completions/mean_length": 311.00390625, + "completions/mean_terminated_length": 311.00390625, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.14528056234121323, + "epoch": 0.6212389380530974, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23019182346710979, + "learning_rate": 1e-06, + "loss": -0.0028, + "num_tokens": 173100168.0, + "reward": 0.6029297113418579, + "reward_std": 0.46948155760765076, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9438985586166382, + "sampling/importance_sampling_ratio/mean": 0.9813041687011719, + "sampling/importance_sampling_ratio/min": 0.011136539280414581, + "sampling/sampling_logp_difference/max": 4.497523784637451, + "sampling/sampling_logp_difference/mean": 0.13343140482902527, + "step": 351 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 792.0, + "completions/max_terminated_length": 792.0, + "completions/mean_length": 308.8203125, + "completions/mean_terminated_length": 308.8203125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.17054642364382744, + "epoch": 0.6230088495575221, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2460299764669613, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 173559146.0, + "reward": 0.6771484613418579, + "reward_std": 0.4508545994758606, + "rewards/execution_accuracy_EX/mean": 0.66015625, + "rewards/execution_accuracy_EX/std": 0.47458380460739136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9823266267776489, + "sampling/importance_sampling_ratio/min": 0.01840067282319069, + "sampling/sampling_logp_difference/max": 3.995368003845215, + "sampling/sampling_logp_difference/mean": 0.1443067193031311, + "step": 352 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 844.0, + "completions/max_terminated_length": 844.0, + "completions/mean_length": 333.90625, + "completions/mean_terminated_length": 333.90625, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.17115960828959942, + "epoch": 0.6247787610619469, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.11972538118286806, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 173985234.0, + "reward": 0.5064452886581421, + "reward_std": 0.47556719183921814, + "rewards/execution_accuracy_EX/mean": 0.48046875, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9859043955802917, + "sampling/importance_sampling_ratio/min": 0.0011984164593741298, + "sampling/sampling_logp_difference/max": 6.726754188537598, + "sampling/sampling_logp_difference/mean": 0.14230495691299438, + "step": 353 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 649.0, + "completions/max_terminated_length": 649.0, + "completions/mean_length": 279.91015625, + "completions/mean_terminated_length": 279.91015625, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.15714056976139545, + "epoch": 0.6265486725663717, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 174461883.0, + "reward": 0.762499988079071, + "reward_std": 0.41216787695884705, + "rewards/execution_accuracy_EX/mean": 0.75, + "rewards/execution_accuracy_EX/std": 0.4338609278202057, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8311058282852173, + "sampling/importance_sampling_ratio/mean": 0.9806782007217407, + "sampling/importance_sampling_ratio/min": 0.006941431201994419, + "sampling/sampling_logp_difference/max": 4.970247268676758, + "sampling/sampling_logp_difference/mean": 0.13960732519626617, + "step": 354 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 473.0, + "completions/max_terminated_length": 473.0, + "completions/mean_length": 280.109375, + "completions/mean_terminated_length": 280.109375, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.15817486122250557, + "epoch": 0.6283185840707964, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28922512002492845, + "learning_rate": 1e-06, + "loss": -0.0022, + "num_tokens": 174927415.0, + "reward": 0.7328124642372131, + "reward_std": 0.4279654622077942, + "rewards/execution_accuracy_EX/mean": 0.71875, + "rewards/execution_accuracy_EX/std": 0.45048993825912476, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9616512060165405, + "sampling/importance_sampling_ratio/mean": 0.9811063408851624, + "sampling/importance_sampling_ratio/min": 0.018573788926005363, + "sampling/sampling_logp_difference/max": 3.986003875732422, + "sampling/sampling_logp_difference/mean": 0.1422651708126068, + "step": 355 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 760.0, + "completions/max_terminated_length": 760.0, + "completions/mean_length": 316.3828125, + "completions/mean_terminated_length": 316.3828125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.16602585650980473, + "epoch": 0.6300884955752213, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21675306464316224, + "learning_rate": 1e-06, + "loss": 0.0108, + "num_tokens": 175397865.0, + "reward": 0.7142578363418579, + "reward_std": 0.4365212917327881, + "rewards/execution_accuracy_EX/mean": 0.69921875, + "rewards/execution_accuracy_EX/std": 0.45949608087539673, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9855636358261108, + "sampling/importance_sampling_ratio/min": 0.008775142952799797, + "sampling/sampling_logp_difference/max": 4.735832214355469, + "sampling/sampling_logp_difference/mean": 0.13811041414737701, + "step": 356 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 531.0, + "completions/max_terminated_length": 531.0, + "completions/mean_length": 301.63671875, + "completions/mean_terminated_length": 301.63671875, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "entropy": 0.17440044321119785, + "epoch": 0.631858407079646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23015549064829782, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 175926764.0, + "reward": 0.669726550579071, + "reward_std": 0.4533011019229889, + "rewards/execution_accuracy_EX/mean": 0.65234375, + "rewards/execution_accuracy_EX/std": 0.4771590530872345, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9849185943603516, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.1428661346435547, + "step": 357 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 536.0, + "completions/max_terminated_length": 536.0, + "completions/mean_length": 290.98046875, + "completions/mean_terminated_length": 290.98046875, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.17171570286154747, + "epoch": 0.6336283185840708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2443959815433609, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 176443751.0, + "reward": 0.6363281011581421, + "reward_std": 0.46267402172088623, + "rewards/execution_accuracy_EX/mean": 0.6171875, + "rewards/execution_accuracy_EX/std": 0.48702529072761536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9632278680801392, + "sampling/importance_sampling_ratio/mean": 0.9822627305984497, + "sampling/importance_sampling_ratio/min": 0.013144331984221935, + "sampling/sampling_logp_difference/max": 4.3317646980285645, + "sampling/sampling_logp_difference/mean": 0.14267569780349731, + "step": 358 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 503.0, + "completions/max_terminated_length": 503.0, + "completions/mean_length": 292.84375, + "completions/mean_terminated_length": 292.84375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.1765604056417942, + "epoch": 0.6353982300884956, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3131359599874531, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 177196543.0, + "reward": 0.7216796875, + "reward_std": 0.4332149922847748, + "rewards/execution_accuracy_EX/mean": 0.70703125, + "rewards/execution_accuracy_EX/std": 0.45601576566696167, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.979896068572998, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.15033632516860962, + "step": 359 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 511.0, + "completions/max_terminated_length": 511.0, + "completions/mean_length": 272.4609375, + "completions/mean_terminated_length": 272.4609375, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.15468990057706833, + "epoch": 0.6371681415929203, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39250898384012695, + "learning_rate": 1e-06, + "loss": -0.0061, + "num_tokens": 177658261.0, + "reward": 0.792187511920929, + "reward_std": 0.393498033285141, + "rewards/execution_accuracy_EX/mean": 0.78125, + "rewards/execution_accuracy_EX/std": 0.41420844197273254, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9108880758285522, + "sampling/importance_sampling_ratio/mean": 0.9805917739868164, + "sampling/importance_sampling_ratio/min": 0.01430963259190321, + "sampling/sampling_logp_difference/max": 4.246822357177734, + "sampling/sampling_logp_difference/mean": 0.13606077432632446, + "step": 360 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 561.0, + "completions/max_terminated_length": 561.0, + "completions/mean_length": 298.0234375, + "completions/mean_terminated_length": 298.0234375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.18882922641932964, + "epoch": 0.6389380530973451, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30951042598403783, + "learning_rate": 1e-06, + "loss": -0.0093, + "num_tokens": 178143723.0, + "reward": 0.7142578363418579, + "reward_std": 0.4365212619304657, + "rewards/execution_accuracy_EX/mean": 0.69921875, + "rewards/execution_accuracy_EX/std": 0.45949608087539673, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9841555953025818, + "sampling/importance_sampling_ratio/min": 0.01430963259190321, + "sampling/sampling_logp_difference/max": 4.246822357177734, + "sampling/sampling_logp_difference/mean": 0.15244781970977783, + "step": 361 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 457.0, + "completions/max_terminated_length": 457.0, + "completions/mean_length": 285.67578125, + "completions/mean_terminated_length": 285.67578125, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.1687780823558569, + "epoch": 0.6407079646017699, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1541661540541937, + "learning_rate": 1e-06, + "loss": 0.0015, + "num_tokens": 178615368.0, + "reward": 0.706835925579071, + "reward_std": 0.439676970243454, + "rewards/execution_accuracy_EX/mean": 0.69140625, + "rewards/execution_accuracy_EX/std": 0.46281787753105164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9848163723945618, + "sampling/importance_sampling_ratio/min": 0.01105455867946148, + "sampling/sampling_logp_difference/max": 4.504912376403809, + "sampling/sampling_logp_difference/mean": 0.14022907614707947, + "step": 362 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 730.0, + "completions/max_terminated_length": 730.0, + "completions/mean_length": 326.1484375, + "completions/mean_terminated_length": 326.1484375, + "completions/min_length": 186.0, + "completions/min_terminated_length": 186.0, + "entropy": 0.19181636348366737, + "epoch": 0.6424778761061947, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4408793451721107, + "learning_rate": 1e-06, + "loss": -0.0045, + "num_tokens": 179121054.0, + "reward": 0.576953113079071, + "reward_std": 0.47307512164115906, + "rewards/execution_accuracy_EX/mean": 0.5546875, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.984829306602478, + "sampling/importance_sampling_ratio/min": 0.00874820351600647, + "sampling/sampling_logp_difference/max": 4.7389068603515625, + "sampling/sampling_logp_difference/mean": 0.15180866420269012, + "step": 363 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 533.0, + "completions/max_terminated_length": 533.0, + "completions/mean_length": 289.96484375, + "completions/mean_terminated_length": 289.96484375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.16963029652833939, + "epoch": 0.6442477876106195, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12739688224075013, + "learning_rate": 1e-06, + "loss": -0.0035, + "num_tokens": 179656229.0, + "reward": 0.703125, + "reward_std": 0.44119933247566223, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9797656536102295, + "sampling/importance_sampling_ratio/min": 0.018439162522554398, + "sampling/sampling_logp_difference/max": 3.9932785034179688, + "sampling/sampling_logp_difference/mean": 0.14988864958286285, + "step": 364 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 866.0, + "completions/max_terminated_length": 866.0, + "completions/mean_length": 330.34375, + "completions/mean_terminated_length": 330.34375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.182080315425992, + "epoch": 0.6460176991150443, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19637362753621984, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 180251981.0, + "reward": 0.7216796875, + "reward_std": 0.4332149624824524, + "rewards/execution_accuracy_EX/mean": 0.70703125, + "rewards/execution_accuracy_EX/std": 0.45601576566696167, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9820328950881958, + "sampling/importance_sampling_ratio/mean": 0.9836443662643433, + "sampling/importance_sampling_ratio/min": 0.014291773550212383, + "sampling/sampling_logp_difference/max": 4.248071193695068, + "sampling/sampling_logp_difference/mean": 0.14891904592514038, + "step": 365 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 870.0, + "completions/max_terminated_length": 870.0, + "completions/mean_length": 342.00390625, + "completions/mean_terminated_length": 342.00390625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.1771495994180441, + "epoch": 0.647787610619469, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30658590682268666, + "learning_rate": 1e-06, + "loss": -0.0086, + "num_tokens": 180701886.0, + "reward": 0.6363281011581421, + "reward_std": 0.46267399191856384, + "rewards/execution_accuracy_EX/mean": 0.6171875, + "rewards/execution_accuracy_EX/std": 0.48702529072761536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9865796566009521, + "sampling/importance_sampling_ratio/min": 0.00869713630527258, + "sampling/sampling_logp_difference/max": 4.7447614669799805, + "sampling/sampling_logp_difference/mean": 0.14339247345924377, + "step": 366 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 701.0, + "completions/max_terminated_length": 701.0, + "completions/mean_length": 333.34375, + "completions/mean_terminated_length": 333.34375, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.1673466358333826, + "epoch": 0.6495575221238938, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3282424362319832, + "learning_rate": 1e-06, + "loss": -0.0096, + "num_tokens": 181082454.0, + "reward": 0.743945300579071, + "reward_std": 0.42235618829727173, + "rewards/execution_accuracy_EX/mean": 0.73046875, + "rewards/execution_accuracy_EX/std": 0.44458550214767456, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9846060276031494, + "sampling/importance_sampling_ratio/min": 0.011136539280414581, + "sampling/sampling_logp_difference/max": 4.497523784637451, + "sampling/sampling_logp_difference/mean": 0.13978558778762817, + "step": 367 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1178.0, + "completions/max_terminated_length": 1178.0, + "completions/mean_length": 357.84375, + "completions/mean_terminated_length": 357.84375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.1844386588782072, + "epoch": 0.6513274336283186, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37451782258654803, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 181645694.0, + "reward": 0.606640636920929, + "reward_std": 0.46884801983833313, + "rewards/execution_accuracy_EX/mean": 0.5859375, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9867744445800781, + "sampling/importance_sampling_ratio/min": 0.008679230697453022, + "sampling/sampling_logp_difference/max": 4.746822357177734, + "sampling/sampling_logp_difference/mean": 0.14332488179206848, + "step": 368 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 715.0, + "completions/max_terminated_length": 715.0, + "completions/mean_length": 336.61328125, + "completions/mean_terminated_length": 336.61328125, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.175746975466609, + "epoch": 0.6530973451327433, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2723377936995241, + "learning_rate": 1e-06, + "loss": -0.003, + "num_tokens": 182268331.0, + "reward": 0.5658202767372131, + "reward_std": 0.4741697609424591, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.995046615600586, + "sampling/importance_sampling_ratio/mean": 0.9859664440155029, + "sampling/importance_sampling_ratio/min": 0.018332336097955704, + "sampling/sampling_logp_difference/max": 3.999088764190674, + "sampling/sampling_logp_difference/mean": 0.14141210913658142, + "step": 369 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 603.0, + "completions/max_terminated_length": 603.0, + "completions/mean_length": 306.1484375, + "completions/mean_terminated_length": 306.1484375, + "completions/min_length": 182.0, + "completions/min_terminated_length": 182.0, + "entropy": 0.17632250115275383, + "epoch": 0.6548672566371682, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27545191815465125, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 182616609.0, + "reward": 0.5621093511581421, + "reward_std": 0.47447580099105835, + "rewards/execution_accuracy_EX/mean": 0.5390625, + "rewards/execution_accuracy_EX/std": 0.4994482398033142, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9145256280899048, + "sampling/importance_sampling_ratio/mean": 0.9843444228172302, + "sampling/importance_sampling_ratio/min": 0.011136539280414581, + "sampling/sampling_logp_difference/max": 4.497523784637451, + "sampling/sampling_logp_difference/mean": 0.1441006362438202, + "step": 370 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 905.0, + "completions/max_terminated_length": 905.0, + "completions/mean_length": 320.12890625, + "completions/mean_terminated_length": 320.12890625, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.19141799211502075, + "epoch": 0.6566371681415929, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42132400591255087, + "learning_rate": 1e-06, + "loss": 0.0019, + "num_tokens": 183033346.0, + "reward": 0.666015625, + "reward_std": 0.4544737935066223, + "rewards/execution_accuracy_EX/mean": 0.6484375, + "rewards/execution_accuracy_EX/std": 0.47839346528053284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9803478717803955, + "sampling/importance_sampling_ratio/min": 0.018343178555369377, + "sampling/sampling_logp_difference/max": 3.998497486114502, + "sampling/sampling_logp_difference/mean": 0.16229598224163055, + "step": 371 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 647.0, + "completions/max_terminated_length": 647.0, + "completions/mean_length": 324.56640625, + "completions/mean_terminated_length": 324.56640625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.1893010027706623, + "epoch": 0.6584070796460177, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21672127442107716, + "learning_rate": 1e-06, + "loss": 0.0091, + "num_tokens": 183743939.0, + "reward": 0.666015625, + "reward_std": 0.4544737637042999, + "rewards/execution_accuracy_EX/mean": 0.6484375, + "rewards/execution_accuracy_EX/std": 0.47839346528053284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9842427968978882, + "sampling/importance_sampling_ratio/min": 0.014364885166287422, + "sampling/sampling_logp_difference/max": 4.242968559265137, + "sampling/sampling_logp_difference/mean": 0.1574934720993042, + "step": 372 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 670.0, + "completions/max_terminated_length": 670.0, + "completions/mean_length": 398.5, + "completions/mean_terminated_length": 398.5, + "completions/min_length": 235.0, + "completions/min_terminated_length": 235.0, + "entropy": 0.20206800289452076, + "epoch": 0.6601769911504425, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28906542018159676, + "learning_rate": 1e-06, + "loss": -0.0013, + "num_tokens": 184220771.0, + "reward": 0.5138671398162842, + "reward_std": 0.47579970955848694, + "rewards/execution_accuracy_EX/mean": 0.48828125, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9886964559555054, + "sampling/importance_sampling_ratio/min": 0.023641198873519897, + "sampling/sampling_logp_difference/max": 3.7447643280029297, + "sampling/sampling_logp_difference/mean": 0.15359437465667725, + "step": 373 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1031.0, + "completions/max_terminated_length": 1031.0, + "completions/mean_length": 348.703125, + "completions/mean_terminated_length": 348.703125, + "completions/min_length": 214.0, + "completions/min_terminated_length": 214.0, + "entropy": 0.1811294611543417, + "epoch": 0.6619469026548672, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3619688170617479, + "learning_rate": 1e-06, + "loss": 0.0028, + "num_tokens": 184793095.0, + "reward": 0.818164050579071, + "reward_std": 0.3744697570800781, + "rewards/execution_accuracy_EX/mean": 0.80859375, + "rewards/execution_accuracy_EX/std": 0.39417871832847595, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9510424137115479, + "sampling/importance_sampling_ratio/mean": 0.985849142074585, + "sampling/importance_sampling_ratio/min": 0.02371858060359955, + "sampling/sampling_logp_difference/max": 3.7414965629577637, + "sampling/sampling_logp_difference/mean": 0.14919239282608032, + "step": 374 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 551.0, + "completions/max_terminated_length": 551.0, + "completions/mean_length": 322.453125, + "completions/mean_terminated_length": 322.453125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.18431797996163368, + "epoch": 0.6637168141592921, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20322991160833945, + "learning_rate": 1e-06, + "loss": 0.0055, + "num_tokens": 185365515.0, + "reward": 0.6548827886581421, + "reward_std": 0.45779263973236084, + "rewards/execution_accuracy_EX/mean": 0.63671875, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9828222990036011, + "sampling/importance_sampling_ratio/min": 0.018439654260873795, + "sampling/sampling_logp_difference/max": 3.9932518005371094, + "sampling/sampling_logp_difference/mean": 0.15434765815734863, + "step": 375 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 841.0, + "completions/max_terminated_length": 841.0, + "completions/mean_length": 367.359375, + "completions/mean_terminated_length": 367.359375, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.18431459739804268, + "epoch": 0.6654867256637168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3488093480850353, + "learning_rate": 1e-06, + "loss": -0.0007, + "num_tokens": 185861191.0, + "reward": 0.558398425579071, + "reward_std": 0.4747525453567505, + "rewards/execution_accuracy_EX/mean": 0.53515625, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9850502014160156, + "sampling/importance_sampling_ratio/min": 0.01433913316577673, + "sampling/sampling_logp_difference/max": 4.244762897491455, + "sampling/sampling_logp_difference/mean": 0.15114128589630127, + "step": 376 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 684.0, + "completions/max_terminated_length": 684.0, + "completions/mean_length": 341.76171875, + "completions/mean_terminated_length": 341.76171875, + "completions/min_length": 226.0, + "completions/min_terminated_length": 226.0, + "entropy": 0.16793095134198666, + "epoch": 0.6672566371681415, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20028002512110296, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 186490618.0, + "reward": 0.6177734136581421, + "reward_std": 0.46676453948020935, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9677222967147827, + "sampling/importance_sampling_ratio/mean": 0.9848576784133911, + "sampling/importance_sampling_ratio/min": 0.018390806391835213, + "sampling/sampling_logp_difference/max": 3.9959044456481934, + "sampling/sampling_logp_difference/mean": 0.14287824928760529, + "step": 377 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 964.0, + "completions/max_terminated_length": 964.0, + "completions/mean_length": 391.6171875, + "completions/mean_terminated_length": 391.6171875, + "completions/min_length": 202.0, + "completions/min_terminated_length": 202.0, + "entropy": 0.18167337216436863, + "epoch": 0.6690265486725664, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.44731050248057425, + "learning_rate": 1e-06, + "loss": 0.0213, + "num_tokens": 186940952.0, + "reward": 0.6214843988418579, + "reward_std": 0.46600863337516785, + "rewards/execution_accuracy_EX/mean": 0.6015625, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9838932752609253, + "sampling/importance_sampling_ratio/min": 0.014340986497700214, + "sampling/sampling_logp_difference/max": 4.244633674621582, + "sampling/sampling_logp_difference/mean": 0.15083645284175873, + "step": 378 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1039.0, + "completions/mean_length": 349.71484375, + "completions/mean_terminated_length": 335.0235595703125, + "completions/min_length": 188.0, + "completions/min_terminated_length": 188.0, + "entropy": 0.18162584863603115, + "epoch": 0.6707964601769911, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12641193978532114, + "learning_rate": 1e-06, + "loss": -0.0186, + "num_tokens": 187507311.0, + "reward": 0.7623046636581421, + "reward_std": 0.41251853108406067, + "rewards/execution_accuracy_EX/mean": 0.75, + "rewards/execution_accuracy_EX/std": 0.4338609278202057, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9822250604629517, + "sampling/importance_sampling_ratio/min": 0.011371039785444736, + "sampling/sampling_logp_difference/max": 4.476685523986816, + "sampling/sampling_logp_difference/mean": 0.15520714223384857, + "step": 379 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 744.0, + "completions/max_terminated_length": 744.0, + "completions/mean_length": 318.67578125, + "completions/mean_terminated_length": 318.67578125, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.16668269224464893, + "epoch": 0.672566371681416, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1811797559427384, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 188113132.0, + "reward": 0.688281238079071, + "reward_std": 0.4469278156757355, + "rewards/execution_accuracy_EX/mean": 0.671875, + "rewards/execution_accuracy_EX/std": 0.47045037150382996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9808695912361145, + "sampling/importance_sampling_ratio/min": 0.014337287284433842, + "sampling/sampling_logp_difference/max": 4.24489164352417, + "sampling/sampling_logp_difference/mean": 0.14662903547286987, + "step": 380 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 347.63671875, + "completions/mean_terminated_length": 347.63671875, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.17708031088113785, + "epoch": 0.6743362831858407, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29415766923403913, + "learning_rate": 1e-06, + "loss": 0.0116, + "num_tokens": 188563679.0, + "reward": 0.666015625, + "reward_std": 0.4544737637042999, + "rewards/execution_accuracy_EX/mean": 0.6484375, + "rewards/execution_accuracy_EX/std": 0.47839346528053284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9815072417259216, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.15357543528079987, + "step": 381 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1074.0, + "completions/max_terminated_length": 1074.0, + "completions/mean_length": 369.50390625, + "completions/mean_terminated_length": 369.50390625, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.1768922433257103, + "epoch": 0.6761061946902654, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26338379961442426, + "learning_rate": 1e-06, + "loss": -0.0077, + "num_tokens": 189036928.0, + "reward": 0.6845703125, + "reward_std": 0.44827139377593994, + "rewards/execution_accuracy_EX/mean": 0.66796875, + "rewards/execution_accuracy_EX/std": 0.4718646705150604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9431287050247192, + "sampling/importance_sampling_ratio/mean": 0.984576940536499, + "sampling/importance_sampling_ratio/min": 0.008697203360497952, + "sampling/sampling_logp_difference/max": 4.744753837585449, + "sampling/sampling_logp_difference/mean": 0.14685526490211487, + "step": 382 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 818.0, + "completions/max_terminated_length": 818.0, + "completions/mean_length": 355.2734375, + "completions/mean_terminated_length": 355.2734375, + "completions/min_length": 213.0, + "completions/min_terminated_length": 213.0, + "entropy": 0.17549428157508373, + "epoch": 0.6778761061946903, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3562485539515273, + "learning_rate": 1e-06, + "loss": 0.0016, + "num_tokens": 189634886.0, + "reward": 0.666015625, + "reward_std": 0.4544737637042999, + "rewards/execution_accuracy_EX/mean": 0.6484375, + "rewards/execution_accuracy_EX/std": 0.47839346528053284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9829525947570801, + "sampling/importance_sampling_ratio/min": 0.018439073115587234, + "sampling/sampling_logp_difference/max": 3.993283271789551, + "sampling/sampling_logp_difference/mean": 0.1483614444732666, + "step": 383 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1076.0, + "completions/max_terminated_length": 1076.0, + "completions/mean_length": 333.75390625, + "completions/mean_terminated_length": 333.75390625, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.1677437536418438, + "epoch": 0.679646017699115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23258807614770383, + "learning_rate": 1e-06, + "loss": -0.0157, + "num_tokens": 190079831.0, + "reward": 0.6771484613418579, + "reward_std": 0.450854629278183, + "rewards/execution_accuracy_EX/mean": 0.66015625, + "rewards/execution_accuracy_EX/std": 0.47458380460739136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9809868335723877, + "sampling/importance_sampling_ratio/min": 0.014412390999495983, + "sampling/sampling_logp_difference/max": 4.239666938781738, + "sampling/sampling_logp_difference/mean": 0.14908547699451447, + "step": 384 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1260.0, + "completions/max_terminated_length": 1260.0, + "completions/mean_length": 366.09375, + "completions/mean_terminated_length": 366.09375, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.17234939150512218, + "epoch": 0.6814159292035398, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.07922133000913145, + "learning_rate": 1e-06, + "loss": -0.0018, + "num_tokens": 190520879.0, + "reward": 0.7587890625, + "reward_std": 0.4142923355102539, + "rewards/execution_accuracy_EX/mean": 0.74609375, + "rewards/execution_accuracy_EX/std": 0.4360972046852112, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9845023155212402, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.1468074917793274, + "step": 385 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 999.0, + "completions/max_terminated_length": 999.0, + "completions/mean_length": 344.98046875, + "completions/mean_terminated_length": 344.98046875, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.16980272345244884, + "epoch": 0.6831858407079646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3334577712274148, + "learning_rate": 1e-06, + "loss": 0.0134, + "num_tokens": 191015850.0, + "reward": 0.7328124642372131, + "reward_std": 0.4279654324054718, + "rewards/execution_accuracy_EX/mean": 0.71875, + "rewards/execution_accuracy_EX/std": 0.45048993825912476, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.924397587776184, + "sampling/importance_sampling_ratio/mean": 0.9805434346199036, + "sampling/importance_sampling_ratio/min": 0.01062503457069397, + "sampling/sampling_logp_difference/max": 4.54454231262207, + "sampling/sampling_logp_difference/mean": 0.152784526348114, + "step": 386 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1235.0, + "completions/max_terminated_length": 1235.0, + "completions/mean_length": 378.21875, + "completions/mean_terminated_length": 378.21875, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.16122937947511673, + "epoch": 0.6849557522123894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14798100699710418, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 191360466.0, + "reward": 0.6994140148162842, + "reward_std": 0.44268524646759033, + "rewards/execution_accuracy_EX/mean": 0.68359375, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9820222854614258, + "sampling/importance_sampling_ratio/min": 0.014339085668325424, + "sampling/sampling_logp_difference/max": 4.2447662353515625, + "sampling/sampling_logp_difference/mean": 0.14505022764205933, + "step": 387 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 699.0, + "completions/max_terminated_length": 699.0, + "completions/mean_length": 320.3046875, + "completions/mean_terminated_length": 320.3046875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.15651732683181763, + "epoch": 0.6867256637168142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1303891658844245, + "learning_rate": 1e-06, + "loss": 0.0043, + "num_tokens": 192174448.0, + "reward": 0.6919921636581421, + "reward_std": 0.4455491602420807, + "rewards/execution_accuracy_EX/mean": 0.67578125, + "rewards/execution_accuracy_EX/std": 0.46899911761283875, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9775676131248474, + "sampling/importance_sampling_ratio/min": 0.008661825209856033, + "sampling/sampling_logp_difference/max": 4.7488298416137695, + "sampling/sampling_logp_difference/mean": 0.1476084291934967, + "step": 388 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 751.0, + "completions/max_terminated_length": 751.0, + "completions/mean_length": 301.3984375, + "completions/mean_terminated_length": 301.3984375, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.1464787721633911, + "epoch": 0.6884955752212389, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3208387829515269, + "learning_rate": 1e-06, + "loss": -0.0137, + "num_tokens": 192575654.0, + "reward": 0.8404296636581421, + "reward_std": 0.3558422923088074, + "rewards/execution_accuracy_EX/mean": 0.83203125, + "rewards/execution_accuracy_EX/std": 0.3745708465576172, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.946167230606079, + "sampling/importance_sampling_ratio/mean": 0.9796954393386841, + "sampling/importance_sampling_ratio/min": 0.011154407635331154, + "sampling/sampling_logp_difference/max": 4.495920658111572, + "sampling/sampling_logp_difference/mean": 0.14021122455596924, + "step": 389 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1214.0, + "completions/max_terminated_length": 1214.0, + "completions/mean_length": 372.05859375, + "completions/mean_terminated_length": 372.05859375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.1822884976863861, + "epoch": 0.6902654867256637, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25459226196081364, + "learning_rate": 1e-06, + "loss": -0.0001, + "num_tokens": 193008021.0, + "reward": 0.6845703125, + "reward_std": 0.44827139377593994, + "rewards/execution_accuracy_EX/mean": 0.66796875, + "rewards/execution_accuracy_EX/std": 0.4718646705150604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9802509546279907, + "sampling/importance_sampling_ratio/min": 0.011154396459460258, + "sampling/sampling_logp_difference/max": 4.495921611785889, + "sampling/sampling_logp_difference/mean": 0.1628512442111969, + "step": 390 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1050.0, + "completions/max_terminated_length": 1050.0, + "completions/mean_length": 375.18359375, + "completions/mean_terminated_length": 375.18359375, + "completions/min_length": 178.0, + "completions/min_terminated_length": 178.0, + "entropy": 0.16373853757977486, + "epoch": 0.6920353982300885, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24275023309596502, + "learning_rate": 1e-06, + "loss": 0.0143, + "num_tokens": 193467636.0, + "reward": 0.5658202767372131, + "reward_std": 0.4741697609424591, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8604068756103516, + "sampling/importance_sampling_ratio/mean": 0.9817614555358887, + "sampling/importance_sampling_ratio/min": 0.005253662820905447, + "sampling/sampling_logp_difference/max": 5.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.14821916818618774, + "step": 391 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1359.0, + "completions/max_terminated_length": 1359.0, + "completions/mean_length": 458.9921875, + "completions/mean_terminated_length": 458.9921875, + "completions/min_length": 231.0, + "completions/min_terminated_length": 231.0, + "entropy": 0.18126251175999641, + "epoch": 0.6938053097345133, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26367888656482547, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 193961314.0, + "reward": 0.5732421875, + "reward_std": 0.47346949577331543, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9846888780593872, + "sampling/importance_sampling_ratio/min": 0.0178952906280756, + "sampling/sampling_logp_difference/max": 4.023217678070068, + "sampling/sampling_logp_difference/mean": 0.1535714566707611, + "step": 392 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1021.0, + "completions/max_terminated_length": 1021.0, + "completions/mean_length": 355.2890625, + "completions/mean_terminated_length": 355.2890625, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.1500838827341795, + "epoch": 0.695575221238938, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15445753666811401, + "learning_rate": 1e-06, + "loss": -0.0036, + "num_tokens": 194386748.0, + "reward": 0.6437499523162842, + "reward_std": 0.4608176648616791, + "rewards/execution_accuracy_EX/mean": 0.625, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9157530069351196, + "sampling/importance_sampling_ratio/mean": 0.979602575302124, + "sampling/importance_sampling_ratio/min": 0.014344037510454655, + "sampling/sampling_logp_difference/max": 4.244421005249023, + "sampling/sampling_logp_difference/mean": 0.14788876473903656, + "step": 393 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1278.0, + "completions/max_terminated_length": 1278.0, + "completions/mean_length": 402.8359375, + "completions/mean_terminated_length": 402.8359375, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.16516896896064281, + "epoch": 0.6973451327433628, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26684222396392127, + "learning_rate": 1e-06, + "loss": 0.0077, + "num_tokens": 194943170.0, + "reward": 0.6029297113418579, + "reward_std": 0.46948158740997314, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9838463068008423, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.1454889476299286, + "step": 394 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1270.0, + "completions/mean_length": 504.04296875, + "completions/mean_terminated_length": 475.75982666015625, + "completions/min_length": 212.0, + "completions/min_terminated_length": 212.0, + "entropy": 0.19731133989989758, + "epoch": 0.6991150442477876, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21571475021879108, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 195425757.0, + "reward": 0.6025390625, + "reward_std": 0.4699639081954956, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9842385649681091, + "sampling/importance_sampling_ratio/min": 0.008363359607756138, + "sampling/sampling_logp_difference/max": 4.783895015716553, + "sampling/sampling_logp_difference/mean": 0.1636572629213333, + "step": 395 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 876.0, + "completions/max_terminated_length": 876.0, + "completions/mean_length": 340.890625, + "completions/mean_terminated_length": 340.890625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.1476024929434061, + "epoch": 0.7008849557522124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20505814438933806, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 195909905.0, + "reward": 0.781054675579071, + "reward_std": 0.40085992217063904, + "rewards/execution_accuracy_EX/mean": 0.76953125, + "rewards/execution_accuracy_EX/std": 0.4219578504562378, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.952225923538208, + "sampling/importance_sampling_ratio/mean": 0.9757999181747437, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.15290255844593048, + "step": 396 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1082.0, + "completions/max_terminated_length": 1082.0, + "completions/mean_length": 407.89453125, + "completions/mean_terminated_length": 407.89453125, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.16637560166418552, + "epoch": 0.7026548672566372, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1967975586757834, + "learning_rate": 1e-06, + "loss": 0.0101, + "num_tokens": 196611574.0, + "reward": 0.43964844942092896, + "reward_std": 0.46818408370018005, + "rewards/execution_accuracy_EX/mean": 0.41015625, + "rewards/execution_accuracy_EX/std": 0.49282538890838623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9798034429550171, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.1534593105316162, + "step": 397 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1075.0, + "completions/max_terminated_length": 1075.0, + "completions/mean_length": 343.5859375, + "completions/mean_terminated_length": 343.5859375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.14015005342662334, + "epoch": 0.7044247787610619, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2878680426578357, + "learning_rate": 1e-06, + "loss": 0.0054, + "num_tokens": 197036092.0, + "reward": 0.8070312738418579, + "reward_std": 0.38295724987983704, + "rewards/execution_accuracy_EX/mean": 0.796875, + "rewards/execution_accuracy_EX/std": 0.40311288833618164, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9774739742279053, + "sampling/importance_sampling_ratio/min": 0.006744090002030134, + "sampling/sampling_logp_difference/max": 4.999088764190674, + "sampling/sampling_logp_difference/mean": 0.14382323622703552, + "step": 398 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1172.0, + "completions/max_terminated_length": 1172.0, + "completions/mean_length": 361.1796875, + "completions/mean_terminated_length": 361.1796875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.14296685345470905, + "epoch": 0.7061946902654868, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.12198919653195027, + "learning_rate": 1e-06, + "loss": -0.0006, + "num_tokens": 197449178.0, + "reward": 0.591796875, + "reward_std": 0.471201092004776, + "rewards/execution_accuracy_EX/mean": 0.5703125, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9053452014923096, + "sampling/importance_sampling_ratio/mean": 0.9786155223846436, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.14207744598388672, + "step": 399 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1015.0, + "completions/max_terminated_length": 1015.0, + "completions/mean_length": 369.94921875, + "completions/mean_terminated_length": 369.94921875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.14750699326395988, + "epoch": 0.7079646017699115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29651302408551683, + "learning_rate": 1e-06, + "loss": 0.0164, + "num_tokens": 197806301.0, + "reward": 0.632617175579071, + "reward_std": 0.4635546803474426, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8656444549560547, + "sampling/importance_sampling_ratio/mean": 0.98008793592453, + "sampling/importance_sampling_ratio/min": 0.011136534623801708, + "sampling/sampling_logp_difference/max": 4.497524261474609, + "sampling/sampling_logp_difference/mean": 0.14344340562820435, + "step": 400 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 781.0, + "completions/max_terminated_length": 781.0, + "completions/mean_length": 346.69140625, + "completions/mean_terminated_length": 346.69140625, + "completions/min_length": 198.0, + "completions/min_terminated_length": 198.0, + "entropy": 0.1517967376857996, + "epoch": 0.7097345132743362, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1732909052946054, + "learning_rate": 1e-06, + "loss": -0.0079, + "num_tokens": 198185710.0, + "reward": 0.558398425579071, + "reward_std": 0.4747525453567505, + "rewards/execution_accuracy_EX/mean": 0.53515625, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9788327217102051, + "sampling/importance_sampling_ratio/min": 0.011139354668557644, + "sampling/sampling_logp_difference/max": 4.4972710609436035, + "sampling/sampling_logp_difference/mean": 0.14681211113929749, + "step": 401 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1020.0, + "completions/max_terminated_length": 1020.0, + "completions/mean_length": 354.61328125, + "completions/mean_terminated_length": 354.61328125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.15006682462990284, + "epoch": 0.7115044247787611, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24945580913104384, + "learning_rate": 1e-06, + "loss": 0.0059, + "num_tokens": 198746491.0, + "reward": 0.6214843392372131, + "reward_std": 0.46600866317749023, + "rewards/execution_accuracy_EX/mean": 0.6015625, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9758408069610596, + "sampling/importance_sampling_ratio/min": 0.00866839848458767, + "sampling/sampling_logp_difference/max": 4.748071193695068, + "sampling/sampling_logp_difference/mean": 0.1516970694065094, + "step": 402 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1337.0, + "completions/max_terminated_length": 1337.0, + "completions/mean_length": 340.38671875, + "completions/mean_terminated_length": 340.38671875, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.15468825958669186, + "epoch": 0.7132743362831858, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.35526546154810484, + "learning_rate": 1e-06, + "loss": -0.0104, + "num_tokens": 199405054.0, + "reward": 0.5843750238418579, + "reward_std": 0.47219759225845337, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8626350164413452, + "sampling/importance_sampling_ratio/mean": 0.9773305654525757, + "sampling/importance_sampling_ratio/min": 0.0004330030642449856, + "sampling/sampling_logp_difference/max": 7.744765758514404, + "sampling/sampling_logp_difference/mean": 0.15096834301948547, + "step": 403 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1938.0, + "completions/max_terminated_length": 1938.0, + "completions/mean_length": 343.01171875, + "completions/mean_terminated_length": 343.01171875, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.12916157208383083, + "epoch": 0.7150442477876107, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37986404361298337, + "learning_rate": 1e-06, + "loss": 0.0532, + "num_tokens": 200009857.0, + "reward": 0.8033202886581421, + "reward_std": 0.3856732249259949, + "rewards/execution_accuracy_EX/mean": 0.79296875, + "rewards/execution_accuracy_EX/std": 0.40597182512283325, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9800818562507629, + "sampling/importance_sampling_ratio/min": 0.011119124479591846, + "sampling/sampling_logp_difference/max": 4.499088764190674, + "sampling/sampling_logp_difference/mean": 0.13622796535491943, + "step": 404 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1070.0, + "completions/max_terminated_length": 1070.0, + "completions/mean_length": 310.34375, + "completions/mean_terminated_length": 310.34375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.13033237494528294, + "epoch": 0.7168141592920354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.42270490346299705, + "learning_rate": 1e-06, + "loss": -0.0318, + "num_tokens": 200554713.0, + "reward": 0.5658203363418579, + "reward_std": 0.4741697609424591, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.892430067062378, + "sampling/importance_sampling_ratio/mean": 0.9765554666519165, + "sampling/importance_sampling_ratio/min": 0.005313629750162363, + "sampling/sampling_logp_difference/max": 5.237480163574219, + "sampling/sampling_logp_difference/mean": 0.14130495488643646, + "step": 405 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1401.0, + "completions/max_terminated_length": 1401.0, + "completions/mean_length": 426.2265625, + "completions/mean_terminated_length": 426.2265625, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.1708923950791359, + "epoch": 0.7185840707964601, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19902699891012435, + "learning_rate": 1e-06, + "loss": 0.0069, + "num_tokens": 201120515.0, + "reward": 0.5509765148162842, + "reward_std": 0.475218266248703, + "rewards/execution_accuracy_EX/mean": 0.52734375, + "rewards/execution_accuracy_EX/std": 0.5002297759056091, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9810044765472412, + "sampling/importance_sampling_ratio/min": 0.005353882443159819, + "sampling/sampling_logp_difference/max": 5.229933261871338, + "sampling/sampling_logp_difference/mean": 0.16047163307666779, + "step": 406 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 779.0, + "completions/mean_length": 353.83984375, + "completions/mean_terminated_length": 339.16473388671875, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.1377250738441944, + "epoch": 0.720353982300885, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26194346338740876, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 201699914.0, + "reward": 0.4580077826976776, + "reward_std": 0.471381276845932, + "rewards/execution_accuracy_EX/mean": 0.4296875, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9756262898445129, + "sampling/importance_sampling_ratio/min": 0.008682269603013992, + "sampling/sampling_logp_difference/max": 4.746472358703613, + "sampling/sampling_logp_difference/mean": 0.14412958920001984, + "step": 407 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2590.0, + "completions/max_terminated_length": 2590.0, + "completions/mean_length": 426.4765625, + "completions/mean_terminated_length": 426.4765625, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.1585888434201479, + "epoch": 0.7221238938053097, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24600787563356571, + "learning_rate": 1e-06, + "loss": -0.0396, + "num_tokens": 202409444.0, + "reward": 0.49160152673721313, + "reward_std": 0.4747525453567505, + "rewards/execution_accuracy_EX/mean": 0.46484375, + "rewards/execution_accuracy_EX/std": 0.49973952770233154, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9811248779296875, + "sampling/importance_sampling_ratio/min": 0.0052652242593467236, + "sampling/sampling_logp_difference/max": 5.246631622314453, + "sampling/sampling_logp_difference/mean": 0.1513960063457489, + "step": 408 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 976.0, + "completions/max_terminated_length": 976.0, + "completions/mean_length": 320.90234375, + "completions/mean_terminated_length": 320.90234375, + "completions/min_length": 197.0, + "completions/min_terminated_length": 197.0, + "entropy": 0.12159352377057076, + "epoch": 0.7238938053097345, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2573446286270932, + "learning_rate": 1e-06, + "loss": -0.0077, + "num_tokens": 202815371.0, + "reward": 0.5249999761581421, + "reward_std": 0.47593045234680176, + "rewards/execution_accuracy_EX/mean": 0.5, + "rewards/execution_accuracy_EX/std": 0.5009794235229492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9761395454406738, + "sampling/importance_sampling_ratio/min": 0.011183848604559898, + "sampling/sampling_logp_difference/max": 4.493284702301025, + "sampling/sampling_logp_difference/mean": 0.13443967700004578, + "step": 409 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1837.0, + "completions/mean_length": 376.0078125, + "completions/mean_terminated_length": 361.41961669921875, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.13782457448542118, + "epoch": 0.7256637168141593, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26690984263834117, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 203581469.0, + "reward": 0.5248047113418579, + "reward_std": 0.47613638639450073, + "rewards/execution_accuracy_EX/mean": 0.5, + "rewards/execution_accuracy_EX/std": 0.5009794235229492, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9774079918861389, + "sampling/importance_sampling_ratio/min": 0.0020053053740411997, + "sampling/sampling_logp_difference/max": 6.211958885192871, + "sampling/sampling_logp_difference/mean": 0.14452630281448364, + "step": 410 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1237.0, + "completions/mean_length": 405.08203125, + "completions/mean_terminated_length": 390.6078796386719, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.13971972465515137, + "epoch": 0.727433628318584, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1622591965922765, + "learning_rate": 1e-06, + "loss": -0.0044, + "num_tokens": 204176818.0, + "reward": 0.5693359375, + "reward_std": 0.4740595817565918, + "rewards/execution_accuracy_EX/mean": 0.546875, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 1.9836039543151855, + "sampling/importance_sampling_ratio/mean": 0.9794758558273315, + "sampling/importance_sampling_ratio/min": 0.0025437939912080765, + "sampling/sampling_logp_difference/max": 5.9740986824035645, + "sampling/sampling_logp_difference/mean": 0.13981348276138306, + "step": 411 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 974.0, + "completions/max_terminated_length": 974.0, + "completions/mean_length": 338.421875, + "completions/mean_terminated_length": 338.421875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.12366738822311163, + "epoch": 0.7292035398230089, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5099800386456143, + "learning_rate": 1e-06, + "loss": 0.0058, + "num_tokens": 204702526.0, + "reward": 0.725390613079071, + "reward_std": 0.4315042495727539, + "rewards/execution_accuracy_EX/mean": 0.7109375, + "rewards/execution_accuracy_EX/std": 0.45421501994132996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9769878387451172, + "sampling/importance_sampling_ratio/min": 0.004092916380614042, + "sampling/sampling_logp_difference/max": 5.498497486114502, + "sampling/sampling_logp_difference/mean": 0.13305461406707764, + "step": 412 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 668.0, + "completions/max_terminated_length": 668.0, + "completions/mean_length": 296.91015625, + "completions/mean_terminated_length": 296.91015625, + "completions/min_length": 211.0, + "completions/min_terminated_length": 211.0, + "entropy": 0.11305101774632931, + "epoch": 0.7309734513274336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32362625799769246, + "learning_rate": 1e-06, + "loss": -0.0, + "num_tokens": 205312423.0, + "reward": 0.7513671517372131, + "reward_std": 0.4184097647666931, + "rewards/execution_accuracy_EX/mean": 0.73828125, + "rewards/execution_accuracy_EX/std": 0.4404313564300537, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9774864912033081, + "sampling/importance_sampling_ratio/min": 0.006741675082594156, + "sampling/sampling_logp_difference/max": 4.999446868896484, + "sampling/sampling_logp_difference/mean": 0.12930026650428772, + "step": 413 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1164.0, + "completions/max_terminated_length": 1164.0, + "completions/mean_length": 301.6953125, + "completions/mean_terminated_length": 301.6953125, + "completions/min_length": 138.0, + "completions/min_terminated_length": 138.0, + "entropy": 0.12935538683086634, + "epoch": 0.7327433628318584, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30840177983682704, + "learning_rate": 1e-06, + "loss": -0.0184, + "num_tokens": 205727561.0, + "reward": 0.6400390863418579, + "reward_std": 0.4617617428302765, + "rewards/execution_accuracy_EX/mean": 0.62109375, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9756845831871033, + "sampling/importance_sampling_ratio/min": 0.0052536651492118835, + "sampling/sampling_logp_difference/max": 5.248829364776611, + "sampling/sampling_logp_difference/mean": 0.14531688392162323, + "step": 414 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1171.0, + "completions/max_terminated_length": 1171.0, + "completions/mean_length": 308.29296875, + "completions/mean_terminated_length": 308.29296875, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.11433220468461514, + "epoch": 0.7345132743362832, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32537855064281407, + "learning_rate": 1e-06, + "loss": -0.005, + "num_tokens": 206225012.0, + "reward": 0.699414074420929, + "reward_std": 0.44268524646759033, + "rewards/execution_accuracy_EX/mean": 0.68359375, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9792124032974243, + "sampling/importance_sampling_ratio/min": 0.001520410762168467, + "sampling/sampling_logp_difference/max": 6.48877477645874, + "sampling/sampling_logp_difference/mean": 0.128114253282547, + "step": 415 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 879.0, + "completions/max_terminated_length": 879.0, + "completions/mean_length": 330.4765625, + "completions/mean_terminated_length": 330.4765625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.11765911988914013, + "epoch": 0.736283185840708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13226726347536988, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 206701230.0, + "reward": 0.632617175579071, + "reward_std": 0.46355465054512024, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9782488346099854, + "sampling/importance_sampling_ratio/min": 0.005450984928756952, + "sampling/sampling_logp_difference/max": 5.211958885192871, + "sampling/sampling_logp_difference/mean": 0.1290372759103775, + "step": 416 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1738.0, + "completions/max_terminated_length": 1738.0, + "completions/mean_length": 331.71484375, + "completions/mean_terminated_length": 331.71484375, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.1193550219759345, + "epoch": 0.7380530973451327, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1995712844401606, + "learning_rate": 1e-06, + "loss": 0.0034, + "num_tokens": 207215781.0, + "reward": 0.669726550579071, + "reward_std": 0.4533011019229889, + "rewards/execution_accuracy_EX/mean": 0.65234375, + "rewards/execution_accuracy_EX/std": 0.4771590530872345, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9766120910644531, + "sampling/importance_sampling_ratio/min": 0.005258153658360243, + "sampling/sampling_logp_difference/max": 5.2479753494262695, + "sampling/sampling_logp_difference/mean": 0.13726986944675446, + "step": 417 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1433.0, + "completions/mean_length": 390.8203125, + "completions/mean_terminated_length": 361.6456604003906, + "completions/min_length": 159.0, + "completions/min_terminated_length": 159.0, + "entropy": 0.132203945890069, + "epoch": 0.7398230088495575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2415576277838756, + "learning_rate": 1e-06, + "loss": -0.0011, + "num_tokens": 207701815.0, + "reward": 0.717578113079071, + "reward_std": 0.43551141023635864, + "rewards/execution_accuracy_EX/mean": 0.703125, + "rewards/execution_accuracy_EX/std": 0.45777595043182373, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9799232482910156, + "sampling/importance_sampling_ratio/min": 0.006748078390955925, + "sampling/sampling_logp_difference/max": 4.998497486114502, + "sampling/sampling_logp_difference/mean": 0.13808873295783997, + "step": 418 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1667.0, + "completions/max_terminated_length": 1667.0, + "completions/mean_length": 314.91015625, + "completions/mean_terminated_length": 314.91015625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.11113210208714008, + "epoch": 0.7415929203539823, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09458253582339968, + "learning_rate": 1e-06, + "loss": -0.0015, + "num_tokens": 208343072.0, + "reward": 0.7587890625, + "reward_std": 0.4142923355102539, + "rewards/execution_accuracy_EX/mean": 0.74609375, + "rewards/execution_accuracy_EX/std": 0.4360972046852112, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8623942136764526, + "sampling/importance_sampling_ratio/mean": 0.9757764935493469, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.13390567898750305, + "step": 419 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1395.0, + "completions/max_terminated_length": 1395.0, + "completions/mean_length": 391.25, + "completions/mean_terminated_length": 391.25, + "completions/min_length": 180.0, + "completions/min_terminated_length": 180.0, + "entropy": 0.12792749982327223, + "epoch": 0.7433628318584071, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2103665825522292, + "learning_rate": 1e-06, + "loss": 0.005, + "num_tokens": 209057408.0, + "reward": 0.4730468690395355, + "reward_std": 0.47307512164115906, + "rewards/execution_accuracy_EX/mean": 0.4453125, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9786797165870667, + "sampling/importance_sampling_ratio/min": 0.011125700548291206, + "sampling/sampling_logp_difference/max": 4.498497486114502, + "sampling/sampling_logp_difference/mean": 0.1359177827835083, + "step": 420 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1192.0, + "completions/max_terminated_length": 1192.0, + "completions/mean_length": 309.5390625, + "completions/mean_terminated_length": 309.5390625, + "completions/min_length": 172.0, + "completions/min_terminated_length": 172.0, + "entropy": 0.10244856681674719, + "epoch": 0.7451327433628319, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13597446755628906, + "learning_rate": 1e-06, + "loss": 0.0002, + "num_tokens": 209494746.0, + "reward": 0.632617175579071, + "reward_std": 0.46355465054512024, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9756293296813965, + "sampling/importance_sampling_ratio/min": 0.011125764809548855, + "sampling/sampling_logp_difference/max": 4.4984917640686035, + "sampling/sampling_logp_difference/mean": 0.13026078045368195, + "step": 421 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 962.0, + "completions/max_terminated_length": 962.0, + "completions/mean_length": 339.15234375, + "completions/mean_terminated_length": 339.15234375, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.11116266716271639, + "epoch": 0.7469026548672566, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24474655526352976, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 210003345.0, + "reward": 0.740234375, + "reward_std": 0.4242667853832245, + "rewards/execution_accuracy_EX/mean": 0.7265625, + "rewards/execution_accuracy_EX/std": 0.446596622467041, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9771470427513123, + "sampling/importance_sampling_ratio/min": 0.005275054834783077, + "sampling/sampling_logp_difference/max": 5.2447662353515625, + "sampling/sampling_logp_difference/mean": 0.1310897022485733, + "step": 422 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1166.0, + "completions/max_terminated_length": 1166.0, + "completions/mean_length": 323.17578125, + "completions/mean_terminated_length": 323.17578125, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.10073120426386595, + "epoch": 0.7486725663716814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2185985904271473, + "learning_rate": 1e-06, + "loss": 0.0093, + "num_tokens": 210497358.0, + "reward": 0.6066405773162842, + "reward_std": 0.46884801983833313, + "rewards/execution_accuracy_EX/mean": 0.5859375, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9788148403167725, + "sampling/importance_sampling_ratio/min": 0.008668468333780766, + "sampling/sampling_logp_difference/max": 4.748063087463379, + "sampling/sampling_logp_difference/mean": 0.12084464728832245, + "step": 423 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 842.0, + "completions/max_terminated_length": 842.0, + "completions/mean_length": 336.5390625, + "completions/mean_terminated_length": 336.5390625, + "completions/min_length": 200.0, + "completions/min_terminated_length": 200.0, + "entropy": 0.09866121783852577, + "epoch": 0.7504424778761062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3646291725281725, + "learning_rate": 1e-06, + "loss": -0.0184, + "num_tokens": 210924232.0, + "reward": 0.7105468511581421, + "reward_std": 0.43811774253845215, + "rewards/execution_accuracy_EX/mean": 0.6953125, + "rewards/execution_accuracy_EX/std": 0.4611765742301941, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9776594638824463, + "sampling/importance_sampling_ratio/min": 0.006850760895758867, + "sampling/sampling_logp_difference/max": 4.983395576477051, + "sampling/sampling_logp_difference/mean": 0.11906301975250244, + "step": 424 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 623.0, + "completions/max_terminated_length": 623.0, + "completions/mean_length": 309.0, + "completions/mean_terminated_length": 309.0, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.10941006522625685, + "epoch": 0.7522123893805309, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1041127715544993, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 211448024.0, + "reward": 0.6957031488418579, + "reward_std": 0.4441350996494293, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9391770362854004, + "sampling/importance_sampling_ratio/mean": 0.9759188890457153, + "sampling/importance_sampling_ratio/min": 0.005253662820905447, + "sampling/sampling_logp_difference/max": 5.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.1324661374092102, + "step": 425 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1479.0, + "completions/mean_length": 370.87890625, + "completions/mean_terminated_length": 356.2705993652344, + "completions/min_length": 195.0, + "completions/min_terminated_length": 195.0, + "entropy": 0.12021107878535986, + "epoch": 0.7539823008849558, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2874033325364916, + "learning_rate": 1e-06, + "loss": -0.0066, + "num_tokens": 211892185.0, + "reward": 0.6917968392372131, + "reward_std": 0.44584256410598755, + "rewards/execution_accuracy_EX/mean": 0.67578125, + "rewards/execution_accuracy_EX/std": 0.46899911761283875, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 1.8821347951889038, + "sampling/importance_sampling_ratio/mean": 0.9791556596755981, + "sampling/importance_sampling_ratio/min": 0.008726546540856361, + "sampling/sampling_logp_difference/max": 4.741385459899902, + "sampling/sampling_logp_difference/mean": 0.13618966937065125, + "step": 426 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1255.0, + "completions/max_terminated_length": 1255.0, + "completions/mean_length": 318.7265625, + "completions/mean_terminated_length": 318.7265625, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.09897692129015923, + "epoch": 0.7557522123893805, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17419818865642023, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 212368499.0, + "reward": 0.6957031488418579, + "reward_std": 0.44413506984710693, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9803605675697327, + "sampling/importance_sampling_ratio/min": 0.005269622430205345, + "sampling/sampling_logp_difference/max": 5.2457966804504395, + "sampling/sampling_logp_difference/mean": 0.11493834853172302, + "step": 427 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 867.0, + "completions/max_terminated_length": 867.0, + "completions/mean_length": 332.6796875, + "completions/mean_terminated_length": 332.6796875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.09865881130099297, + "epoch": 0.7575221238938054, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26984354637504, + "learning_rate": 1e-06, + "loss": 0.0011, + "num_tokens": 212933489.0, + "reward": 0.595507800579071, + "reward_std": 0.47065800428390503, + "rewards/execution_accuracy_EX/mean": 0.57421875, + "rewards/execution_accuracy_EX/std": 0.49542948603630066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9766672253608704, + "sampling/importance_sampling_ratio/min": 0.004090497270226479, + "sampling/sampling_logp_difference/max": 5.499088764190674, + "sampling/sampling_logp_difference/mean": 0.12172359228134155, + "step": 428 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1338.0, + "completions/max_terminated_length": 1338.0, + "completions/mean_length": 342.16796875, + "completions/mean_terminated_length": 342.16796875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.11332171130925417, + "epoch": 0.7592920353982301, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5388262429331899, + "learning_rate": 1e-06, + "loss": -0.0284, + "num_tokens": 213391980.0, + "reward": 0.6363281011581421, + "reward_std": 0.46267399191856384, + "rewards/execution_accuracy_EX/mean": 0.6171875, + "rewards/execution_accuracy_EX/std": 0.48702529072761536, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9824634790420532, + "sampling/importance_sampling_ratio/min": 0.008850703947246075, + "sampling/sampling_logp_difference/max": 4.727258205413818, + "sampling/sampling_logp_difference/mean": 0.12259416282176971, + "step": 429 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1026.0, + "completions/mean_length": 339.33984375, + "completions/mean_terminated_length": 324.60784912109375, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.10638696234673262, + "epoch": 0.7610619469026548, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.40723506148938576, + "learning_rate": 1e-06, + "loss": -0.0012, + "num_tokens": 214108387.0, + "reward": 0.4839843809604645, + "reward_std": 0.47435954213142395, + "rewards/execution_accuracy_EX/mean": 0.45703125, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9761378169059753, + "sampling/importance_sampling_ratio/min": 0.005292918533086777, + "sampling/sampling_logp_difference/max": 5.241385459899902, + "sampling/sampling_logp_difference/mean": 0.1323973536491394, + "step": 430 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 957.0, + "completions/mean_length": 323.34375, + "completions/mean_terminated_length": 293.6377868652344, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.09915001504123211, + "epoch": 0.7628318584070797, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2657950774754114, + "learning_rate": 1e-06, + "loss": -0.0054, + "num_tokens": 214724827.0, + "reward": 0.62109375, + "reward_std": 0.466510146856308, + "rewards/execution_accuracy_EX/mean": 0.6015625, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9778218269348145, + "sampling/importance_sampling_ratio/min": 0.0023839690256863832, + "sampling/sampling_logp_difference/max": 6.0389885902404785, + "sampling/sampling_logp_difference/mean": 0.12448465079069138, + "step": 431 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 755.0, + "completions/max_terminated_length": 755.0, + "completions/mean_length": 253.078125, + "completions/mean_terminated_length": 253.078125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.0758161349222064, + "epoch": 0.7646017699115044, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.36656361065128307, + "learning_rate": 1e-06, + "loss": 0.001, + "num_tokens": 215296431.0, + "reward": 0.743945300579071, + "reward_std": 0.42235618829727173, + "rewards/execution_accuracy_EX/mean": 0.73046875, + "rewards/execution_accuracy_EX/std": 0.44458550214767456, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.996958613395691, + "sampling/importance_sampling_ratio/mean": 0.9752987623214722, + "sampling/importance_sampling_ratio/min": 0.008661850355565548, + "sampling/sampling_logp_difference/max": 4.74882698059082, + "sampling/sampling_logp_difference/mean": 0.11455568671226501, + "step": 432 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 937.0, + "completions/max_terminated_length": 937.0, + "completions/mean_length": 316.26953125, + "completions/mean_terminated_length": 316.26953125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.08657894004136324, + "epoch": 0.7663716814159292, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3329185795384755, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 215715492.0, + "reward": 0.7291015386581421, + "reward_std": 0.4297545552253723, + "rewards/execution_accuracy_EX/mean": 0.71484375, + "rewards/execution_accuracy_EX/std": 0.4523732364177704, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9779499769210815, + "sampling/importance_sampling_ratio/min": 0.002480123657733202, + "sampling/sampling_logp_difference/max": 5.999446868896484, + "sampling/sampling_logp_difference/mean": 0.11364603787660599, + "step": 433 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 298.28515625, + "completions/mean_terminated_length": 298.28515625, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.08486288227140903, + "epoch": 0.768141592920354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.37818714546091925, + "learning_rate": 1e-06, + "loss": 0.0024, + "num_tokens": 216232189.0, + "reward": 0.6845703125, + "reward_std": 0.44827139377593994, + "rewards/execution_accuracy_EX/mean": 0.66796875, + "rewards/execution_accuracy_EX/std": 0.4718646705150604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9771759510040283, + "sampling/importance_sampling_ratio/min": 0.005371001549065113, + "sampling/sampling_logp_difference/max": 5.226740837097168, + "sampling/sampling_logp_difference/mean": 0.11518388986587524, + "step": 434 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1415.0, + "completions/max_terminated_length": 1415.0, + "completions/mean_length": 314.4921875, + "completions/mean_terminated_length": 314.4921875, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.08155077509582043, + "epoch": 0.7699115044247787, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2974299365827861, + "learning_rate": 1e-06, + "loss": -0.012, + "num_tokens": 216687803.0, + "reward": 0.49531248211860657, + "reward_std": 0.4749999940395355, + "rewards/execution_accuracy_EX/mean": 0.46875, + "rewards/execution_accuracy_EX/std": 0.5, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9413729906082153, + "sampling/importance_sampling_ratio/mean": 0.9788044095039368, + "sampling/importance_sampling_ratio/min": 0.0024801555555313826, + "sampling/sampling_logp_difference/max": 5.999433994293213, + "sampling/sampling_logp_difference/mean": 0.10834769159555435, + "step": 435 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 572.0, + "completions/max_terminated_length": 572.0, + "completions/mean_length": 269.78125, + "completions/mean_terminated_length": 269.78125, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.0770259564742446, + "epoch": 0.7716814159292036, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3477427421826566, + "learning_rate": 1e-06, + "loss": 0.0024, + "num_tokens": 217213539.0, + "reward": 0.7179687023162842, + "reward_std": 0.4348871409893036, + "rewards/execution_accuracy_EX/mean": 0.703125, + "rewards/execution_accuracy_EX/std": 0.45777595043182373, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9755126237869263, + "sampling/importance_sampling_ratio/min": 0.005266683176159859, + "sampling/sampling_logp_difference/max": 5.246354579925537, + "sampling/sampling_logp_difference/mean": 0.11614066362380981, + "step": 436 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 999.0, + "completions/max_terminated_length": 999.0, + "completions/mean_length": 309.98828125, + "completions/mean_terminated_length": 309.98828125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.08747177477926016, + "epoch": 0.7734513274336283, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23042955091273926, + "learning_rate": 1e-06, + "loss": 0.0064, + "num_tokens": 217712144.0, + "reward": 0.5992187261581421, + "reward_std": 0.47008487582206726, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9772858619689941, + "sampling/importance_sampling_ratio/min": 0.006758376490324736, + "sampling/sampling_logp_difference/max": 4.996972560882568, + "sampling/sampling_logp_difference/mean": 0.11812114715576172, + "step": 437 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1162.0, + "completions/max_terminated_length": 1162.0, + "completions/mean_length": 336.046875, + "completions/mean_terminated_length": 336.046875, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.095473762601614, + "epoch": 0.7752212389380531, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10586711541233572, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 218379916.0, + "reward": 0.517578125, + "reward_std": 0.47587236762046814, + "rewards/execution_accuracy_EX/mean": 0.4921875, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9747204780578613, + "sampling/importance_sampling_ratio/min": 0.004099207930266857, + "sampling/sampling_logp_difference/max": 5.49696159362793, + "sampling/sampling_logp_difference/mean": 0.1270812749862671, + "step": 438 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1652.0, + "completions/max_terminated_length": 1652.0, + "completions/mean_length": 277.58984375, + "completions/mean_terminated_length": 277.58984375, + "completions/min_length": 137.0, + "completions/min_terminated_length": 137.0, + "entropy": 0.07738334592431784, + "epoch": 0.7769911504424779, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3806241018322237, + "learning_rate": 1e-06, + "loss": -0.0345, + "num_tokens": 219015459.0, + "reward": 0.47675782442092896, + "reward_std": 0.47346949577331543, + "rewards/execution_accuracy_EX/mean": 0.44921875, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9766601920127869, + "sampling/importance_sampling_ratio/min": 0.008679235354065895, + "sampling/sampling_logp_difference/max": 4.746821880340576, + "sampling/sampling_logp_difference/mean": 0.11376699805259705, + "step": 439 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1640.0, + "completions/max_terminated_length": 1640.0, + "completions/mean_length": 348.04296875, + "completions/mean_terminated_length": 348.04296875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.09423344023525715, + "epoch": 0.7787610619469026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4084732418284597, + "learning_rate": 1e-06, + "loss": 0.0065, + "num_tokens": 219464046.0, + "reward": 0.6437499523162842, + "reward_std": 0.46081769466400146, + "rewards/execution_accuracy_EX/mean": 0.625, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8886933326721191, + "sampling/importance_sampling_ratio/mean": 0.9797013998031616, + "sampling/importance_sampling_ratio/min": 0.0052512455731630325, + "sampling/sampling_logp_difference/max": 5.2492899894714355, + "sampling/sampling_logp_difference/mean": 0.11707621067762375, + "step": 440 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1454.0, + "completions/max_terminated_length": 1454.0, + "completions/mean_length": 330.74609375, + "completions/mean_terminated_length": 330.74609375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.0814787931740284, + "epoch": 0.7805309734513274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4846264441212243, + "learning_rate": 1e-06, + "loss": -0.0116, + "num_tokens": 219979437.0, + "reward": 0.614062488079071, + "reward_std": 0.46748965978622437, + "rewards/execution_accuracy_EX/mean": 0.59375, + "rewards/execution_accuracy_EX/std": 0.49209436774253845, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9803451299667358, + "sampling/importance_sampling_ratio/min": 0.0007139010122045875, + "sampling/sampling_logp_difference/max": 7.2447662353515625, + "sampling/sampling_logp_difference/mean": 0.10679909586906433, + "step": 441 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1268.0, + "completions/mean_length": 318.1328125, + "completions/mean_terminated_length": 303.3176574707031, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.09138348698616028, + "epoch": 0.7823008849557522, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0035199025850439912, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 220368959.0, + "reward": 0.5248047113418579, + "reward_std": 0.47613638639450073, + "rewards/execution_accuracy_EX/mean": 0.5, + "rewards/execution_accuracy_EX/std": 0.5009794235229492, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9822543859481812, + "sampling/importance_sampling_ratio/min": 0.006748310290277004, + "sampling/sampling_logp_difference/max": 4.998463153839111, + "sampling/sampling_logp_difference/mean": 0.11548037827014923, + "step": 442 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1705.0, + "completions/mean_length": 375.08984375, + "completions/mean_terminated_length": 360.498046875, + "completions/min_length": 102.0, + "completions/min_terminated_length": 102.0, + "entropy": 0.09969014022499323, + "epoch": 0.784070796460177, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.7800076888829647, + "learning_rate": 1e-06, + "loss": -0.0169, + "num_tokens": 220829350.0, + "reward": 0.7660156488418579, + "reward_std": 0.4103529453277588, + "rewards/execution_accuracy_EX/mean": 0.75390625, + "rewards/execution_accuracy_EX/std": 0.43157756328582764, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9796102046966553, + "sampling/importance_sampling_ratio/min": 0.004096942488104105, + "sampling/sampling_logp_difference/max": 5.497514247894287, + "sampling/sampling_logp_difference/mean": 0.1221030205488205, + "step": 443 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1707.0, + "completions/max_terminated_length": 1707.0, + "completions/mean_length": 396.296875, + "completions/mean_terminated_length": 396.296875, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.10772426798939705, + "epoch": 0.7858407079646018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2087415195653686, + "learning_rate": 1e-06, + "loss": -0.0113, + "num_tokens": 221508242.0, + "reward": 0.47675779461860657, + "reward_std": 0.47346949577331543, + "rewards/execution_accuracy_EX/mean": 0.44921875, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9800965785980225, + "sampling/importance_sampling_ratio/min": 0.005525479093194008, + "sampling/sampling_logp_difference/max": 5.198385238647461, + "sampling/sampling_logp_difference/mean": 0.12877586483955383, + "step": 444 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1450.0, + "completions/max_terminated_length": 1450.0, + "completions/mean_length": 310.40234375, + "completions/mean_terminated_length": 310.40234375, + "completions/min_length": 115.0, + "completions/min_terminated_length": 115.0, + "entropy": 0.09367150627076626, + "epoch": 0.7876106194690266, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34971138259769186, + "learning_rate": 1e-06, + "loss": 0.0003, + "num_tokens": 222059529.0, + "reward": 0.595507800579071, + "reward_std": 0.47065800428390503, + "rewards/execution_accuracy_EX/mean": 0.57421875, + "rewards/execution_accuracy_EX/std": 0.49542948603630066, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9749582409858704, + "sampling/importance_sampling_ratio/min": 0.006767093203961849, + "sampling/sampling_logp_difference/max": 4.995683670043945, + "sampling/sampling_logp_difference/mean": 0.12886998057365417, + "step": 445 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1591.0, + "completions/max_terminated_length": 1591.0, + "completions/mean_length": 327.578125, + "completions/mean_terminated_length": 327.578125, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.07842460833489895, + "epoch": 0.7893805309734513, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2822770722979932, + "learning_rate": 1e-06, + "loss": 0.0235, + "num_tokens": 222598349.0, + "reward": 0.614062488079071, + "reward_std": 0.46748965978622437, + "rewards/execution_accuracy_EX/mean": 0.59375, + "rewards/execution_accuracy_EX/std": 0.49209436774253845, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8763935565948486, + "sampling/importance_sampling_ratio/mean": 0.981664776802063, + "sampling/importance_sampling_ratio/min": 0.00708415312692523, + "sampling/sampling_logp_difference/max": 4.949894905090332, + "sampling/sampling_logp_difference/mean": 0.10455193370580673, + "step": 446 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1348.0, + "completions/max_terminated_length": 1348.0, + "completions/mean_length": 317.0703125, + "completions/mean_terminated_length": 317.0703125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.09262909553945065, + "epoch": 0.7911504424778761, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3778221159657832, + "learning_rate": 1e-06, + "loss": 0.008, + "num_tokens": 222976927.0, + "reward": 0.6548827886581421, + "reward_std": 0.45779263973236084, + "rewards/execution_accuracy_EX/mean": 0.63671875, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9781402349472046, + "sampling/importance_sampling_ratio/mean": 0.9780029654502869, + "sampling/importance_sampling_ratio/min": 0.00409080320969224, + "sampling/sampling_logp_difference/max": 5.499013900756836, + "sampling/sampling_logp_difference/mean": 0.12452073395252228, + "step": 447 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1167.0, + "completions/max_terminated_length": 1167.0, + "completions/mean_length": 330.015625, + "completions/mean_terminated_length": 330.015625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.0872789965942502, + "epoch": 0.7929203539823009, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3657236297441997, + "learning_rate": 1e-06, + "loss": 0.0138, + "num_tokens": 223373923.0, + "reward": 0.5658202767372131, + "reward_std": 0.4741697609424591, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9787552356719971, + "sampling/importance_sampling_ratio/min": 0.006754662375897169, + "sampling/sampling_logp_difference/max": 4.997522354125977, + "sampling/sampling_logp_difference/mean": 0.11839602887630463, + "step": 448 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1659.0, + "completions/max_terminated_length": 1659.0, + "completions/mean_length": 321.67578125, + "completions/mean_terminated_length": 321.67578125, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.08092036191374063, + "epoch": 0.7946902654867256, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2651752619147164, + "learning_rate": 1e-06, + "loss": -0.0088, + "num_tokens": 223736000.0, + "reward": 0.5806640386581421, + "reward_std": 0.4726512134075165, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9768083095550537, + "sampling/importance_sampling_ratio/min": 0.005254730116575956, + "sampling/sampling_logp_difference/max": 5.248626708984375, + "sampling/sampling_logp_difference/mean": 0.11782976984977722, + "step": 449 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1105.0, + "completions/max_terminated_length": 1105.0, + "completions/mean_length": 306.34765625, + "completions/mean_terminated_length": 306.34765625, + "completions/min_length": 120.0, + "completions/min_terminated_length": 120.0, + "entropy": 0.07939030043780804, + "epoch": 0.7964601769911505, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31977415414022614, + "learning_rate": 1e-06, + "loss": -0.01, + "num_tokens": 224237881.0, + "reward": 0.6957031488418579, + "reward_std": 0.44413506984710693, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9801115989685059, + "sampling/importance_sampling_ratio/min": 0.005262570921331644, + "sampling/sampling_logp_difference/max": 5.247135639190674, + "sampling/sampling_logp_difference/mean": 0.10750046372413635, + "step": 450 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1044.0, + "completions/max_terminated_length": 1044.0, + "completions/mean_length": 316.109375, + "completions/mean_terminated_length": 316.109375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.09447643160820007, + "epoch": 0.7982300884955752, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2655966100963604, + "learning_rate": 1e-06, + "loss": -0.0032, + "num_tokens": 224989829.0, + "reward": 0.8701171875, + "reward_std": 0.32701200246810913, + "rewards/execution_accuracy_EX/mean": 0.86328125, + "rewards/execution_accuracy_EX/std": 0.34422317147254944, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9353786706924438, + "sampling/importance_sampling_ratio/mean": 0.9811705350875854, + "sampling/importance_sampling_ratio/min": 0.004089032299816608, + "sampling/sampling_logp_difference/max": 5.499446868896484, + "sampling/sampling_logp_difference/mean": 0.11678052693605423, + "step": 451 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2181.0, + "completions/mean_length": 414.13671875, + "completions/mean_terminated_length": 399.69805908203125, + "completions/min_length": 116.0, + "completions/min_terminated_length": 116.0, + "entropy": 0.10684379749000072, + "epoch": 0.8, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3080515698834411, + "learning_rate": 1e-06, + "loss": 0.0256, + "num_tokens": 225598360.0, + "reward": 0.732617199420929, + "reward_std": 0.4282895624637604, + "rewards/execution_accuracy_EX/mean": 0.71875, + "rewards/execution_accuracy_EX/std": 0.45048993825912476, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 1.997812032699585, + "sampling/importance_sampling_ratio/mean": 0.9780542850494385, + "sampling/importance_sampling_ratio/min": 0.005716521292924881, + "sampling/sampling_logp_difference/max": 5.164394855499268, + "sampling/sampling_logp_difference/mean": 0.13397234678268433, + "step": 452 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 723.0, + "completions/max_terminated_length": 723.0, + "completions/mean_length": 274.3828125, + "completions/mean_terminated_length": 274.3828125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.07503345794975758, + "epoch": 0.8017699115044248, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32222800405943564, + "learning_rate": 1e-06, + "loss": 0.0038, + "num_tokens": 226191178.0, + "reward": 0.517578125, + "reward_std": 0.47587236762046814, + "rewards/execution_accuracy_EX/mean": 0.4921875, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9768715500831604, + "sampling/importance_sampling_ratio/min": 0.001507165958173573, + "sampling/sampling_logp_difference/max": 6.497524261474609, + "sampling/sampling_logp_difference/mean": 0.11086973547935486, + "step": 453 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1686.0, + "completions/max_terminated_length": 1686.0, + "completions/mean_length": 346.09375, + "completions/mean_terminated_length": 346.09375, + "completions/min_length": 187.0, + "completions/min_terminated_length": 187.0, + "entropy": 0.08223924785852432, + "epoch": 0.8035398230088495, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26762464550604864, + "learning_rate": 1e-06, + "loss": -0.0003, + "num_tokens": 226621234.0, + "reward": 0.5658203363418579, + "reward_std": 0.4741697609424591, + "rewards/execution_accuracy_EX/mean": 0.54296875, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9794480204582214, + "sampling/importance_sampling_ratio/min": 0.0051038190722465515, + "sampling/sampling_logp_difference/max": 5.277766227722168, + "sampling/sampling_logp_difference/mean": 0.10982825607061386, + "step": 454 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1222.0, + "completions/max_terminated_length": 1222.0, + "completions/mean_length": 336.48046875, + "completions/mean_terminated_length": 336.48046875, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.08400444034487009, + "epoch": 0.8053097345132744, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4363384259899209, + "learning_rate": 1e-06, + "loss": 0.0042, + "num_tokens": 227079309.0, + "reward": 0.6103515625, + "reward_std": 0.46818408370018005, + "rewards/execution_accuracy_EX/mean": 0.58984375, + "rewards/execution_accuracy_EX/std": 0.49282538890838623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9790494441986084, + "sampling/importance_sampling_ratio/min": 0.004761462565511465, + "sampling/sampling_logp_difference/max": 5.347200393676758, + "sampling/sampling_logp_difference/mean": 0.11057664453983307, + "step": 455 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1864.0, + "completions/max_terminated_length": 1864.0, + "completions/mean_length": 291.53515625, + "completions/mean_terminated_length": 291.53515625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.07178681809455156, + "epoch": 0.8070796460176991, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3319572345197162, + "learning_rate": 1e-06, + "loss": 0.0106, + "num_tokens": 227467366.0, + "reward": 0.5806640386581421, + "reward_std": 0.4726512134075165, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9800399541854858, + "sampling/importance_sampling_ratio/min": 0.0056649609468877316, + "sampling/sampling_logp_difference/max": 5.173455238342285, + "sampling/sampling_logp_difference/mean": 0.09974545240402222, + "step": 456 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1124.0, + "completions/mean_length": 318.5390625, + "completions/mean_terminated_length": 303.7254943847656, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.09256428107619286, + "epoch": 0.8088495575221238, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30369054606532936, + "learning_rate": 1e-06, + "loss": 0.0384, + "num_tokens": 228061648.0, + "reward": 0.6175780892372131, + "reward_std": 0.4670134484767914, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9786026477813721, + "sampling/importance_sampling_ratio/min": 0.001931285485625267, + "sampling/sampling_logp_difference/max": 6.249569416046143, + "sampling/sampling_logp_difference/mean": 0.11863424628973007, + "step": 457 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1378.0, + "completions/mean_length": 316.33203125, + "completions/mean_terminated_length": 301.50982666015625, + "completions/min_length": 125.0, + "completions/min_terminated_length": 125.0, + "entropy": 0.094835733063519, + "epoch": 0.8106194690265487, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5356222093813154, + "learning_rate": 1e-06, + "loss": 0.0474, + "num_tokens": 228590741.0, + "reward": 0.7771484851837158, + "reward_std": 0.4035811126232147, + "rewards/execution_accuracy_EX/mean": 0.765625, + "rewards/execution_accuracy_EX/std": 0.42443734407424927, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9795012474060059, + "sampling/importance_sampling_ratio/min": 0.006784177850931883, + "sampling/sampling_logp_difference/max": 4.993162155151367, + "sampling/sampling_logp_difference/mean": 0.12180796265602112, + "step": 458 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1197.0, + "completions/max_terminated_length": 1197.0, + "completions/mean_length": 296.68359375, + "completions/mean_terminated_length": 296.68359375, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.0966231282800436, + "epoch": 0.8123893805309734, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3685554782293145, + "learning_rate": 1e-06, + "loss": -0.0019, + "num_tokens": 228984324.0, + "reward": 0.632617175579071, + "reward_std": 0.46355465054512024, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9779250621795654, + "sampling/importance_sampling_ratio/min": 0.004093545023351908, + "sampling/sampling_logp_difference/max": 5.4983439445495605, + "sampling/sampling_logp_difference/mean": 0.12139870226383209, + "step": 459 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 759.0, + "completions/max_terminated_length": 759.0, + "completions/mean_length": 258.25390625, + "completions/mean_terminated_length": 258.25390625, + "completions/min_length": 153.0, + "completions/min_terminated_length": 153.0, + "entropy": 0.07291886862367392, + "epoch": 0.8141592920353983, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5167090050362314, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 229537205.0, + "reward": 0.847851574420929, + "reward_std": 0.3490958511829376, + "rewards/execution_accuracy_EX/mean": 0.83984375, + "rewards/execution_accuracy_EX/std": 0.36746934056282043, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9779763221740723, + "sampling/importance_sampling_ratio/min": 0.005634597036987543, + "sampling/sampling_logp_difference/max": 5.178829669952393, + "sampling/sampling_logp_difference/mean": 0.10613366961479187, + "step": 460 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1291.0, + "completions/max_terminated_length": 1291.0, + "completions/mean_length": 344.578125, + "completions/mean_terminated_length": 344.578125, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.10439775697886944, + "epoch": 0.815929203539823, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3876629389144973, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 230155113.0, + "reward": 0.6029297113418579, + "reward_std": 0.46948155760765076, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8959710597991943, + "sampling/importance_sampling_ratio/mean": 0.9778479337692261, + "sampling/importance_sampling_ratio/min": 0.004101135302335024, + "sampling/sampling_logp_difference/max": 5.496491432189941, + "sampling/sampling_logp_difference/mean": 0.12573343515396118, + "step": 461 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 573.0, + "completions/max_terminated_length": 573.0, + "completions/mean_length": 242.7265625, + "completions/mean_terminated_length": 242.7265625, + "completions/min_length": 121.0, + "completions/min_terminated_length": 121.0, + "entropy": 0.0799469705671072, + "epoch": 0.8176991150442477, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2972234810588866, + "learning_rate": 1e-06, + "loss": 0.0057, + "num_tokens": 230501267.0, + "reward": 0.7216796875, + "reward_std": 0.4332149624824524, + "rewards/execution_accuracy_EX/mean": 0.70703125, + "rewards/execution_accuracy_EX/std": 0.45601576566696167, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.7134068012237549, + "sampling/importance_sampling_ratio/mean": 0.9784244298934937, + "sampling/importance_sampling_ratio/min": 0.008661828935146332, + "sampling/sampling_logp_difference/max": 4.748829364776611, + "sampling/sampling_logp_difference/mean": 0.11004751920700073, + "step": 462 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2012.0, + "completions/max_terminated_length": 2012.0, + "completions/mean_length": 308.67578125, + "completions/mean_terminated_length": 308.67578125, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.09601080138236284, + "epoch": 0.8194690265486726, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4236245912645116, + "learning_rate": 1e-06, + "loss": 0.0001, + "num_tokens": 230888512.0, + "reward": 0.588085949420929, + "reward_std": 0.4717142581939697, + "rewards/execution_accuracy_EX/mean": 0.56640625, + "rewards/execution_accuracy_EX/std": 0.4965413510799408, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9766750335693359, + "sampling/importance_sampling_ratio/min": 0.006748078390955925, + "sampling/sampling_logp_difference/max": 4.998497486114502, + "sampling/sampling_logp_difference/mean": 0.12495654821395874, + "step": 463 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1173.0, + "completions/max_terminated_length": 1173.0, + "completions/mean_length": 301.8671875, + "completions/mean_terminated_length": 301.8671875, + "completions/min_length": 117.0, + "completions/min_terminated_length": 117.0, + "entropy": 0.09849263541400433, + "epoch": 0.8212389380530973, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.270938897089436, + "learning_rate": 1e-06, + "loss": 0.0041, + "num_tokens": 231461182.0, + "reward": 0.43964841961860657, + "reward_std": 0.46818408370018005, + "rewards/execution_accuracy_EX/mean": 0.41015625, + "rewards/execution_accuracy_EX/std": 0.49282538890838623, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.976449728012085, + "sampling/importance_sampling_ratio/min": 0.00628508860245347, + "sampling/sampling_logp_difference/max": 5.069575309753418, + "sampling/sampling_logp_difference/mean": 0.12751615047454834, + "step": 464 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1096.0, + "completions/mean_length": 284.84375, + "completions/mean_terminated_length": 269.8980407714844, + "completions/min_length": 162.0, + "completions/min_terminated_length": 162.0, + "entropy": 0.08508272003382444, + "epoch": 0.8230088495575221, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3992028430125624, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 231903430.0, + "reward": 0.5804687738418579, + "reward_std": 0.47288161516189575, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9783906936645508, + "sampling/importance_sampling_ratio/min": 0.0052665723487734795, + "sampling/sampling_logp_difference/max": 5.246375560760498, + "sampling/sampling_logp_difference/mean": 0.11341390013694763, + "step": 465 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3015.0, + "completions/mean_length": 354.26171875, + "completions/mean_terminated_length": 339.5882568359375, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.1056946087628603, + "epoch": 0.8247787610619469, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24208797497960982, + "learning_rate": 1e-06, + "loss": -0.0009, + "num_tokens": 232385337.0, + "reward": 0.5804687738418579, + "reward_std": 0.47248753905296326, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9797363877296448, + "sampling/importance_sampling_ratio/min": 0.005264295265078545, + "sampling/sampling_logp_difference/max": 5.246808052062988, + "sampling/sampling_logp_difference/mean": 0.12469679117202759, + "step": 466 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2396.0, + "completions/mean_length": 387.91015625, + "completions/mean_terminated_length": 329.0516052246094, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.09944021981209517, + "epoch": 0.8265486725663717, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3333028002812477, + "learning_rate": 1e-06, + "loss": -0.0346, + "num_tokens": 232984002.0, + "reward": 0.5873047113418579, + "reward_std": 0.472648948431015, + "rewards/execution_accuracy_EX/mean": 0.56640625, + "rewards/execution_accuracy_EX/std": 0.4965413510799408, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.981889009475708, + "sampling/importance_sampling_ratio/min": 0.006765489932149649, + "sampling/sampling_logp_difference/max": 4.995920658111572, + "sampling/sampling_logp_difference/mean": 0.11872401833534241, + "step": 467 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1549.0, + "completions/mean_length": 310.57421875, + "completions/mean_terminated_length": 295.72943115234375, + "completions/min_length": 122.0, + "completions/min_terminated_length": 122.0, + "entropy": 0.09511182922869921, + "epoch": 0.8283185840707965, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.177096113735158, + "learning_rate": 1e-06, + "loss": -0.002, + "num_tokens": 233585941.0, + "reward": 0.628710925579071, + "reward_std": 0.4646587371826172, + "rewards/execution_accuracy_EX/mean": 0.609375, + "rewards/execution_accuracy_EX/std": 0.48884621262550354, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 1.8923084735870361, + "sampling/importance_sampling_ratio/mean": 0.9799119830131531, + "sampling/importance_sampling_ratio/min": 0.004091258160769939, + "sampling/sampling_logp_difference/max": 5.498902797698975, + "sampling/sampling_logp_difference/mean": 0.12098175287246704, + "step": 468 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1151.0, + "completions/max_terminated_length": 1151.0, + "completions/mean_length": 295.96875, + "completions/mean_terminated_length": 295.96875, + "completions/min_length": 129.0, + "completions/min_terminated_length": 129.0, + "entropy": 0.11154700815677643, + "epoch": 0.8300884955752212, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.34022177779675594, + "learning_rate": 1e-06, + "loss": -0.029, + "num_tokens": 234076189.0, + "reward": 0.591796875, + "reward_std": 0.471201092004776, + "rewards/execution_accuracy_EX/mean": 0.5703125, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9795331358909607, + "sampling/importance_sampling_ratio/min": 0.008679230697453022, + "sampling/sampling_logp_difference/max": 4.746822357177734, + "sampling/sampling_logp_difference/mean": 0.12765681743621826, + "step": 469 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.01171875, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1358.0, + "completions/mean_length": 353.2421875, + "completions/mean_terminated_length": 308.8616638183594, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.10618323087692261, + "epoch": 0.831858407079646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22207375763996906, + "learning_rate": 1e-06, + "loss": -0.0132, + "num_tokens": 234673403.0, + "reward": 0.602343738079071, + "reward_std": 0.47020477056503296, + "rewards/execution_accuracy_EX/mean": 0.58203125, + "rewards/execution_accuracy_EX/std": 0.49419113993644714, + "rewards/format_reward/mean": 0.98828125, + "rewards/format_reward/std": 0.1078278198838234, + "sampling/importance_sampling_ratio/max": 1.9668482542037964, + "sampling/importance_sampling_ratio/mean": 0.9778167009353638, + "sampling/importance_sampling_ratio/min": 0.006744090002030134, + "sampling/sampling_logp_difference/max": 4.999088764190674, + "sampling/sampling_logp_difference/mean": 0.1282060444355011, + "step": 470 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1409.0, + "completions/max_terminated_length": 1409.0, + "completions/mean_length": 274.1640625, + "completions/mean_terminated_length": 274.1640625, + "completions/min_length": 114.0, + "completions/min_terminated_length": 114.0, + "entropy": 0.07836710195988417, + "epoch": 0.8336283185840708, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.39106157893749205, + "learning_rate": 1e-06, + "loss": -0.0073, + "num_tokens": 235065909.0, + "reward": 0.703125, + "reward_std": 0.44119933247566223, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8677870035171509, + "sampling/importance_sampling_ratio/mean": 0.9805489182472229, + "sampling/importance_sampling_ratio/min": 0.0015083501348271966, + "sampling/sampling_logp_difference/max": 6.496738910675049, + "sampling/sampling_logp_difference/mean": 0.10855324566364288, + "step": 471 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1193.0, + "completions/max_terminated_length": 1193.0, + "completions/mean_length": 304.578125, + "completions/mean_terminated_length": 304.578125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.09662522282451391, + "epoch": 0.8353982300884956, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49762010935734724, + "learning_rate": 1e-06, + "loss": -0.0123, + "num_tokens": 235549065.0, + "reward": 0.632617175579071, + "reward_std": 0.46355465054512024, + "rewards/execution_accuracy_EX/mean": 0.61328125, + "rewards/execution_accuracy_EX/std": 0.4879522919654846, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9155534505844116, + "sampling/importance_sampling_ratio/mean": 0.9793773889541626, + "sampling/importance_sampling_ratio/min": 0.005264220293611288, + "sampling/sampling_logp_difference/max": 5.246822357177734, + "sampling/sampling_logp_difference/mean": 0.11838836967945099, + "step": 472 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1203.0, + "completions/max_terminated_length": 1203.0, + "completions/mean_length": 361.87890625, + "completions/mean_terminated_length": 361.87890625, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.11292469780892134, + "epoch": 0.8371681415929203, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2334895495758685, + "learning_rate": 1e-06, + "loss": 0.0139, + "num_tokens": 235973434.0, + "reward": 0.4878906011581421, + "reward_std": 0.47447580099105835, + "rewards/execution_accuracy_EX/mean": 0.4609375, + "rewards/execution_accuracy_EX/std": 0.4994482398033142, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9886711835861206, + "sampling/importance_sampling_ratio/mean": 0.9788157939910889, + "sampling/importance_sampling_ratio/min": 0.006413101684302092, + "sampling/sampling_logp_difference/max": 5.049412250518799, + "sampling/sampling_logp_difference/mean": 0.12969273328781128, + "step": 473 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1406.0, + "completions/max_terminated_length": 1406.0, + "completions/mean_length": 412.51171875, + "completions/mean_terminated_length": 412.51171875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.13038655370473862, + "epoch": 0.8389380530973451, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26600631244934575, + "learning_rate": 1e-06, + "loss": -0.0104, + "num_tokens": 236574461.0, + "reward": 0.5769531726837158, + "reward_std": 0.47307515144348145, + "rewards/execution_accuracy_EX/mean": 0.5546875, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9779107570648193, + "sampling/importance_sampling_ratio/min": 0.005253662820905447, + "sampling/sampling_logp_difference/max": 5.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.1439025104045868, + "step": 474 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 791.0, + "completions/max_terminated_length": 791.0, + "completions/mean_length": 269.2109375, + "completions/mean_terminated_length": 269.2109375, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.08845985028892756, + "epoch": 0.8407079646017699, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4759736012801882, + "learning_rate": 1e-06, + "loss": 0.0073, + "num_tokens": 237175155.0, + "reward": 0.892382800579071, + "reward_std": 0.3016792833805084, + "rewards/execution_accuracy_EX/mean": 0.88671875, + "rewards/execution_accuracy_EX/std": 0.31755712628364563, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9776943922042847, + "sampling/importance_sampling_ratio/min": 0.007106812205165625, + "sampling/sampling_logp_difference/max": 4.946701526641846, + "sampling/sampling_logp_difference/mean": 0.11410893499851227, + "step": 475 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1234.0, + "completions/mean_length": 336.890625, + "completions/mean_terminated_length": 322.1490478515625, + "completions/min_length": 192.0, + "completions/min_terminated_length": 192.0, + "entropy": 0.09549013525247574, + "epoch": 0.8424778761061947, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.451095405100909, + "learning_rate": 1e-06, + "loss": -0.0459, + "num_tokens": 237681991.0, + "reward": 0.7660155892372131, + "reward_std": 0.4103529453277588, + "rewards/execution_accuracy_EX/mean": 0.75390625, + "rewards/execution_accuracy_EX/std": 0.43157756328582764, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9781780242919922, + "sampling/importance_sampling_ratio/min": 0.0052643753588199615, + "sampling/sampling_logp_difference/max": 5.246792793273926, + "sampling/sampling_logp_difference/mean": 0.12181083858013153, + "step": 476 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2138.0, + "completions/max_terminated_length": 2138.0, + "completions/mean_length": 419.80078125, + "completions/mean_terminated_length": 419.80078125, + "completions/min_length": 128.0, + "completions/min_terminated_length": 128.0, + "entropy": 0.13889087829738855, + "epoch": 0.8442477876106195, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2564480535930519, + "learning_rate": 1e-06, + "loss": -0.0039, + "num_tokens": 238064516.0, + "reward": 0.591796875, + "reward_std": 0.471201092004776, + "rewards/execution_accuracy_EX/mean": 0.5703125, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8539725542068481, + "sampling/importance_sampling_ratio/mean": 0.980320930480957, + "sampling/importance_sampling_ratio/min": 0.004103494342416525, + "sampling/sampling_logp_difference/max": 5.495916366577148, + "sampling/sampling_logp_difference/mean": 0.14981991052627563, + "step": 477 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1503.0, + "completions/max_terminated_length": 1503.0, + "completions/mean_length": 357.73828125, + "completions/mean_terminated_length": 357.73828125, + "completions/min_length": 173.0, + "completions/min_terminated_length": 173.0, + "entropy": 0.10023377742618322, + "epoch": 0.8460176991150442, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13332488800234604, + "learning_rate": 1e-06, + "loss": 0.0005, + "num_tokens": 238531889.0, + "reward": 0.5732421875, + "reward_std": 0.47346949577331543, + "rewards/execution_accuracy_EX/mean": 0.55078125, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9783233404159546, + "sampling/importance_sampling_ratio/min": 0.00525366747751832, + "sampling/sampling_logp_difference/max": 5.248828887939453, + "sampling/sampling_logp_difference/mean": 0.12261553108692169, + "step": 478 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1742.0, + "completions/mean_length": 379.9453125, + "completions/mean_terminated_length": 365.37255859375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.11600113194435835, + "epoch": 0.8477876106194691, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29708866923388877, + "learning_rate": 1e-06, + "loss": -0.0216, + "num_tokens": 239113379.0, + "reward": 0.6509765386581421, + "reward_std": 0.4591008126735687, + "rewards/execution_accuracy_EX/mean": 0.6328125, + "rewards/execution_accuracy_EX/std": 0.48298248648643494, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.980574369430542, + "sampling/importance_sampling_ratio/min": 0.0031994825694710016, + "sampling/sampling_logp_difference/max": 5.7447662353515625, + "sampling/sampling_logp_difference/mean": 0.12897510826587677, + "step": 479 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2453.0, + "completions/max_terminated_length": 2453.0, + "completions/mean_length": 373.86328125, + "completions/mean_terminated_length": 373.86328125, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.11521474458277225, + "epoch": 0.8495575221238938, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1868225823936595, + "learning_rate": 1e-06, + "loss": -0.0118, + "num_tokens": 239471104.0, + "reward": 0.5361328125, + "reward_std": 0.47579970955848694, + "rewards/execution_accuracy_EX/mean": 0.51171875, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9807099103927612, + "sampling/importance_sampling_ratio/min": 0.011119124479591846, + "sampling/sampling_logp_difference/max": 4.499088764190674, + "sampling/sampling_logp_difference/mean": 0.1286713182926178, + "step": 480 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1772.0, + "completions/mean_length": 335.7265625, + "completions/mean_terminated_length": 320.98040771484375, + "completions/min_length": 191.0, + "completions/min_terminated_length": 191.0, + "entropy": 0.0934757087379694, + "epoch": 0.8513274336283185, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19902786838365327, + "learning_rate": 1e-06, + "loss": -0.0117, + "num_tokens": 239957354.0, + "reward": 0.6398437023162842, + "reward_std": 0.4620228111743927, + "rewards/execution_accuracy_EX/mean": 0.62109375, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9837499856948853, + "sampling/importance_sampling_ratio/min": 0.0052512455731630325, + "sampling/sampling_logp_difference/max": 5.2492899894714355, + "sampling/sampling_logp_difference/mean": 0.10776247084140778, + "step": 481 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2343.0, + "completions/max_terminated_length": 2343.0, + "completions/mean_length": 384.7265625, + "completions/mean_terminated_length": 384.7265625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.1303556589409709, + "epoch": 0.8530973451327434, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1347995517132836, + "learning_rate": 1e-06, + "loss": -0.0024, + "num_tokens": 240401876.0, + "reward": 0.5361328125, + "reward_std": 0.47579970955848694, + "rewards/execution_accuracy_EX/mean": 0.51171875, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9799525737762451, + "sampling/importance_sampling_ratio/min": 0.005257649812847376, + "sampling/sampling_logp_difference/max": 5.248071193695068, + "sampling/sampling_logp_difference/mean": 0.14246472716331482, + "step": 482 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2144.0, + "completions/max_terminated_length": 2144.0, + "completions/mean_length": 434.94921875, + "completions/mean_terminated_length": 434.94921875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.12181931920349598, + "epoch": 0.8548672566371681, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2763309347628719, + "learning_rate": 1e-06, + "loss": 0.0027, + "num_tokens": 241004567.0, + "reward": 0.5101562738418579, + "reward_std": 0.47569799423217773, + "rewards/execution_accuracy_EX/mean": 0.484375, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9833311438560486, + "sampling/importance_sampling_ratio/min": 0.0006765146972611547, + "sampling/sampling_logp_difference/max": 7.298556327819824, + "sampling/sampling_logp_difference/mean": 0.1296975165605545, + "step": 483 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1828.0, + "completions/mean_length": 377.12890625, + "completions/mean_terminated_length": 362.54510498046875, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.0899679409340024, + "epoch": 0.856637168141593, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20983163973775368, + "learning_rate": 1e-06, + "loss": -0.0165, + "num_tokens": 241374248.0, + "reward": 0.5990234613418579, + "reward_std": 0.4703242778778076, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9800814390182495, + "sampling/importance_sampling_ratio/min": 0.008661837317049503, + "sampling/sampling_logp_difference/max": 4.748828411102295, + "sampling/sampling_logp_difference/mean": 0.11837954819202423, + "step": 484 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1728.0, + "completions/max_terminated_length": 1728.0, + "completions/mean_length": 448.2109375, + "completions/mean_terminated_length": 448.2109375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.11048067081719637, + "epoch": 0.8584070796460177, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3145437375507862, + "learning_rate": 1e-06, + "loss": 0.0143, + "num_tokens": 241922686.0, + "reward": 0.5992187261581421, + "reward_std": 0.47008487582206726, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9808949828147888, + "sampling/importance_sampling_ratio/min": 0.006867280695587397, + "sampling/sampling_logp_difference/max": 4.980987071990967, + "sampling/sampling_logp_difference/mean": 0.12507322430610657, + "step": 485 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1378.0, + "completions/max_terminated_length": 1378.0, + "completions/mean_length": 329.79296875, + "completions/mean_terminated_length": 329.79296875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.09246771596372128, + "epoch": 0.8601769911504424, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13229801693114943, + "learning_rate": 1e-06, + "loss": 0.0053, + "num_tokens": 242302729.0, + "reward": 0.7587890625, + "reward_std": 0.4142923355102539, + "rewards/execution_accuracy_EX/mean": 0.74609375, + "rewards/execution_accuracy_EX/std": 0.4360972046852112, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9906672239303589, + "sampling/importance_sampling_ratio/mean": 0.9774895906448364, + "sampling/importance_sampling_ratio/min": 0.005810766946524382, + "sampling/sampling_logp_difference/max": 5.148042678833008, + "sampling/sampling_logp_difference/mean": 0.11818039417266846, + "step": 486 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2082.0, + "completions/max_terminated_length": 2082.0, + "completions/mean_length": 453.2421875, + "completions/mean_terminated_length": 453.2421875, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.11817977298051119, + "epoch": 0.8619469026548673, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.10729397252234574, + "learning_rate": 1e-06, + "loss": -0.001, + "num_tokens": 242791847.0, + "reward": 0.6548827886581421, + "reward_std": 0.45779263973236084, + "rewards/execution_accuracy_EX/mean": 0.63671875, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9795247316360474, + "sampling/importance_sampling_ratio/min": 0.001930958591401577, + "sampling/sampling_logp_difference/max": 6.249738693237305, + "sampling/sampling_logp_difference/mean": 0.13629119098186493, + "step": 487 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1224.0, + "completions/mean_length": 368.82421875, + "completions/mean_terminated_length": 339.47637939453125, + "completions/min_length": 184.0, + "completions/min_terminated_length": 184.0, + "entropy": 0.08221569657325745, + "epoch": 0.863716814159292, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2331255969849816, + "learning_rate": 1e-06, + "loss": -0.0151, + "num_tokens": 243339690.0, + "reward": 0.7435546517372131, + "reward_std": 0.4230230450630188, + "rewards/execution_accuracy_EX/mean": 0.73046875, + "rewards/execution_accuracy_EX/std": 0.44458550214767456, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "sampling/importance_sampling_ratio/max": 1.854588270187378, + "sampling/importance_sampling_ratio/mean": 0.9802987575531006, + "sampling/importance_sampling_ratio/min": 0.005265557672828436, + "sampling/sampling_logp_difference/max": 5.246568202972412, + "sampling/sampling_logp_difference/mean": 0.10977109521627426, + "step": 488 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2213.0, + "completions/max_terminated_length": 2213.0, + "completions/mean_length": 406.40234375, + "completions/mean_terminated_length": 406.40234375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.09937719628214836, + "epoch": 0.8654867256637168, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23779578331354234, + "learning_rate": 1e-06, + "loss": -0.0033, + "num_tokens": 243947809.0, + "reward": 0.6957031488418579, + "reward_std": 0.44413506984710693, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9814603328704834, + "sampling/importance_sampling_ratio/min": 0.0031865073833614588, + "sampling/sampling_logp_difference/max": 5.7488298416137695, + "sampling/sampling_logp_difference/mean": 0.11837709695100784, + "step": 489 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1587.0, + "completions/max_terminated_length": 1587.0, + "completions/mean_length": 359.12109375, + "completions/mean_terminated_length": 359.12109375, + "completions/min_length": 150.0, + "completions/min_terminated_length": 150.0, + "entropy": 0.08976826258003712, + "epoch": 0.8672566371681416, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4324991634947946, + "learning_rate": 1e-06, + "loss": -0.0321, + "num_tokens": 244433616.0, + "reward": 0.7736327648162842, + "reward_std": 0.40552324056625366, + "rewards/execution_accuracy_EX/mean": 0.76171875, + "rewards/execution_accuracy_EX/std": 0.4268665909767151, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9764032959938049, + "sampling/importance_sampling_ratio/min": 0.0052512455731630325, + "sampling/sampling_logp_difference/max": 5.2492899894714355, + "sampling/sampling_logp_difference/mean": 0.11896872520446777, + "step": 490 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2296.0, + "completions/max_terminated_length": 2296.0, + "completions/mean_length": 350.80859375, + "completions/mean_terminated_length": 350.80859375, + "completions/min_length": 135.0, + "completions/min_terminated_length": 135.0, + "entropy": 0.08920551370829344, + "epoch": 0.8690265486725663, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15734730094162222, + "learning_rate": 1e-06, + "loss": 0.0099, + "num_tokens": 244870687.0, + "reward": 0.576953113079071, + "reward_std": 0.47307515144348145, + "rewards/execution_accuracy_EX/mean": 0.5546875, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9843152165412903, + "sampling/importance_sampling_ratio/min": 0.005254056304693222, + "sampling/sampling_logp_difference/max": 5.248754978179932, + "sampling/sampling_logp_difference/mean": 0.10570266097784042, + "step": 491 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1604.0, + "completions/max_terminated_length": 1604.0, + "completions/mean_length": 403.25390625, + "completions/mean_terminated_length": 403.25390625, + "completions/min_length": 146.0, + "completions/min_terminated_length": 146.0, + "entropy": 0.09854715596884489, + "epoch": 0.8707964601769912, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15796668590192425, + "learning_rate": 1e-06, + "loss": -0.0038, + "num_tokens": 245341056.0, + "reward": 0.818164050579071, + "reward_std": 0.3744697868824005, + "rewards/execution_accuracy_EX/mean": 0.80859375, + "rewards/execution_accuracy_EX/std": 0.39417871832847595, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9790687561035156, + "sampling/importance_sampling_ratio/min": 0.0052512455731630325, + "sampling/sampling_logp_difference/max": 5.2492899894714355, + "sampling/sampling_logp_difference/mean": 0.12462789565324783, + "step": 492 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1840.0, + "completions/max_terminated_length": 1840.0, + "completions/mean_length": 431.53125, + "completions/mean_terminated_length": 431.53125, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.11714785825461149, + "epoch": 0.8725663716814159, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30365585894340785, + "learning_rate": 1e-06, + "loss": -0.0083, + "num_tokens": 246064200.0, + "reward": 0.539843738079071, + "reward_std": 0.47569799423217773, + "rewards/execution_accuracy_EX/mean": 0.515625, + "rewards/execution_accuracy_EX/std": 0.5007347464561462, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9790074825286865, + "sampling/importance_sampling_ratio/min": 0.004090499132871628, + "sampling/sampling_logp_difference/max": 5.499088287353516, + "sampling/sampling_logp_difference/mean": 0.13717404007911682, + "step": 493 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1528.0, + "completions/max_terminated_length": 1528.0, + "completions/mean_length": 467.015625, + "completions/mean_terminated_length": 467.015625, + "completions/min_length": 203.0, + "completions/min_terminated_length": 203.0, + "entropy": 0.11044671479612589, + "epoch": 0.8743362831858407, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1579492562378529, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 246674044.0, + "reward": 0.5287109613418579, + "reward_std": 0.47591593861579895, + "rewards/execution_accuracy_EX/mean": 0.50390625, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9786334037780762, + "sampling/importance_sampling_ratio/min": 0.005253662820905447, + "sampling/sampling_logp_difference/max": 5.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.13233034312725067, + "step": 494 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3893.0, + "completions/max_terminated_length": 3893.0, + "completions/mean_length": 575.8984375, + "completions/mean_terminated_length": 575.8984375, + "completions/min_length": 157.0, + "completions/min_terminated_length": 157.0, + "entropy": 0.11832599341869354, + "epoch": 0.8761061946902655, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23791347944061128, + "learning_rate": 1e-06, + "loss": -0.0171, + "num_tokens": 247231794.0, + "reward": 0.6214843988418579, + "reward_std": 0.46600863337516785, + "rewards/execution_accuracy_EX/mean": 0.6015625, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9844000339508057, + "sampling/importance_sampling_ratio/min": 0.006744096055626869, + "sampling/sampling_logp_difference/max": 4.999087810516357, + "sampling/sampling_logp_difference/mean": 0.12517523765563965, + "step": 495 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1979.0, + "completions/max_terminated_length": 1979.0, + "completions/mean_length": 431.796875, + "completions/mean_terminated_length": 431.796875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.10006839781999588, + "epoch": 0.8778761061946903, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33967296231070576, + "learning_rate": 1e-06, + "loss": 0.0326, + "num_tokens": 247851294.0, + "reward": 0.6177734136581421, + "reward_std": 0.46676453948020935, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9763818979263306, + "sampling/importance_sampling_ratio/min": 0.0020691403187811375, + "sampling/sampling_logp_difference/max": 6.180622100830078, + "sampling/sampling_logp_difference/mean": 0.13006316125392914, + "step": 496 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1658.0, + "completions/max_terminated_length": 1658.0, + "completions/mean_length": 367.765625, + "completions/mean_terminated_length": 367.765625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.0875820703804493, + "epoch": 0.879646017699115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 248338866.0, + "reward": 0.5843749642372131, + "reward_std": 0.47219762206077576, + "rewards/execution_accuracy_EX/mean": 0.5625, + "rewards/execution_accuracy_EX/std": 0.49705013632774353, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.939441204071045, + "sampling/importance_sampling_ratio/mean": 0.9807998538017273, + "sampling/importance_sampling_ratio/min": 0.0031850412487983704, + "sampling/sampling_logp_difference/max": 5.7492899894714355, + "sampling/sampling_logp_difference/mean": 0.11449206620454788, + "step": 497 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2029.0, + "completions/mean_length": 417.390625, + "completions/mean_terminated_length": 402.9647216796875, + "completions/min_length": 155.0, + "completions/min_terminated_length": 155.0, + "entropy": 0.09304992202669382, + "epoch": 0.8814159292035398, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2856579719446031, + "learning_rate": 1e-06, + "loss": 0.0045, + "num_tokens": 248861206.0, + "reward": 0.6880859136581421, + "reward_std": 0.4472186863422394, + "rewards/execution_accuracy_EX/mean": 0.671875, + "rewards/execution_accuracy_EX/std": 0.47045037150382996, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9826182723045349, + "sampling/importance_sampling_ratio/min": 0.0025853586848825216, + "sampling/sampling_logp_difference/max": 5.95789098739624, + "sampling/sampling_logp_difference/mean": 0.11140456795692444, + "step": 498 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3125.0, + "completions/max_terminated_length": 3125.0, + "completions/mean_length": 478.19140625, + "completions/mean_terminated_length": 478.19140625, + "completions/min_length": 132.0, + "completions/min_terminated_length": 132.0, + "entropy": 0.1192424027249217, + "epoch": 0.8831858407079646, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31405386678781605, + "learning_rate": 1e-06, + "loss": 0.0243, + "num_tokens": 249331079.0, + "reward": 0.6214843392372131, + "reward_std": 0.46600863337516785, + "rewards/execution_accuracy_EX/mean": 0.6015625, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9545390605926514, + "sampling/importance_sampling_ratio/mean": 0.9833201169967651, + "sampling/importance_sampling_ratio/min": 0.004090553615242243, + "sampling/sampling_logp_difference/max": 5.499074935913086, + "sampling/sampling_logp_difference/mean": 0.1283523440361023, + "step": 499 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3463.0, + "completions/max_terminated_length": 3463.0, + "completions/mean_length": 472.64453125, + "completions/mean_terminated_length": 472.64453125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.11922439560294151, + "epoch": 0.8849557522123894, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3128883702994043, + "learning_rate": 1e-06, + "loss": 0.0175, + "num_tokens": 249848524.0, + "reward": 0.5249999761581421, + "reward_std": 0.47593042254447937, + "rewards/execution_accuracy_EX/mean": 0.5, + "rewards/execution_accuracy_EX/std": 0.5009794235229492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9780963659286499, + "sampling/importance_sampling_ratio/min": 0.002482479205355048, + "sampling/sampling_logp_difference/max": 5.998497486114502, + "sampling/sampling_logp_difference/mean": 0.13651864230632782, + "step": 500 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3222.0, + "completions/max_terminated_length": 3222.0, + "completions/mean_length": 538.19140625, + "completions/mean_terminated_length": 538.19140625, + "completions/min_length": 171.0, + "completions/min_terminated_length": 171.0, + "entropy": 0.1208310667425394, + "epoch": 0.8867256637168142, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4005324520398836, + "learning_rate": 1e-06, + "loss": 0.0494, + "num_tokens": 250441805.0, + "reward": 0.781054675579071, + "reward_std": 0.40085992217063904, + "rewards/execution_accuracy_EX/mean": 0.76953125, + "rewards/execution_accuracy_EX/std": 0.4219578504562378, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.980903148651123, + "sampling/importance_sampling_ratio/min": 0.00409121485427022, + "sampling/sampling_logp_difference/max": 5.498913288116455, + "sampling/sampling_logp_difference/mean": 0.13536778092384338, + "step": 501 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0234375, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3866.0, + "completions/mean_length": 592.1875, + "completions/mean_terminated_length": 508.0960388183594, + "completions/min_length": 123.0, + "completions/min_terminated_length": 123.0, + "entropy": 0.12260824535042048, + "epoch": 0.8884955752212389, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.22612955695692616, + "learning_rate": 1e-06, + "loss": 0.0121, + "num_tokens": 251306525.0, + "reward": 0.5906250476837158, + "reward_std": 0.4726126492023468, + "rewards/execution_accuracy_EX/mean": 0.5703125, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 0.9765625, + "rewards/format_reward/std": 0.15158477425575256, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9827229380607605, + "sampling/importance_sampling_ratio/min": 0.0031930049881339073, + "sampling/sampling_logp_difference/max": 5.746792793273926, + "sampling/sampling_logp_difference/mean": 0.13382920622825623, + "step": 502 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2912.0, + "completions/mean_length": 541.49609375, + "completions/mean_terminated_length": 527.556884765625, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.10825845878571272, + "epoch": 0.8902654867256637, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3159933968187752, + "learning_rate": 1e-06, + "loss": 0.0274, + "num_tokens": 251727196.0, + "reward": 0.662109375, + "reward_std": 0.45588722825050354, + "rewards/execution_accuracy_EX/mean": 0.64453125, + "rewards/execution_accuracy_EX/std": 0.4795927405357361, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9812757968902588, + "sampling/importance_sampling_ratio/min": 0.005253662820905447, + "sampling/sampling_logp_difference/max": 5.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.12318531423807144, + "step": 503 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1989.0, + "completions/max_terminated_length": 1989.0, + "completions/mean_length": 486.94921875, + "completions/mean_terminated_length": 486.94921875, + "completions/min_length": 130.0, + "completions/min_terminated_length": 130.0, + "entropy": 0.109563447535038, + "epoch": 0.8920353982300885, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 252416847.0, + "reward": 0.762499988079071, + "reward_std": 0.41216787695884705, + "rewards/execution_accuracy_EX/mean": 0.75, + "rewards/execution_accuracy_EX/std": 0.4338609278202057, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9816642999649048, + "sampling/importance_sampling_ratio/min": 0.00527913635596633, + "sampling/sampling_logp_difference/max": 5.243992805480957, + "sampling/sampling_logp_difference/mean": 0.12530678510665894, + "step": 504 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2113.0, + "completions/max_terminated_length": 2113.0, + "completions/mean_length": 512.69140625, + "completions/mean_terminated_length": 512.69140625, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.1084189796820283, + "epoch": 0.8938053097345132, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09095908302862653, + "learning_rate": 1e-06, + "loss": -0.0029, + "num_tokens": 253025600.0, + "reward": 0.6400390863418579, + "reward_std": 0.4617617428302765, + "rewards/execution_accuracy_EX/mean": 0.62109375, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9807025194168091, + "sampling/importance_sampling_ratio/min": 0.004090502858161926, + "sampling/sampling_logp_difference/max": 5.499087333679199, + "sampling/sampling_logp_difference/mean": 0.12630710005760193, + "step": 505 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1307.0, + "completions/max_terminated_length": 1307.0, + "completions/mean_length": 376.03125, + "completions/mean_terminated_length": 376.03125, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.09384919796139002, + "epoch": 0.8955752212389381, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23427264693869862, + "learning_rate": 1e-06, + "loss": -0.0023, + "num_tokens": 253559592.0, + "reward": 0.6177734136581421, + "reward_std": 0.46676453948020935, + "rewards/execution_accuracy_EX/mean": 0.59765625, + "rewards/execution_accuracy_EX/std": 0.4913311004638672, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9788470268249512, + "sampling/importance_sampling_ratio/min": 0.001591972541064024, + "sampling/sampling_logp_difference/max": 6.442781448364258, + "sampling/sampling_logp_difference/mean": 0.11854994297027588, + "step": 506 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1295.0, + "completions/max_terminated_length": 1295.0, + "completions/mean_length": 388.46875, + "completions/mean_terminated_length": 388.46875, + "completions/min_length": 189.0, + "completions/min_terminated_length": 189.0, + "entropy": 0.09089147672057152, + "epoch": 0.8973451327433628, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.5619963055559248, + "learning_rate": 1e-06, + "loss": 0.0169, + "num_tokens": 254248544.0, + "reward": 0.8441406488418579, + "reward_std": 0.35250478982925415, + "rewards/execution_accuracy_EX/mean": 0.8359375, + "rewards/execution_accuracy_EX/std": 0.3710577189922333, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9788107872009277, + "sampling/importance_sampling_ratio/min": 0.004090497270226479, + "sampling/sampling_logp_difference/max": 5.499088764190674, + "sampling/sampling_logp_difference/mean": 0.11855372041463852, + "step": 507 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2497.0, + "completions/max_terminated_length": 2497.0, + "completions/mean_length": 489.6875, + "completions/mean_terminated_length": 489.6875, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.1044982336461544, + "epoch": 0.8991150442477877, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2921129016530104, + "learning_rate": 1e-06, + "loss": -0.0345, + "num_tokens": 254817392.0, + "reward": 0.5880858898162842, + "reward_std": 0.4717142879962921, + "rewards/execution_accuracy_EX/mean": 0.56640625, + "rewards/execution_accuracy_EX/std": 0.4965413510799408, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9816838502883911, + "sampling/importance_sampling_ratio/min": 0.0040977573953568935, + "sampling/sampling_logp_difference/max": 5.497315406799316, + "sampling/sampling_logp_difference/mean": 0.12222027778625488, + "step": 508 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2209.0, + "completions/mean_length": 439.22265625, + "completions/mean_terminated_length": 424.88238525390625, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.09216785151511431, + "epoch": 0.9008849557522124, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32409460565615733, + "learning_rate": 1e-06, + "loss": -0.0298, + "num_tokens": 255554553.0, + "reward": 0.8179687261581421, + "reward_std": 0.3748847544193268, + "rewards/execution_accuracy_EX/mean": 0.80859375, + "rewards/execution_accuracy_EX/std": 0.39417871832847595, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9796115159988403, + "sampling/importance_sampling_ratio/min": 0.0031879206653684378, + "sampling/sampling_logp_difference/max": 5.748386383056641, + "sampling/sampling_logp_difference/mean": 0.11551597714424133, + "step": 509 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1539.0, + "completions/max_terminated_length": 1539.0, + "completions/mean_length": 416.89453125, + "completions/mean_terminated_length": 416.89453125, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.0894682826474309, + "epoch": 0.9026548672566371, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.267773109103576, + "learning_rate": 1e-06, + "loss": -0.0114, + "num_tokens": 256062830.0, + "reward": 0.699414074420929, + "reward_std": 0.44268524646759033, + "rewards/execution_accuracy_EX/mean": 0.68359375, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9791895151138306, + "sampling/importance_sampling_ratio/min": 0.004094167612493038, + "sampling/sampling_logp_difference/max": 5.498191833496094, + "sampling/sampling_logp_difference/mean": 0.1187569797039032, + "step": 510 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1987.0, + "completions/mean_length": 546.890625, + "completions/mean_terminated_length": 532.9725952148438, + "completions/min_length": 199.0, + "completions/min_terminated_length": 199.0, + "entropy": 0.111790481954813, + "epoch": 0.904424778761062, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1727699268771358, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 256732882.0, + "reward": 0.810546875, + "reward_std": 0.3805904686450958, + "rewards/execution_accuracy_EX/mean": 0.80078125, + "rewards/execution_accuracy_EX/std": 0.40019527077674866, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9822320938110352, + "sampling/importance_sampling_ratio/min": 0.0033309035934507847, + "sampling/sampling_logp_difference/max": 5.704511642456055, + "sampling/sampling_logp_difference/mean": 0.1255459487438202, + "step": 511 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2446.0, + "completions/max_terminated_length": 2446.0, + "completions/mean_length": 473.73046875, + "completions/mean_terminated_length": 473.73046875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.094749940559268, + "epoch": 0.9061946902654867, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.49153464022870086, + "learning_rate": 1e-06, + "loss": 0.0109, + "num_tokens": 257315437.0, + "reward": 0.8923828601837158, + "reward_std": 0.30167925357818604, + "rewards/execution_accuracy_EX/mean": 0.88671875, + "rewards/execution_accuracy_EX/std": 0.31755712628364563, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9789038896560669, + "sampling/importance_sampling_ratio/min": 0.00409361720085144, + "sampling/sampling_logp_difference/max": 5.498326301574707, + "sampling/sampling_logp_difference/mean": 0.12143756449222565, + "step": 512 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1649.0, + "completions/max_terminated_length": 1649.0, + "completions/mean_length": 408.375, + "completions/mean_terminated_length": 408.375, + "completions/min_length": 147.0, + "completions/min_terminated_length": 147.0, + "entropy": 0.08351874630898237, + "epoch": 0.9079646017699115, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.4131325644119373, + "learning_rate": 1e-06, + "loss": -0.0162, + "num_tokens": 257769533.0, + "reward": 0.8404296636581421, + "reward_std": 0.3558422923088074, + "rewards/execution_accuracy_EX/mean": 0.83203125, + "rewards/execution_accuracy_EX/std": 0.3745708465576172, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9636545181274414, + "sampling/importance_sampling_ratio/mean": 0.9805830717086792, + "sampling/importance_sampling_ratio/min": 0.0052512455731630325, + "sampling/sampling_logp_difference/max": 5.2492899894714355, + "sampling/sampling_logp_difference/mean": 0.10943485051393509, + "step": 513 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1727.0, + "completions/max_terminated_length": 1727.0, + "completions/mean_length": 404.546875, + "completions/mean_terminated_length": 404.546875, + "completions/min_length": 141.0, + "completions/min_terminated_length": 141.0, + "entropy": 0.08706046268343925, + "epoch": 0.9097345132743363, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17693289212742777, + "learning_rate": 1e-06, + "loss": -0.0084, + "num_tokens": 258189177.0, + "reward": 0.7699218988418579, + "reward_std": 0.40778404474258423, + "rewards/execution_accuracy_EX/mean": 0.7578125, + "rewards/execution_accuracy_EX/std": 0.4292463958263397, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9794632196426392, + "sampling/importance_sampling_ratio/min": 0.004096901509910822, + "sampling/sampling_logp_difference/max": 5.497524261474609, + "sampling/sampling_logp_difference/mean": 0.1164979562163353, + "step": 514 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2443.0, + "completions/max_terminated_length": 2443.0, + "completions/mean_length": 425.27734375, + "completions/mean_terminated_length": 425.27734375, + "completions/min_length": 152.0, + "completions/min_terminated_length": 152.0, + "entropy": 0.08132558595389128, + "epoch": 0.911504424778761, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32445579593943996, + "learning_rate": 1e-06, + "loss": -0.0264, + "num_tokens": 258727360.0, + "reward": 0.38398435711860657, + "reward_std": 0.4544737637042999, + "rewards/execution_accuracy_EX/mean": 0.3515625, + "rewards/execution_accuracy_EX/std": 0.47839346528053284, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9808209538459778, + "sampling/importance_sampling_ratio/min": 0.002113393973559141, + "sampling/sampling_logp_difference/max": 6.159460067749023, + "sampling/sampling_logp_difference/mean": 0.10712788999080658, + "step": 515 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3404.0, + "completions/max_terminated_length": 3404.0, + "completions/mean_length": 517.51171875, + "completions/mean_terminated_length": 517.51171875, + "completions/min_length": 209.0, + "completions/min_terminated_length": 209.0, + "entropy": 0.09406900592148304, + "epoch": 0.9132743362831859, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17404203103502056, + "learning_rate": 1e-06, + "loss": 0.0117, + "num_tokens": 259274915.0, + "reward": 0.703125, + "reward_std": 0.44119933247566223, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9837716221809387, + "sampling/importance_sampling_ratio/min": 0.003188925562426448, + "sampling/sampling_logp_difference/max": 5.748071193695068, + "sampling/sampling_logp_difference/mean": 0.10998736321926117, + "step": 516 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2474.0, + "completions/mean_length": 600.546875, + "completions/mean_terminated_length": 586.8392333984375, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.11646796762943268, + "epoch": 0.9150442477876106, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15815404113748285, + "learning_rate": 1e-06, + "loss": -0.0135, + "num_tokens": 260107151.0, + "reward": 0.5804687738418579, + "reward_std": 0.47288164496421814, + "rewards/execution_accuracy_EX/mean": 0.55859375, + "rewards/execution_accuracy_EX/std": 0.4975275993347168, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9820814728736877, + "sampling/importance_sampling_ratio/min": 0.0005535886157304049, + "sampling/sampling_logp_difference/max": 7.499088764190674, + "sampling/sampling_logp_difference/mean": 0.1287885457277298, + "step": 517 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1655.0, + "completions/max_terminated_length": 1655.0, + "completions/mean_length": 444.37890625, + "completions/mean_terminated_length": 444.37890625, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.08392216078937054, + "epoch": 0.9168141592920354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2918197477798191, + "learning_rate": 1e-06, + "loss": 0.0296, + "num_tokens": 260586608.0, + "reward": 0.6251952648162842, + "reward_std": 0.4652217924594879, + "rewards/execution_accuracy_EX/mean": 0.60546875, + "rewards/execution_accuracy_EX/std": 0.48970720171928406, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9797810316085815, + "sampling/importance_sampling_ratio/min": 0.004092922434210777, + "sampling/sampling_logp_difference/max": 5.498496055603027, + "sampling/sampling_logp_difference/mean": 0.11067145317792892, + "step": 518 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.015625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3870.0, + "completions/mean_length": 763.07421875, + "completions/mean_terminated_length": 710.170654296875, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.11816421616822481, + "epoch": 0.9185840707964602, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14280175240062037, + "learning_rate": 1e-06, + "loss": -0.0063, + "num_tokens": 261095315.0, + "reward": 0.598437488079071, + "reward_std": 0.4710412621498108, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 0.984375, + "rewards/format_reward/std": 0.12426253408193588, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9847060441970825, + "sampling/importance_sampling_ratio/min": 0.005259410012513399, + "sampling/sampling_logp_difference/max": 5.24773645401001, + "sampling/sampling_logp_difference/mean": 0.1283852905035019, + "step": 519 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1585.0, + "completions/max_terminated_length": 1585.0, + "completions/mean_length": 456.22265625, + "completions/mean_terminated_length": 456.22265625, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.0838457578793168, + "epoch": 0.9203539823008849, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18157275520876287, + "learning_rate": 1e-06, + "loss": 0.0007, + "num_tokens": 261569372.0, + "reward": 0.688281238079071, + "reward_std": 0.44692784547805786, + "rewards/execution_accuracy_EX/mean": 0.671875, + "rewards/execution_accuracy_EX/std": 0.47045037150382996, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9820669889450073, + "sampling/importance_sampling_ratio/min": 0.002480123657733202, + "sampling/sampling_logp_difference/max": 5.999446868896484, + "sampling/sampling_logp_difference/mean": 0.10748156905174255, + "step": 520 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2600.0, + "completions/max_terminated_length": 2600.0, + "completions/mean_length": 598.69921875, + "completions/mean_terminated_length": 598.69921875, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.09875625185668468, + "epoch": 0.9221238938053097, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21427103776619694, + "learning_rate": 1e-06, + "loss": 0.0092, + "num_tokens": 262117679.0, + "reward": 0.5138671398162842, + "reward_std": 0.47579970955848694, + "rewards/execution_accuracy_EX/mean": 0.48828125, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9818670153617859, + "sampling/importance_sampling_ratio/min": 0.0031865073833614588, + "sampling/sampling_logp_difference/max": 5.7488298416137695, + "sampling/sampling_logp_difference/mean": 0.11636389791965485, + "step": 521 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2034.0, + "completions/max_terminated_length": 2034.0, + "completions/mean_length": 535.44140625, + "completions/mean_terminated_length": 535.44140625, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.09476997330784798, + "epoch": 0.9238938053097345, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.14134909702894238, + "learning_rate": 1e-06, + "loss": 0.0004, + "num_tokens": 262652528.0, + "reward": 0.5249999761581421, + "reward_std": 0.47593045234680176, + "rewards/execution_accuracy_EX/mean": 0.5, + "rewards/execution_accuracy_EX/std": 0.5009794235229492, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9805616140365601, + "sampling/importance_sampling_ratio/min": 0.004092916380614042, + "sampling/sampling_logp_difference/max": 5.498497486114502, + "sampling/sampling_logp_difference/mean": 0.11794306337833405, + "step": 522 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2073.0, + "completions/max_terminated_length": 2073.0, + "completions/mean_length": 624.69921875, + "completions/mean_terminated_length": 624.69921875, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.10736214555799961, + "epoch": 0.9256637168141593, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.29183791822517186, + "learning_rate": 1e-06, + "loss": 0.0139, + "num_tokens": 263290339.0, + "reward": 0.9146484136581421, + "reward_std": 0.2721920311450958, + "rewards/execution_accuracy_EX/mean": 0.91015625, + "rewards/execution_accuracy_EX/std": 0.2865179479122162, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9826004505157471, + "sampling/importance_sampling_ratio/min": 0.0024888834450393915, + "sampling/sampling_logp_difference/max": 5.9959211349487305, + "sampling/sampling_logp_difference/mean": 0.12244834750890732, + "step": 523 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1350.0, + "completions/max_terminated_length": 1350.0, + "completions/mean_length": 479.49609375, + "completions/mean_terminated_length": 479.49609375, + "completions/min_length": 224.0, + "completions/min_terminated_length": 224.0, + "entropy": 0.09032739698886871, + "epoch": 0.9274336283185841, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09007119235338681, + "learning_rate": 1e-06, + "loss": 0.0008, + "num_tokens": 263705634.0, + "reward": 0.6400390267372131, + "reward_std": 0.4617617428302765, + "rewards/execution_accuracy_EX/mean": 0.62109375, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.977393627166748, + "sampling/importance_sampling_ratio/min": 0.005295821465551853, + "sampling/sampling_logp_difference/max": 5.240837097167969, + "sampling/sampling_logp_difference/mean": 0.12161089479923248, + "step": 524 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2266.0, + "completions/max_terminated_length": 2266.0, + "completions/mean_length": 605.0390625, + "completions/mean_terminated_length": 605.0390625, + "completions/min_length": 170.0, + "completions/min_terminated_length": 170.0, + "entropy": 0.09797837119549513, + "epoch": 0.9292035398230089, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3552627690830833, + "learning_rate": 1e-06, + "loss": 0.0264, + "num_tokens": 264163196.0, + "reward": 0.7847656011581421, + "reward_std": 0.39845579862594604, + "rewards/execution_accuracy_EX/mean": 0.7734375, + "rewards/execution_accuracy_EX/std": 0.41942715644836426, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9794970750808716, + "sampling/importance_sampling_ratio/min": 0.00121711113024503, + "sampling/sampling_logp_difference/max": 6.711275100708008, + "sampling/sampling_logp_difference/mean": 0.12153811752796173, + "step": 525 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2605.0, + "completions/mean_length": 738.27734375, + "completions/mean_terminated_length": 725.10986328125, + "completions/min_length": 133.0, + "completions/min_terminated_length": 133.0, + "entropy": 0.11737409885972738, + "epoch": 0.9309734513274336, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06868158913692704, + "learning_rate": 1e-06, + "loss": -0.0004, + "num_tokens": 264607667.0, + "reward": 0.69921875, + "reward_std": 0.44298383593559265, + "rewards/execution_accuracy_EX/mean": 0.68359375, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.983741819858551, + "sampling/importance_sampling_ratio/min": 0.004089032299816608, + "sampling/sampling_logp_difference/max": 5.499446868896484, + "sampling/sampling_logp_difference/mean": 0.12724415957927704, + "step": 526 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1629.0, + "completions/max_terminated_length": 1629.0, + "completions/mean_length": 482.62109375, + "completions/mean_terminated_length": 482.62109375, + "completions/min_length": 131.0, + "completions/min_terminated_length": 131.0, + "entropy": 0.08801850583404303, + "epoch": 0.9327433628318584, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1981702316256674, + "learning_rate": 1e-06, + "loss": -0.0028, + "num_tokens": 265250706.0, + "reward": 0.6214843988418579, + "reward_std": 0.46600866317749023, + "rewards/execution_accuracy_EX/mean": 0.6015625, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9793202877044678, + "sampling/importance_sampling_ratio/min": 0.005254073534160852, + "sampling/sampling_logp_difference/max": 5.248751640319824, + "sampling/sampling_logp_difference/mean": 0.11619452387094498, + "step": 527 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1949.0, + "completions/mean_length": 604.30078125, + "completions/mean_terminated_length": 590.6078491210938, + "completions/min_length": 167.0, + "completions/min_terminated_length": 167.0, + "entropy": 0.11058936733752489, + "epoch": 0.9345132743362832, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.13104503028291592, + "learning_rate": 1e-06, + "loss": -0.0043, + "num_tokens": 265764943.0, + "reward": 0.5285155773162842, + "reward_std": 0.4761233627796173, + "rewards/execution_accuracy_EX/mean": 0.50390625, + "rewards/execution_accuracy_EX/std": 0.5009641647338867, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9812273979187012, + "sampling/importance_sampling_ratio/min": 0.0012502801837399602, + "sampling/sampling_logp_difference/max": 6.684387683868408, + "sampling/sampling_logp_difference/mean": 0.1292923390865326, + "step": 528 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2930.0, + "completions/max_terminated_length": 2930.0, + "completions/mean_length": 801.4140625, + "completions/mean_terminated_length": 801.4140625, + "completions/min_length": 136.0, + "completions/min_terminated_length": 136.0, + "entropy": 0.1286866944283247, + "epoch": 0.9362831858407079, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24626629949481477, + "learning_rate": 1e-06, + "loss": -0.0098, + "num_tokens": 266414441.0, + "reward": 0.513867199420929, + "reward_std": 0.47579970955848694, + "rewards/execution_accuracy_EX/mean": 0.48828125, + "rewards/execution_accuracy_EX/std": 0.5008418560028076, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9831207990646362, + "sampling/importance_sampling_ratio/min": 0.005253662820905447, + "sampling/sampling_logp_difference/max": 5.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.1360650360584259, + "step": 529 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2637.0, + "completions/max_terminated_length": 2637.0, + "completions/mean_length": 689.53125, + "completions/mean_terminated_length": 689.53125, + "completions/min_length": 176.0, + "completions/min_terminated_length": 176.0, + "entropy": 0.12619277369230986, + "epoch": 0.9380530973451328, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20366507624595218, + "learning_rate": 1e-06, + "loss": -0.0002, + "num_tokens": 266961649.0, + "reward": 0.6214843392372131, + "reward_std": 0.46600866317749023, + "rewards/execution_accuracy_EX/mean": 0.6015625, + "rewards/execution_accuracy_EX/std": 0.4905354380607605, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.978034496307373, + "sampling/importance_sampling_ratio/min": 0.005257649812847376, + "sampling/sampling_logp_difference/max": 5.248071193695068, + "sampling/sampling_logp_difference/mean": 0.14427325129508972, + "step": 530 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1736.0, + "completions/max_terminated_length": 1736.0, + "completions/mean_length": 580.36328125, + "completions/mean_terminated_length": 580.36328125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.10494234599173069, + "epoch": 0.9398230088495575, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27675007888714714, + "learning_rate": 1e-06, + "loss": -0.0153, + "num_tokens": 267450094.0, + "reward": 0.703125, + "reward_std": 0.44119933247566223, + "rewards/execution_accuracy_EX/mean": 0.6875, + "rewards/execution_accuracy_EX/std": 0.4644203782081604, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9789376258850098, + "sampling/importance_sampling_ratio/min": 0.0023584607988595963, + "sampling/sampling_logp_difference/max": 6.049746036529541, + "sampling/sampling_logp_difference/mean": 0.12963271141052246, + "step": 531 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2089.0, + "completions/max_terminated_length": 2089.0, + "completions/mean_length": 578.01953125, + "completions/mean_terminated_length": 578.01953125, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.09198547527194023, + "epoch": 0.9415929203539823, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.31963955960204044, + "learning_rate": 1e-06, + "loss": -0.0078, + "num_tokens": 267878435.0, + "reward": 0.669726550579071, + "reward_std": 0.4533011019229889, + "rewards/execution_accuracy_EX/mean": 0.65234375, + "rewards/execution_accuracy_EX/std": 0.4771590530872345, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.9281052350997925, + "sampling/importance_sampling_ratio/mean": 0.9801375269889832, + "sampling/importance_sampling_ratio/min": 0.00674409931525588, + "sampling/sampling_logp_difference/max": 4.999087333679199, + "sampling/sampling_logp_difference/mean": 0.11358733475208282, + "step": 532 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1985.0, + "completions/max_terminated_length": 1985.0, + "completions/mean_length": 469.66796875, + "completions/mean_terminated_length": 469.66796875, + "completions/min_length": 143.0, + "completions/min_terminated_length": 143.0, + "entropy": 0.09485425241291523, + "epoch": 0.9433628318584071, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.06560598067038206, + "learning_rate": 1e-06, + "loss": 0.0014, + "num_tokens": 268358302.0, + "reward": 0.7587890625, + "reward_std": 0.4142923355102539, + "rewards/execution_accuracy_EX/mean": 0.74609375, + "rewards/execution_accuracy_EX/std": 0.4360972046852112, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9779180288314819, + "sampling/importance_sampling_ratio/min": 0.005253662820905447, + "sampling/sampling_logp_difference/max": 5.2488298416137695, + "sampling/sampling_logp_difference/mean": 0.12235035747289658, + "step": 533 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1998.0, + "completions/mean_length": 487.91015625, + "completions/mean_terminated_length": 459.5, + "completions/min_length": 144.0, + "completions/min_terminated_length": 144.0, + "entropy": 0.09254768490791321, + "epoch": 0.9451327433628318, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26639882083187877, + "learning_rate": 1e-06, + "loss": -0.0084, + "num_tokens": 268833959.0, + "reward": 0.8326171636581421, + "reward_std": 0.3631838858127594, + "rewards/execution_accuracy_EX/mean": 0.82421875, + "rewards/execution_accuracy_EX/std": 0.3813795745372772, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9783578515052795, + "sampling/importance_sampling_ratio/min": 0.003186581889167428, + "sampling/sampling_logp_difference/max": 5.748806476593018, + "sampling/sampling_logp_difference/mean": 0.12303325533866882, + "step": 534 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2641.0, + "completions/mean_length": 568.8203125, + "completions/mean_terminated_length": 554.98828125, + "completions/min_length": 139.0, + "completions/min_terminated_length": 139.0, + "entropy": 0.10234727151691914, + "epoch": 0.9469026548672567, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.47445261674977657, + "learning_rate": 1e-06, + "loss": -0.0627, + "num_tokens": 269286217.0, + "reward": 0.881054699420929, + "reward_std": 0.31533122062683105, + "rewards/execution_accuracy_EX/mean": 0.875, + "rewards/execution_accuracy_EX/std": 0.33136674761772156, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.981724202632904, + "sampling/importance_sampling_ratio/min": 0.005258482415229082, + "sampling/sampling_logp_difference/max": 5.247912883758545, + "sampling/sampling_logp_difference/mean": 0.1213020533323288, + "step": 535 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1720.0, + "completions/max_terminated_length": 1720.0, + "completions/mean_length": 506.16796875, + "completions/mean_terminated_length": 506.16796875, + "completions/min_length": 168.0, + "completions/min_terminated_length": 168.0, + "entropy": 0.10132628306746483, + "epoch": 0.9486725663716814, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20337304514034285, + "learning_rate": 1e-06, + "loss": 0.0193, + "num_tokens": 269933188.0, + "reward": 0.740234375, + "reward_std": 0.4242667853832245, + "rewards/execution_accuracy_EX/mean": 0.7265625, + "rewards/execution_accuracy_EX/std": 0.446596622467041, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9774594902992249, + "sampling/importance_sampling_ratio/min": 0.002208918798714876, + "sampling/sampling_logp_difference/max": 6.1152520179748535, + "sampling/sampling_logp_difference/mean": 0.12845507264137268, + "step": 536 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2129.0, + "completions/max_terminated_length": 2129.0, + "completions/mean_length": 602.69921875, + "completions/mean_terminated_length": 602.69921875, + "completions/min_length": 206.0, + "completions/min_terminated_length": 206.0, + "entropy": 0.11411858536303043, + "epoch": 0.9504424778761061, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.26247268454516753, + "learning_rate": 1e-06, + "loss": 0.0151, + "num_tokens": 270587543.0, + "reward": 0.781054675579071, + "reward_std": 0.40085992217063904, + "rewards/execution_accuracy_EX/mean": 0.76953125, + "rewards/execution_accuracy_EX/std": 0.4219578504562378, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9825751781463623, + "sampling/importance_sampling_ratio/min": 0.002506313845515251, + "sampling/sampling_logp_difference/max": 5.9889421463012695, + "sampling/sampling_logp_difference/mean": 0.12803950905799866, + "step": 537 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1466.0, + "completions/max_terminated_length": 1466.0, + "completions/mean_length": 551.98046875, + "completions/mean_terminated_length": 551.98046875, + "completions/min_length": 158.0, + "completions/min_terminated_length": 158.0, + "entropy": 0.10755674354732037, + "epoch": 0.952212389380531, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.0, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 271284898.0, + "reward": 0.6437499523162842, + "reward_std": 0.46081769466400146, + "rewards/execution_accuracy_EX/mean": 0.625, + "rewards/execution_accuracy_EX/std": 0.4850712716579437, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9782460927963257, + "sampling/importance_sampling_ratio/min": 0.0031874829437583685, + "sampling/sampling_logp_difference/max": 5.748523712158203, + "sampling/sampling_logp_difference/mean": 0.1307043731212616, + "step": 538 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3235.0, + "completions/mean_length": 585.19921875, + "completions/mean_terminated_length": 571.431396484375, + "completions/min_length": 193.0, + "completions/min_terminated_length": 193.0, + "entropy": 0.1014731377363205, + "epoch": 0.9539823008849557, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2460155561648805, + "learning_rate": 1e-06, + "loss": 0.0216, + "num_tokens": 271868741.0, + "reward": 0.6732421517372131, + "reward_std": 0.4523758888244629, + "rewards/execution_accuracy_EX/mean": 0.65625, + "rewards/execution_accuracy_EX/std": 0.47588926553726196, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.980535626411438, + "sampling/importance_sampling_ratio/min": 0.002563734073191881, + "sampling/sampling_logp_difference/max": 5.966290473937988, + "sampling/sampling_logp_difference/mean": 0.12229540944099426, + "step": 539 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2161.0, + "completions/max_terminated_length": 2161.0, + "completions/mean_length": 574.2734375, + "completions/mean_terminated_length": 574.2734375, + "completions/min_length": 140.0, + "completions/min_terminated_length": 140.0, + "entropy": 0.10646900534629822, + "epoch": 0.9557522123893806, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.32292854049371705, + "learning_rate": 1e-06, + "loss": 0.0083, + "num_tokens": 272659355.0, + "reward": 0.47675782442092896, + "reward_std": 0.47346949577331543, + "rewards/execution_accuracy_EX/mean": 0.44921875, + "rewards/execution_accuracy_EX/std": 0.49838894605636597, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.894546389579773, + "sampling/importance_sampling_ratio/mean": 0.97877037525177, + "sampling/importance_sampling_ratio/min": 0.0002635188866406679, + "sampling/sampling_logp_difference/max": 8.241385459899902, + "sampling/sampling_logp_difference/mean": 0.12778767943382263, + "step": 540 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2168.0, + "completions/mean_length": 489.64453125, + "completions/mean_terminated_length": 475.5019836425781, + "completions/min_length": 215.0, + "completions/min_terminated_length": 215.0, + "entropy": 0.09318062476813793, + "epoch": 0.9575221238938053, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.1821087107227332, + "learning_rate": 1e-06, + "loss": -0.0179, + "num_tokens": 273140576.0, + "reward": 0.6546875238418579, + "reward_std": 0.4580622911453247, + "rewards/execution_accuracy_EX/mean": 0.63671875, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.977733314037323, + "sampling/importance_sampling_ratio/min": 0.005289058666676283, + "sampling/sampling_logp_difference/max": 5.242115020751953, + "sampling/sampling_logp_difference/mean": 0.12172285467386246, + "step": 541 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2160.0, + "completions/max_terminated_length": 2160.0, + "completions/mean_length": 654.5, + "completions/mean_terminated_length": 654.5, + "completions/min_length": 223.0, + "completions/min_terminated_length": 223.0, + "entropy": 0.12306590657681227, + "epoch": 0.95929203539823, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19859294388031962, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 273753856.0, + "reward": 0.517578125, + "reward_std": 0.47587236762046814, + "rewards/execution_accuracy_EX/mean": 0.4921875, + "rewards/execution_accuracy_EX/std": 0.5009182691574097, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.981598973274231, + "sampling/importance_sampling_ratio/min": 0.0031870543025434017, + "sampling/sampling_logp_difference/max": 5.748658180236816, + "sampling/sampling_logp_difference/mean": 0.13284003734588623, + "step": 542 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1447.0, + "completions/max_terminated_length": 1447.0, + "completions/mean_length": 513.44140625, + "completions/mean_terminated_length": 513.44140625, + "completions/min_length": 185.0, + "completions/min_terminated_length": 185.0, + "entropy": 0.09991770703345537, + "epoch": 0.9610619469026549, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.15342334977114047, + "learning_rate": 1e-06, + "loss": -0.0031, + "num_tokens": 274275281.0, + "reward": 0.62890625, + "reward_std": 0.46440389752388, + "rewards/execution_accuracy_EX/mean": 0.609375, + "rewards/execution_accuracy_EX/std": 0.48884621262550354, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9770808219909668, + "sampling/importance_sampling_ratio/min": 0.0067402091808617115, + "sampling/sampling_logp_difference/max": 4.999664306640625, + "sampling/sampling_logp_difference/mean": 0.1259656548500061, + "step": 543 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2306.0, + "completions/max_terminated_length": 2306.0, + "completions/mean_length": 742.94921875, + "completions/mean_terminated_length": 742.94921875, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.12249947339296341, + "epoch": 0.9628318584070796, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.25436754142506995, + "learning_rate": 1e-06, + "loss": 0.0066, + "num_tokens": 274932052.0, + "reward": 0.6548827886581421, + "reward_std": 0.45779263973236084, + "rewards/execution_accuracy_EX/mean": 0.63671875, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9806671142578125, + "sampling/importance_sampling_ratio/min": 0.004190544597804546, + "sampling/sampling_logp_difference/max": 5.474924564361572, + "sampling/sampling_logp_difference/mean": 0.1349233239889145, + "step": 544 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 2188.0, + "completions/mean_length": 581.171875, + "completions/mean_terminated_length": 567.3882446289062, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.10665471758693457, + "epoch": 0.9646017699115044, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.08188788651492554, + "learning_rate": 1e-06, + "loss": -0.0046, + "num_tokens": 275359216.0, + "reward": 0.6955077648162842, + "reward_std": 0.44443103671073914, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9830989837646484, + "sampling/importance_sampling_ratio/min": 0.0024902531877160072, + "sampling/sampling_logp_difference/max": 5.995370864868164, + "sampling/sampling_logp_difference/mean": 0.1171613335609436, + "step": 545 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3124.0, + "completions/max_terminated_length": 3124.0, + "completions/mean_length": 672.296875, + "completions/mean_terminated_length": 672.296875, + "completions/min_length": 160.0, + "completions/min_terminated_length": 160.0, + "entropy": 0.12414712738245726, + "epoch": 0.9663716814159292, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.17047261584505055, + "learning_rate": 1e-06, + "loss": 0.0328, + "num_tokens": 275936252.0, + "reward": 0.5992187261581421, + "reward_std": 0.4700848460197449, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9840304851531982, + "sampling/importance_sampling_ratio/min": 0.005327270831912756, + "sampling/sampling_logp_difference/max": 5.2349162101745605, + "sampling/sampling_logp_difference/mean": 0.13033321499824524, + "step": 546 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2653.0, + "completions/max_terminated_length": 2653.0, + "completions/mean_length": 722.80859375, + "completions/mean_terminated_length": 722.80859375, + "completions/min_length": 149.0, + "completions/min_terminated_length": 149.0, + "entropy": 0.12042015977203846, + "epoch": 0.968141592920354, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23625603716043173, + "learning_rate": 1e-06, + "loss": -0.0107, + "num_tokens": 276612939.0, + "reward": 0.576953113079071, + "reward_std": 0.47307512164115906, + "rewards/execution_accuracy_EX/mean": 0.5546875, + "rewards/execution_accuracy_EX/std": 0.49797385931015015, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9823607206344604, + "sampling/importance_sampling_ratio/min": 0.004093400668352842, + "sampling/sampling_logp_difference/max": 5.498379230499268, + "sampling/sampling_logp_difference/mean": 0.1306840181350708, + "step": 547 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2369.0, + "completions/max_terminated_length": 2369.0, + "completions/mean_length": 755.6484375, + "completions/mean_terminated_length": 755.6484375, + "completions/min_length": 166.0, + "completions/min_terminated_length": 166.0, + "entropy": 0.13839434459805489, + "epoch": 0.9699115044247788, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24603509269471777, + "learning_rate": 1e-06, + "loss": -0.0081, + "num_tokens": 277319649.0, + "reward": 0.6771484017372131, + "reward_std": 0.4508545994758606, + "rewards/execution_accuracy_EX/mean": 0.66015625, + "rewards/execution_accuracy_EX/std": 0.47458380460739136, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9849244356155396, + "sampling/importance_sampling_ratio/min": 0.0009713406325317919, + "sampling/sampling_logp_difference/max": 6.936833381652832, + "sampling/sampling_logp_difference/mean": 0.1350674033164978, + "step": 548 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1409.0, + "completions/max_terminated_length": 1409.0, + "completions/mean_length": 559.14453125, + "completions/mean_terminated_length": 559.14453125, + "completions/min_length": 183.0, + "completions/min_terminated_length": 183.0, + "entropy": 0.11165465787053108, + "epoch": 0.9716814159292035, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21138017777095772, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 277891766.0, + "reward": 0.7216796875, + "reward_std": 0.4332149624824524, + "rewards/execution_accuracy_EX/mean": 0.70703125, + "rewards/execution_accuracy_EX/std": 0.45601576566696167, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9781904220581055, + "sampling/importance_sampling_ratio/min": 0.0031994825694710016, + "sampling/sampling_logp_difference/max": 5.7447662353515625, + "sampling/sampling_logp_difference/mean": 0.13184812664985657, + "step": 549 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0078125, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 1937.0, + "completions/mean_length": 555.58984375, + "completions/mean_terminated_length": 527.7125854492188, + "completions/min_length": 181.0, + "completions/min_terminated_length": 181.0, + "entropy": 0.10886009782552719, + "epoch": 0.9734513274336283, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.24652408550642546, + "learning_rate": 1e-06, + "loss": 0.004, + "num_tokens": 278449309.0, + "reward": 0.6544921398162842, + "reward_std": 0.45833173394203186, + "rewards/execution_accuracy_EX/mean": 0.63671875, + "rewards/execution_accuracy_EX/std": 0.48188701272010803, + "rewards/format_reward/mean": 0.9921875, + "rewards/format_reward/std": 0.08821486681699753, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9799903035163879, + "sampling/importance_sampling_ratio/min": 0.00248888460919261, + "sampling/sampling_logp_difference/max": 5.995920658111572, + "sampling/sampling_logp_difference/mean": 0.12649500370025635, + "step": 550 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2451.0, + "completions/max_terminated_length": 2451.0, + "completions/mean_length": 728.11328125, + "completions/mean_terminated_length": 728.11328125, + "completions/min_length": 145.0, + "completions/min_terminated_length": 145.0, + "entropy": 0.1434102226048708, + "epoch": 0.9752212389380531, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2282316784079456, + "learning_rate": 1e-06, + "loss": 0.0, + "num_tokens": 278950234.0, + "reward": 0.5695312023162842, + "reward_std": 0.473834365606308, + "rewards/execution_accuracy_EX/mean": 0.546875, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9825781583786011, + "sampling/importance_sampling_ratio/min": 0.00443238252773881, + "sampling/sampling_logp_difference/max": 5.41881799697876, + "sampling/sampling_logp_difference/mean": 0.1440458595752716, + "step": 551 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1593.0, + "completions/max_terminated_length": 1593.0, + "completions/mean_length": 569.9140625, + "completions/mean_terminated_length": 569.9140625, + "completions/min_length": 190.0, + "completions/min_terminated_length": 190.0, + "entropy": 0.11375557817518711, + "epoch": 0.9769911504424779, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.19190393031905698, + "learning_rate": 1e-06, + "loss": 0.0052, + "num_tokens": 279539940.0, + "reward": 0.6957031488418579, + "reward_std": 0.44413506984710693, + "rewards/execution_accuracy_EX/mean": 0.6796875, + "rewards/execution_accuracy_EX/std": 0.4675106406211853, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9792619943618774, + "sampling/importance_sampling_ratio/min": 0.0035934499464929104, + "sampling/sampling_logp_difference/max": 5.628642559051514, + "sampling/sampling_logp_difference/mean": 0.13107334077358246, + "step": 552 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1953.0, + "completions/max_terminated_length": 1953.0, + "completions/mean_length": 695.171875, + "completions/mean_terminated_length": 695.171875, + "completions/min_length": 148.0, + "completions/min_terminated_length": 148.0, + "entropy": 0.14454811066389084, + "epoch": 0.9787610619469026, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.2952323511378513, + "learning_rate": 1e-06, + "loss": 0.0395, + "num_tokens": 280272768.0, + "reward": 0.591796875, + "reward_std": 0.471201092004776, + "rewards/execution_accuracy_EX/mean": 0.5703125, + "rewards/execution_accuracy_EX/std": 0.4960011839866638, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9839590787887573, + "sampling/importance_sampling_ratio/min": 0.005254973191767931, + "sampling/sampling_logp_difference/max": 5.248580455780029, + "sampling/sampling_logp_difference/mean": 0.1397646814584732, + "step": 553 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1536.0, + "completions/max_terminated_length": 1536.0, + "completions/mean_length": 598.0390625, + "completions/mean_terminated_length": 598.0390625, + "completions/min_length": 194.0, + "completions/min_terminated_length": 194.0, + "entropy": 0.1200046269223094, + "epoch": 0.9805309734513274, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.28352458701611716, + "learning_rate": 1e-06, + "loss": -0.0101, + "num_tokens": 280682490.0, + "reward": 0.7847656011581421, + "reward_std": 0.39845579862594604, + "rewards/execution_accuracy_EX/mean": 0.7734375, + "rewards/execution_accuracy_EX/std": 0.41942715644836426, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9784917831420898, + "sampling/importance_sampling_ratio/min": 0.004114833660423756, + "sampling/sampling_logp_difference/max": 5.493156909942627, + "sampling/sampling_logp_difference/mean": 0.1344582438468933, + "step": 554 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2138.0, + "completions/max_terminated_length": 2138.0, + "completions/mean_length": 722.98046875, + "completions/mean_terminated_length": 722.98046875, + "completions/min_length": 297.0, + "completions/min_terminated_length": 297.0, + "entropy": 0.15025673247873783, + "epoch": 0.9823008849557522, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.3030341174630012, + "learning_rate": 1e-06, + "loss": -0.0091, + "num_tokens": 281238661.0, + "reward": 0.48417967557907104, + "reward_std": 0.4741697311401367, + "rewards/execution_accuracy_EX/mean": 0.45703125, + "rewards/execution_accuracy_EX/std": 0.4991260766983032, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.982776403427124, + "sampling/importance_sampling_ratio/min": 0.0040953196585178375, + "sampling/sampling_logp_difference/max": 5.497910499572754, + "sampling/sampling_logp_difference/mean": 0.14415386319160461, + "step": 555 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1895.0, + "completions/max_terminated_length": 1895.0, + "completions/mean_length": 615.2578125, + "completions/mean_terminated_length": 615.2578125, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.1237062867730856, + "epoch": 0.984070796460177, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.18230060179909763, + "learning_rate": 1e-06, + "loss": 0.0085, + "num_tokens": 281776839.0, + "reward": 0.6400390863418579, + "reward_std": 0.4617617428302765, + "rewards/execution_accuracy_EX/mean": 0.62109375, + "rewards/execution_accuracy_EX/std": 0.4860650300979614, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9814517498016357, + "sampling/importance_sampling_ratio/min": 0.01112808845937252, + "sampling/sampling_logp_difference/max": 4.4982829093933105, + "sampling/sampling_logp_difference/mean": 0.13024307787418365, + "step": 556 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1571.0, + "completions/max_terminated_length": 1571.0, + "completions/mean_length": 520.00390625, + "completions/mean_terminated_length": 520.00390625, + "completions/min_length": 127.0, + "completions/min_terminated_length": 127.0, + "entropy": 0.10962794907391071, + "epoch": 0.9858407079646018, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.09861330351470331, + "learning_rate": 1e-06, + "loss": 0.0021, + "num_tokens": 282454296.0, + "reward": 0.6994140148162842, + "reward_std": 0.44268524646759033, + "rewards/execution_accuracy_EX/mean": 0.68359375, + "rewards/execution_accuracy_EX/std": 0.4659844934940338, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.979133665561676, + "sampling/importance_sampling_ratio/min": 0.005249778274446726, + "sampling/sampling_logp_difference/max": 5.249569416046143, + "sampling/sampling_logp_difference/mean": 0.12833350896835327, + "step": 557 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.00390625, + "completions/max_length": 4096.0, + "completions/max_terminated_length": 3972.0, + "completions/mean_length": 766.83984375, + "completions/mean_terminated_length": 753.7843627929688, + "completions/min_length": 126.0, + "completions/min_terminated_length": 126.0, + "entropy": 0.14016254618763924, + "epoch": 0.9876106194690265, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.23413755278570705, + "learning_rate": 1e-06, + "loss": -0.0086, + "num_tokens": 283027583.0, + "reward": 0.7548828125, + "reward_std": 0.4167163670063019, + "rewards/execution_accuracy_EX/mean": 0.7421875, + "rewards/execution_accuracy_EX/std": 0.4382871091365814, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9850025773048401, + "sampling/importance_sampling_ratio/min": 0.003982211463153362, + "sampling/sampling_logp_difference/max": 5.525918006896973, + "sampling/sampling_logp_difference/mean": 0.1391599178314209, + "step": 558 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 1676.0, + "completions/max_terminated_length": 1676.0, + "completions/mean_length": 554.875, + "completions/mean_terminated_length": 554.875, + "completions/min_length": 163.0, + "completions/min_terminated_length": 163.0, + "entropy": 0.11856131162494421, + "epoch": 0.9893805309734514, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.30695062544975965, + "learning_rate": 1e-06, + "loss": 0.0248, + "num_tokens": 283446031.0, + "reward": 0.8441406488418579, + "reward_std": 0.35250481963157654, + "rewards/execution_accuracy_EX/mean": 0.8359375, + "rewards/execution_accuracy_EX/std": 0.3710577189922333, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 1.8930208683013916, + "sampling/importance_sampling_ratio/mean": 0.9805971384048462, + "sampling/importance_sampling_ratio/min": 0.0014977871906012297, + "sampling/sampling_logp_difference/max": 6.5037665367126465, + "sampling/sampling_logp_difference/mean": 0.13229113817214966, + "step": 559 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2161.0, + "completions/max_terminated_length": 2161.0, + "completions/mean_length": 801.5859375, + "completions/mean_terminated_length": 801.5859375, + "completions/min_length": 175.0, + "completions/min_terminated_length": 175.0, + "entropy": 0.15025562047958374, + "epoch": 0.9911504424778761, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.27251349764168376, + "learning_rate": 1e-06, + "loss": 0.0254, + "num_tokens": 284133477.0, + "reward": 0.5695312023162842, + "reward_std": 0.4738343358039856, + "rewards/execution_accuracy_EX/mean": 0.546875, + "rewards/execution_accuracy_EX/std": 0.4987730085849762, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9856324195861816, + "sampling/importance_sampling_ratio/min": 0.0041136350482702255, + "sampling/sampling_logp_difference/max": 5.493448257446289, + "sampling/sampling_logp_difference/mean": 0.13937297463417053, + "step": 560 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2816.0, + "completions/max_terminated_length": 2816.0, + "completions/mean_length": 560.18359375, + "completions/mean_terminated_length": 560.18359375, + "completions/min_length": 154.0, + "completions/min_terminated_length": 154.0, + "entropy": 0.13532736897468567, + "epoch": 0.9929203539823008, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.33926341804167603, + "learning_rate": 1e-06, + "loss": -0.0064, + "num_tokens": 284477332.0, + "reward": 0.7884765267372131, + "reward_std": 0.3960021138191223, + "rewards/execution_accuracy_EX/mean": 0.77734375, + "rewards/execution_accuracy_EX/std": 0.41684433817863464, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9841941595077515, + "sampling/importance_sampling_ratio/min": 0.006754739210009575, + "sampling/sampling_logp_difference/max": 4.99751091003418, + "sampling/sampling_logp_difference/mean": 0.13882380723953247, + "step": 561 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2215.0, + "completions/max_terminated_length": 2215.0, + "completions/mean_length": 633.6640625, + "completions/mean_terminated_length": 633.6640625, + "completions/min_length": 169.0, + "completions/min_terminated_length": 169.0, + "entropy": 0.13252269942313433, + "epoch": 0.9946902654867257, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.20670189789834498, + "learning_rate": 1e-06, + "loss": 0.0031, + "num_tokens": 284988622.0, + "reward": 0.5435546636581421, + "reward_std": 0.4755672216415405, + "rewards/execution_accuracy_EX/mean": 0.51953125, + "rewards/execution_accuracy_EX/std": 0.5005971193313599, + "rewards/format_reward/mean": 1.0, + "rewards/format_reward/std": 0.0, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.98419189453125, + "sampling/importance_sampling_ratio/min": 0.005253507290035486, + "sampling/sampling_logp_difference/max": 5.248859405517578, + "sampling/sampling_logp_difference/mean": 0.1294565200805664, + "step": 562 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2789.0, + "completions/max_terminated_length": 2789.0, + "completions/mean_length": 704.359375, + "completions/mean_terminated_length": 704.359375, + "completions/min_length": 156.0, + "completions/min_terminated_length": 156.0, + "entropy": 0.15773088298738003, + "epoch": 0.9964601769911504, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21451391274463097, + "learning_rate": 1e-06, + "loss": -0.0256, + "num_tokens": 285461802.0, + "reward": 0.6064453125, + "reward_std": 0.46869391202926636, + "rewards/execution_accuracy_EX/mean": 0.5859375, + "rewards/execution_accuracy_EX/std": 0.4935242533683777, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9833812117576599, + "sampling/importance_sampling_ratio/min": 3.445271431701258e-05, + "sampling/sampling_logp_difference/max": 10.275922775268555, + "sampling/sampling_logp_difference/mean": 0.1499510258436203, + "step": 563 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 2317.0, + "completions/max_terminated_length": 2317.0, + "completions/mean_length": 687.1015625, + "completions/mean_terminated_length": 687.1015625, + "completions/min_length": 165.0, + "completions/min_terminated_length": 165.0, + "entropy": 0.1400073505938053, + "epoch": 0.9982300884955753, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.21422768031719658, + "learning_rate": 1e-06, + "loss": 0.0139, + "num_tokens": 286150612.0, + "reward": 0.5990234613418579, + "reward_std": 0.4699280560016632, + "rewards/execution_accuracy_EX/mean": 0.578125, + "rewards/execution_accuracy_EX/std": 0.49482619762420654, + "rewards/format_reward/mean": 0.99609375, + "rewards/format_reward/std": 0.0625, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9824703931808472, + "sampling/importance_sampling_ratio/min": 0.0008885476854629815, + "sampling/sampling_logp_difference/max": 7.0259222984313965, + "sampling/sampling_logp_difference/mean": 0.13782978057861328, + "step": 564 + }, + { + "clip_ratio/high_max": 0.0, + "clip_ratio/high_mean": 0.0, + "clip_ratio/low_mean": 0.0, + "clip_ratio/low_min": 0.0, + "clip_ratio/region_mean": 0.0, + "completions/clipped_ratio": 0.0, + "completions/max_length": 3693.0, + "completions/max_terminated_length": 3693.0, + "completions/mean_length": 695.51171875, + "completions/mean_terminated_length": 695.51171875, + "completions/min_length": 179.0, + "completions/min_terminated_length": 179.0, + "entropy": 0.1185274813324213, + "epoch": 1.0, + "frac_reward_zero_std": 0.0, + "grad_norm": 0.03277521648262241, + "learning_rate": 1e-06, + "loss": -0.0021, + "num_tokens": 286630103.0, + "reward": 0.8765624761581421, + "reward_std": 0.3139525055885315, + "rewards/execution_accuracy_EX/mean": 0.875, + "rewards/execution_accuracy_EX/std": 0.33136674761772156, + "rewards/format_reward/mean": 0.90625, + "rewards/format_reward/std": 0.2920515835285187, + "sampling/importance_sampling_ratio/max": 2.0, + "sampling/importance_sampling_ratio/mean": 0.9840861558914185, + "sampling/importance_sampling_ratio/min": 0.0001331465900875628, + "sampling/sampling_logp_difference/max": 8.924059867858887, + "sampling/sampling_logp_difference/mean": 0.12314605712890625, + "step": 565 + }, + { + "epoch": 1.0, + "step": 565, + "total_flos": 0.0, + "train_loss": -8.269557677852237e-05, + "train_runtime": 34845.6401, + "train_samples_per_second": 0.26, + "train_steps_per_second": 0.016 + } + ], + "logging_steps": 1, + "max_steps": 565, + "num_input_tokens_seen": 286630103, + "num_train_epochs": 1, + "save_steps": 50, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 0.0, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}