{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8076923076923077, "eval_steps": 500, "global_step": 252, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 2199.8, "completions/max_terminated_length": 1639.9, "completions/mean_length": 412.37578125, "completions/mean_terminated_length": 406.5489105224609, "completions/min_length": 119.4, "completions/min_terminated_length": 119.4, "entropy": 0.19867916740477085, "epoch": 0.03205128205128205, "frac_reward_zero_std": 0.028125, "grad_norm": 0.002149249856337326, "learning_rate": 1.4285714285714286e-06, "loss": 0.0001, "num_tokens": 2250657.0, "reward": -0.275947505235672, "reward_std": 0.1295723870396614, "rewards/grpo_reward_function/mean": -0.2759475141763687, "rewards/grpo_reward_function/std": 0.424668425321579, "sampling/importance_sampling_ratio/max": 0.27481397837400434, "sampling/importance_sampling_ratio/mean": 0.010644285473972559, "sampling/importance_sampling_ratio/min": 7.943800001636386e-18, "sampling/sampling_logp_difference/max": 1.52898166179657, "sampling/sampling_logp_difference/mean": 0.04394495449960232, "step": 10, "step_time": 108.14200488887727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00234375, "completions/max_length": 2391.5, "completions/max_terminated_length": 1909.9, "completions/mean_length": 404.9625, "completions/mean_terminated_length": 396.33917541503905, "completions/min_length": 97.9, "completions/min_terminated_length": 97.9, "entropy": 0.19242343455553054, "epoch": 0.0641025641025641, "frac_reward_zero_std": 0.059375, "grad_norm": 0.0011151177348417973, "learning_rate": 3.015873015873016e-06, "loss": -0.0008, "num_tokens": 4484137.0, "reward": -0.24521568268537522, "reward_std": 0.1464571863412857, "rewards/grpo_reward_function/mean": -0.24521567821502685, "rewards/grpo_reward_function/std": 0.4808669984340668, "sampling/importance_sampling_ratio/max": 0.33791280463337897, "sampling/importance_sampling_ratio/mean": 0.015166364191100002, "sampling/importance_sampling_ratio/min": 2.5371388141412726e-20, "sampling/sampling_logp_difference/max": 1.3974398732185365, "sampling/sampling_logp_difference/mean": 0.042889106273651126, "step": 20, "step_time": 109.47327837087214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01015625, "completions/max_length": 3805.9, "completions/max_terminated_length": 2067.3, "completions/mean_length": 453.64921875, "completions/mean_terminated_length": 416.3313415527344, "completions/min_length": 110.3, "completions/min_terminated_length": 110.3, "entropy": 0.1974952444434166, "epoch": 0.09615384615384616, "frac_reward_zero_std": 0.034375, "grad_norm": 8.499528216137695e-05, "learning_rate": 4.603174603174604e-06, "loss": -0.0002, "num_tokens": 6759408.0, "reward": -0.2948721319437027, "reward_std": 0.15282833948731422, "rewards/grpo_reward_function/mean": -0.29487212747335434, "rewards/grpo_reward_function/std": 0.47507801949977874, "sampling/importance_sampling_ratio/max": 0.13380551273003222, "sampling/importance_sampling_ratio/mean": 0.005529391323216259, "sampling/importance_sampling_ratio/min": 9.182118521107166e-23, "sampling/sampling_logp_difference/max": 1.5520962238311768, "sampling/sampling_logp_difference/mean": 0.043160675838589665, "step": 30, "step_time": 116.19033445902168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0015625, "completions/max_length": 2842.5, "completions/max_terminated_length": 2502.3, "completions/mean_length": 414.8203125, "completions/mean_terminated_length": 409.07032470703126, "completions/min_length": 104.5, "completions/min_terminated_length": 104.5, "entropy": 0.19993907809257508, "epoch": 0.1282051282051282, "frac_reward_zero_std": 0.040625, "grad_norm": 0.0006745797790728429, "learning_rate": 6.1904761904761914e-06, "loss": -0.0002, "num_tokens": 9021210.0, "reward": -0.3394616931676865, "reward_std": 0.18811093866825104, "rewards/grpo_reward_function/mean": -0.3394616901874542, "rewards/grpo_reward_function/std": 0.4581590205430984, "sampling/importance_sampling_ratio/max": 0.13625358305871488, "sampling/importance_sampling_ratio/mean": 0.006088873354019597, "sampling/importance_sampling_ratio/min": 7.88066005403204e-16, "sampling/sampling_logp_difference/max": 1.499702548980713, "sampling/sampling_logp_difference/mean": 0.04408838003873825, "step": 40, "step_time": 108.74849968068301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00546875, "completions/max_length": 3194.0, "completions/max_terminated_length": 2308.3, "completions/mean_length": 425.12578125, "completions/mean_terminated_length": 404.89239196777345, "completions/min_length": 100.8, "completions/min_terminated_length": 100.8, "entropy": 0.1986594047397375, "epoch": 0.16025641025641027, "frac_reward_zero_std": 0.028125, "grad_norm": 0.0010311153051996479, "learning_rate": 7.77777777777778e-06, "loss": -0.0001, "num_tokens": 11280859.0, "reward": -0.2975210875272751, "reward_std": 0.1474170058965683, "rewards/grpo_reward_function/mean": -0.29752107709646225, "rewards/grpo_reward_function/std": 0.4570117324590683, "sampling/importance_sampling_ratio/max": 0.29515470042824743, "sampling/importance_sampling_ratio/mean": 0.0092799658421427, "sampling/importance_sampling_ratio/min": 5.108517233618296e-19, "sampling/sampling_logp_difference/max": 1.4333376169204712, "sampling/sampling_logp_difference/mean": 0.04367562681436539, "step": 50, "step_time": 128.00819000415504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0046875, "completions/max_length": 3292.6, "completions/max_terminated_length": 2423.1, "completions/mean_length": 442.92421875, "completions/mean_terminated_length": 425.74745788574216, "completions/min_length": 114.5, "completions/min_terminated_length": 114.5, "entropy": 0.19451429434120654, "epoch": 0.19230769230769232, "frac_reward_zero_std": 0.03125, "grad_norm": 0.00034187396484116245, "learning_rate": 9.365079365079366e-06, "loss": 0.0005, "num_tokens": 13606514.0, "reward": -0.291646933555603, "reward_std": 0.18595448434352874, "rewards/grpo_reward_function/mean": -0.2916469365358353, "rewards/grpo_reward_function/std": 0.500500214099884, "sampling/importance_sampling_ratio/max": 0.3140833295881748, "sampling/importance_sampling_ratio/mean": 0.008379129139939323, "sampling/importance_sampling_ratio/min": 1.013470538854885e-16, "sampling/sampling_logp_difference/max": 1.369999098777771, "sampling/sampling_logp_difference/mean": 0.04265468046069145, "step": 60, "step_time": 124.51363873668015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 3630.2, "completions/max_terminated_length": 2578.8, "completions/mean_length": 446.13671875, "completions/mean_terminated_length": 423.1569488525391, "completions/min_length": 114.2, "completions/min_terminated_length": 114.2, "entropy": 0.186833830550313, "epoch": 0.22435897435897437, "frac_reward_zero_std": 0.05625, "grad_norm": 0.0008922675305969227, "learning_rate": 9.985680226398261e-06, "loss": 0.0005, "num_tokens": 15864709.0, "reward": -0.23017083778977393, "reward_std": 0.17412283048033714, "rewards/grpo_reward_function/mean": -0.23017083778977393, "rewards/grpo_reward_function/std": 0.5827802836894989, "sampling/importance_sampling_ratio/max": 0.3794835552573204, "sampling/importance_sampling_ratio/mean": 0.01778233496006578, "sampling/importance_sampling_ratio/min": 3.8213540481902065e-23, "sampling/sampling_logp_difference/max": 1.4418152451515198, "sampling/sampling_logp_difference/mean": 0.04067011401057243, "step": 70, "step_time": 133.25019105784594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00703125, "completions/max_length": 2734.7, "completions/max_terminated_length": 2040.6, "completions/mean_length": 462.809375, "completions/mean_terminated_length": 437.35008850097654, "completions/min_length": 132.1, "completions/min_terminated_length": 132.1, "entropy": 0.19860588386654854, "epoch": 0.2564102564102564, "frac_reward_zero_std": 0.046875, "grad_norm": 0.00016373889128419196, "learning_rate": 9.89846735808731e-06, "loss": -0.0002, "num_tokens": 18216537.0, "reward": -0.2901266008615494, "reward_std": 0.17708385214209557, "rewards/grpo_reward_function/mean": -0.2901266127824783, "rewards/grpo_reward_function/std": 0.47634183168411254, "sampling/importance_sampling_ratio/max": 0.21812310051172973, "sampling/importance_sampling_ratio/mean": 0.008993787248618901, "sampling/importance_sampling_ratio/min": 1.4824077907487356e-19, "sampling/sampling_logp_difference/max": 1.4244856238365173, "sampling/sampling_logp_difference/mean": 0.043855397030711174, "step": 80, "step_time": 124.79502142816781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00625, "completions/max_length": 3060.3, "completions/max_terminated_length": 2223.8, "completions/mean_length": 443.69140625, "completions/mean_terminated_length": 420.85289306640624, "completions/min_length": 99.7, "completions/min_terminated_length": 99.7, "entropy": 0.19390448927879333, "epoch": 0.28846153846153844, "frac_reward_zero_std": 0.034375, "grad_norm": 0.0008450721128363838, "learning_rate": 9.733381816303395e-06, "loss": 0.0, "num_tokens": 20505158.0, "reward": -0.3168130427598953, "reward_std": 0.17042916193604468, "rewards/grpo_reward_function/mean": -0.3168130397796631, "rewards/grpo_reward_function/std": 0.44108698666095736, "sampling/importance_sampling_ratio/max": 0.15219400450587273, "sampling/importance_sampling_ratio/mean": 0.006058020202908665, "sampling/importance_sampling_ratio/min": 1.8109796575372013e-16, "sampling/sampling_logp_difference/max": 1.789087176322937, "sampling/sampling_logp_difference/mean": 0.04262695387005806, "step": 90, "step_time": 121.23764831908048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 3030.9, "completions/max_terminated_length": 2092.8, "completions/mean_length": 446.86796875, "completions/mean_terminated_length": 412.3775573730469, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "entropy": 0.2053853865712881, "epoch": 0.32051282051282054, "frac_reward_zero_std": 0.0375, "grad_norm": 0.0017788003099368768, "learning_rate": 9.493048024473413e-06, "loss": 0.0, "num_tokens": 22735201.0, "reward": -0.2926711246371269, "reward_std": 0.13454800248146057, "rewards/grpo_reward_function/mean": -0.29267111867666246, "rewards/grpo_reward_function/std": 0.4685827106237411, "sampling/importance_sampling_ratio/max": 0.3497620947659016, "sampling/importance_sampling_ratio/mean": 0.010224268934689463, "sampling/importance_sampling_ratio/min": 1.3665770084452744e-22, "sampling/sampling_logp_difference/max": 1.469244658946991, "sampling/sampling_logp_difference/mean": 0.045334973558783534, "step": 100, "step_time": 121.1782845430076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0109375, "completions/max_length": 3420.3, "completions/max_terminated_length": 2431.0, "completions/mean_length": 500.421875, "completions/mean_terminated_length": 460.7878082275391, "completions/min_length": 122.6, "completions/min_terminated_length": 122.6, "entropy": 0.20378807857632636, "epoch": 0.3525641025641026, "frac_reward_zero_std": 0.025, "grad_norm": 1.6932425828206626e-05, "learning_rate": 9.18128665415186e-06, "loss": -0.0006, "num_tokens": 25112061.0, "reward": -0.29770597368478774, "reward_std": 0.17742747887969018, "rewards/grpo_reward_function/mean": -0.29770597964525225, "rewards/grpo_reward_function/std": 0.4508321911096573, "sampling/importance_sampling_ratio/max": 0.32459819614887236, "sampling/importance_sampling_ratio/mean": 0.009665063163265587, "sampling/importance_sampling_ratio/min": 1.5782984274003976e-25, "sampling/sampling_logp_difference/max": 1.4484178304672242, "sampling/sampling_logp_difference/mean": 0.0441218126565218, "step": 110, "step_time": 140.20600229538977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01328125, "completions/max_length": 3741.9, "completions/max_terminated_length": 2664.0, "completions/mean_length": 498.21796875, "completions/mean_terminated_length": 449.88751220703125, "completions/min_length": 126.2, "completions/min_terminated_length": 126.2, "entropy": 0.20669106915593147, "epoch": 0.38461538461538464, "frac_reward_zero_std": 0.053125, "grad_norm": 0.0012622156269559955, "learning_rate": 8.803053886449644e-06, "loss": -0.0001, "num_tokens": 27466064.0, "reward": -0.28830415159463885, "reward_std": 0.1518045909702778, "rewards/grpo_reward_function/mean": -0.28830414414405825, "rewards/grpo_reward_function/std": 0.4679163545370102, "sampling/importance_sampling_ratio/max": 0.3135061163455248, "sampling/importance_sampling_ratio/mean": 0.011271886434406042, "sampling/importance_sampling_ratio/min": 4.960477138108862e-24, "sampling/sampling_logp_difference/max": 1.522000241279602, "sampling/sampling_logp_difference/mean": 0.04500752612948418, "step": 120, "step_time": 139.78347074612975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 3804.1, "completions/max_terminated_length": 2468.0, "completions/mean_length": 492.92265625, "completions/mean_terminated_length": 447.16551513671874, "completions/min_length": 119.7, "completions/min_terminated_length": 119.7, "entropy": 0.20307930894196033, "epoch": 0.4166666666666667, "frac_reward_zero_std": 0.040625, "grad_norm": 0.0014322418381552205, "learning_rate": 8.364362621864595e-06, "loss": 0.0001, "num_tokens": 29859033.0, "reward": -0.30168216228485106, "reward_std": 0.16489436179399491, "rewards/grpo_reward_function/mean": -0.3016821652650833, "rewards/grpo_reward_function/std": 0.4505169212818146, "sampling/importance_sampling_ratio/max": 0.22667273543775082, "sampling/importance_sampling_ratio/mean": 0.0076055907527916135, "sampling/importance_sampling_ratio/min": 1.894551322523181e-19, "sampling/sampling_logp_difference/max": 1.4547302842140197, "sampling/sampling_logp_difference/mean": 0.043515678495168686, "step": 130, "step_time": 133.21774347238244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00859375, "completions/max_length": 3465.7, "completions/max_terminated_length": 2434.9, "completions/mean_length": 484.92421875, "completions/mean_terminated_length": 453.5516418457031, "completions/min_length": 119.5, "completions/min_terminated_length": 119.5, "entropy": 0.20659017004072666, "epoch": 0.44871794871794873, "frac_reward_zero_std": 0.0375, "grad_norm": 0.0002746249901608735, "learning_rate": 7.872186891068997e-06, "loss": -0.0008, "num_tokens": 32161808.0, "reward": -0.2935184553265572, "reward_std": 0.16844813525676727, "rewards/grpo_reward_function/mean": -0.29351845383644104, "rewards/grpo_reward_function/std": 0.4532062470912933, "sampling/importance_sampling_ratio/max": 0.3149314503185451, "sampling/importance_sampling_ratio/mean": 0.011943908949615435, "sampling/importance_sampling_ratio/min": 3.97985328172928e-23, "sampling/sampling_logp_difference/max": 1.4437684297561646, "sampling/sampling_logp_difference/mean": 0.04485268816351891, "step": 140, "step_time": 142.53533354103564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.009375, "completions/max_length": 3581.1, "completions/max_terminated_length": 2453.6, "completions/mean_length": 461.7703125, "completions/mean_terminated_length": 427.3534698486328, "completions/min_length": 105.6, "completions/min_terminated_length": 105.6, "entropy": 0.19936109744012356, "epoch": 0.4807692307692308, "frac_reward_zero_std": 0.0375, "grad_norm": 0.00022260655992366576, "learning_rate": 7.3343509862697295e-06, "loss": -0.0005, "num_tokens": 34443498.0, "reward": -0.2657527238130569, "reward_std": 0.15306396633386612, "rewards/grpo_reward_function/mean": -0.2657527312636375, "rewards/grpo_reward_function/std": 0.4356934979557991, "sampling/importance_sampling_ratio/max": 0.3762046877294779, "sampling/importance_sampling_ratio/mean": 0.012305672373622656, "sampling/importance_sampling_ratio/min": 5.441343613780519e-26, "sampling/sampling_logp_difference/max": 1.4880928158760072, "sampling/sampling_logp_difference/mean": 0.04350205473601818, "step": 150, "step_time": 124.75091602019965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3721.0, "completions/max_terminated_length": 2522.3, "completions/mean_length": 473.11796875, "completions/mean_terminated_length": 430.37310180664065, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "entropy": 0.19293792247772218, "epoch": 0.5128205128205128, "frac_reward_zero_std": 0.03125, "grad_norm": 0.0002647318948475709, "learning_rate": 6.759405075659165e-06, "loss": -0.0014, "num_tokens": 36770557.0, "reward": -0.2993745982646942, "reward_std": 0.14573606997728347, "rewards/grpo_reward_function/mean": -0.2993745982646942, "rewards/grpo_reward_function/std": 0.4381078839302063, "sampling/importance_sampling_ratio/max": 0.24595369808375836, "sampling/importance_sampling_ratio/mean": 0.00787243063095957, "sampling/importance_sampling_ratio/min": 6.3163868467048435e-22, "sampling/sampling_logp_difference/max": 1.4162023305892943, "sampling/sampling_logp_difference/mean": 0.042249183356761935, "step": 160, "step_time": 125.13825652077794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0109375, "completions/max_length": 3383.8, "completions/max_terminated_length": 2437.6, "completions/mean_length": 493.4984375, "completions/mean_terminated_length": 453.9416534423828, "completions/min_length": 125.1, "completions/min_terminated_length": 125.1, "entropy": 0.18732221610844135, "epoch": 0.5448717948717948, "frac_reward_zero_std": 0.053125, "grad_norm": 4.303013523235627e-05, "learning_rate": 6.156489278357967e-06, "loss": 0.0003, "num_tokens": 39161855.0, "reward": -0.2639135167002678, "reward_std": 0.14633900821208953, "rewards/grpo_reward_function/mean": -0.2639135167002678, "rewards/grpo_reward_function/std": 0.4198122411966324, "sampling/importance_sampling_ratio/max": 0.31537908464670183, "sampling/importance_sampling_ratio/mean": 0.016544956981670113, "sampling/importance_sampling_ratio/min": 3.4869657448903444e-20, "sampling/sampling_logp_difference/max": 1.4604986667633058, "sampling/sampling_logp_difference/mean": 0.04075679816305637, "step": 170, "step_time": 129.28650456257165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0109375, "completions/max_length": 3635.7, "completions/max_terminated_length": 2598.9, "completions/mean_length": 502.08984375, "completions/mean_terminated_length": 462.4349060058594, "completions/min_length": 115.7, "completions/min_terminated_length": 115.7, "entropy": 0.20257815159857273, "epoch": 0.5769230769230769, "frac_reward_zero_std": 0.034375, "grad_norm": 0.00110081427350525, "learning_rate": 5.535188360698687e-06, "loss": 0.0001, "num_tokens": 41584806.0, "reward": -0.2823033004999161, "reward_std": 0.15869370326399804, "rewards/grpo_reward_function/mean": -0.28230329751968386, "rewards/grpo_reward_function/std": 0.42252050116658213, "sampling/importance_sampling_ratio/max": 0.33239962328225375, "sampling/importance_sampling_ratio/mean": 0.01243584465701133, "sampling/importance_sampling_ratio/min": 4.5339507161993353e-20, "sampling/sampling_logp_difference/max": 1.5050785779953002, "sampling/sampling_logp_difference/mean": 0.04326198622584343, "step": 180, "step_time": 138.22612802386283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 3394.0, "completions/max_terminated_length": 2141.9, "completions/mean_length": 499.40859375, "completions/mean_terminated_length": 454.2876892089844, "completions/min_length": 130.1, "completions/min_terminated_length": 130.1, "entropy": 0.20578568913042544, "epoch": 0.6089743589743589, "frac_reward_zero_std": 0.04375, "grad_norm": 0.00043071462756502963, "learning_rate": 4.905379363794907e-06, "loss": -0.0004, "num_tokens": 43902245.0, "reward": -0.31633972823619844, "reward_std": 0.15992135927081108, "rewards/grpo_reward_function/mean": -0.31633972078561784, "rewards/grpo_reward_function/std": 0.39677430093288424, "sampling/importance_sampling_ratio/max": 0.27164736688137053, "sampling/importance_sampling_ratio/mean": 0.011083718878217042, "sampling/importance_sampling_ratio/min": 1.0615744153343375e-25, "sampling/sampling_logp_difference/max": 1.5213817000389098, "sampling/sampling_logp_difference/mean": 0.04432792365550995, "step": 190, "step_time": 128.48162812702358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3664.2, "completions/max_terminated_length": 2662.7, "completions/mean_length": 472.95703125, "completions/mean_terminated_length": 444.4935272216797, "completions/min_length": 123.6, "completions/min_terminated_length": 123.6, "entropy": 0.2034649882465601, "epoch": 0.6410256410256411, "frac_reward_zero_std": 0.04375, "grad_norm": 0.0005240383166085926, "learning_rate": 4.277074584714447e-06, "loss": -0.0011, "num_tokens": 46234190.0, "reward": -0.3208511009812355, "reward_std": 0.15701716020703316, "rewards/grpo_reward_function/mean": -0.3208511024713516, "rewards/grpo_reward_function/std": 0.45050418078899385, "sampling/importance_sampling_ratio/max": 0.328475890122354, "sampling/importance_sampling_ratio/mean": 0.00983516314299777, "sampling/importance_sampling_ratio/min": 5.287639308021113e-21, "sampling/sampling_logp_difference/max": 1.5113076210021972, "sampling/sampling_logp_difference/mean": 0.04435745738446713, "step": 200, "step_time": 129.30136452838778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0109375, "completions/max_length": 3303.2, "completions/max_terminated_length": 2150.1, "completions/mean_length": 471.20078125, "completions/mean_terminated_length": 431.7250091552734, "completions/min_length": 108.1, "completions/min_terminated_length": 108.1, "entropy": 0.2041917782276869, "epoch": 0.6730769230769231, "frac_reward_zero_std": 0.04375, "grad_norm": 0.00012037056894249982, "learning_rate": 3.6602624074407354e-06, "loss": -0.0001, "num_tokens": 48535751.0, "reward": -0.30149365663528443, "reward_std": 0.1691875860095024, "rewards/grpo_reward_function/mean": -0.3014936536550522, "rewards/grpo_reward_function/std": 0.47970321476459504, "sampling/importance_sampling_ratio/max": 0.2858625270426273, "sampling/importance_sampling_ratio/mean": 0.009694790339563043, "sampling/importance_sampling_ratio/min": 2.2194460385427903e-23, "sampling/sampling_logp_difference/max": 1.5242174863815308, "sampling/sampling_logp_difference/mean": 0.04446354694664478, "step": 210, "step_time": 133.9577619012445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0125, "completions/max_length": 3120.7, "completions/max_terminated_length": 2390.0, "completions/mean_length": 491.26171875, "completions/mean_terminated_length": 445.72156982421876, "completions/min_length": 127.1, "completions/min_terminated_length": 127.1, "entropy": 0.19974812418222426, "epoch": 0.7051282051282052, "frac_reward_zero_std": 0.0375, "grad_norm": 0.00018642290477094414, "learning_rate": 3.0647485139889145e-06, "loss": 0.0001, "num_tokens": 50902950.0, "reward": -0.2944794222712517, "reward_std": 0.1628885895013809, "rewards/grpo_reward_function/mean": -0.2944794237613678, "rewards/grpo_reward_function/std": 0.4895193099975586, "sampling/importance_sampling_ratio/max": 0.240592747554183, "sampling/importance_sampling_ratio/mean": 0.008309419581200928, "sampling/importance_sampling_ratio/min": 1.3308400815697736e-22, "sampling/sampling_logp_difference/max": 1.5919574618339538, "sampling/sampling_logp_difference/mean": 0.04345178902149201, "step": 220, "step_time": 126.08353825174272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01171875, "completions/max_length": 3751.5, "completions/max_terminated_length": 2523.9, "completions/mean_length": 474.57265625, "completions/mean_terminated_length": 431.8814666748047, "completions/min_length": 114.4, "completions/min_terminated_length": 114.4, "entropy": 0.1985881496220827, "epoch": 0.7371794871794872, "frac_reward_zero_std": 0.0375, "grad_norm": 0.0026658825070435413, "learning_rate": 2.5000000000000015e-06, "loss": -0.0012, "num_tokens": 53179911.0, "reward": -0.2606832191348076, "reward_std": 0.14023807421326637, "rewards/grpo_reward_function/mean": -0.2606832206249237, "rewards/grpo_reward_function/std": 0.44046649634838103, "sampling/importance_sampling_ratio/max": 0.4130809709429741, "sampling/importance_sampling_ratio/mean": 0.01407341011799872, "sampling/importance_sampling_ratio/min": 5.8113626282032196e-24, "sampling/sampling_logp_difference/max": 1.5074282884597778, "sampling/sampling_logp_difference/mean": 0.043073756620287895, "step": 230, "step_time": 135.25556658878924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0078125, "completions/max_length": 3329.5, "completions/max_terminated_length": 1825.4, "completions/mean_length": 442.0640625, "completions/mean_terminated_length": 413.4823333740234, "completions/min_length": 124.7, "completions/min_terminated_length": 124.7, "entropy": 0.1891279250383377, "epoch": 0.7692307692307693, "frac_reward_zero_std": 0.05, "grad_norm": 0.000754428644898323, "learning_rate": 1.9749948729627188e-06, "loss": -0.0004, "num_tokens": 55444105.0, "reward": -0.25499732717871665, "reward_std": 0.15963515415787696, "rewards/grpo_reward_function/mean": -0.2549973249435425, "rewards/grpo_reward_function/std": 0.45704702734947206, "sampling/importance_sampling_ratio/max": 0.7155711248517036, "sampling/importance_sampling_ratio/mean": 0.023246456822380423, "sampling/importance_sampling_ratio/min": 3.208456082463682e-19, "sampling/sampling_logp_difference/max": 1.402456557750702, "sampling/sampling_logp_difference/mean": 0.041706265136599543, "step": 240, "step_time": 131.66669817045332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00859375, "completions/max_length": 3810.7, "completions/max_terminated_length": 2189.7, "completions/mean_length": 461.63203125, "completions/mean_terminated_length": 430.09210510253905, "completions/min_length": 128.3, "completions/min_terminated_length": 128.3, "entropy": 0.2066630445420742, "epoch": 0.8012820512820513, "frac_reward_zero_std": 0.0375, "grad_norm": 0.00013277689397189554, "learning_rate": 1.4980793256432474e-06, "loss": 0.0007, "num_tokens": 57751202.0, "reward": -0.2818883016705513, "reward_std": 0.13387203291058541, "rewards/grpo_reward_function/mean": -0.28188829869031906, "rewards/grpo_reward_function/std": 0.4075877174735069, "sampling/importance_sampling_ratio/max": 0.4012782000005245, "sampling/importance_sampling_ratio/mean": 0.014056763611733913, "sampling/importance_sampling_ratio/min": 3.294229030138832e-22, "sampling/sampling_logp_difference/max": 1.4437347650527954, "sampling/sampling_logp_difference/mean": 0.04458657465875149, "step": 250, "step_time": 136.0185191631317 } ], "logging_steps": 10, "max_steps": 312, "num_input_tokens_seen": 58200744, "num_train_epochs": 1, "save_steps": 63, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }