| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.8076923076923077, |
| "eval_steps": 500, |
| "global_step": 252, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0015625, |
| "completions/max_length": 2199.8, |
| "completions/max_terminated_length": 1639.9, |
| "completions/mean_length": 412.37578125, |
| "completions/mean_terminated_length": 406.5489105224609, |
| "completions/min_length": 119.4, |
| "completions/min_terminated_length": 119.4, |
| "entropy": 0.19867916740477085, |
| "epoch": 0.03205128205128205, |
| "frac_reward_zero_std": 0.028125, |
| "grad_norm": 0.002149249856337326, |
| "learning_rate": 1.4285714285714286e-06, |
| "loss": 0.0001, |
| "num_tokens": 2250657.0, |
| "reward": -0.275947505235672, |
| "reward_std": 0.1295723870396614, |
| "rewards/grpo_reward_function/mean": -0.2759475141763687, |
| "rewards/grpo_reward_function/std": 0.424668425321579, |
| "sampling/importance_sampling_ratio/max": 0.27481397837400434, |
| "sampling/importance_sampling_ratio/mean": 0.010644285473972559, |
| "sampling/importance_sampling_ratio/min": 7.943800001636386e-18, |
| "sampling/sampling_logp_difference/max": 1.52898166179657, |
| "sampling/sampling_logp_difference/mean": 0.04394495449960232, |
| "step": 10, |
| "step_time": 108.14200488887727 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00234375, |
| "completions/max_length": 2391.5, |
| "completions/max_terminated_length": 1909.9, |
| "completions/mean_length": 404.9625, |
| "completions/mean_terminated_length": 396.33917541503905, |
| "completions/min_length": 97.9, |
| "completions/min_terminated_length": 97.9, |
| "entropy": 0.19242343455553054, |
| "epoch": 0.0641025641025641, |
| "frac_reward_zero_std": 0.059375, |
| "grad_norm": 0.0011151177348417973, |
| "learning_rate": 3.015873015873016e-06, |
| "loss": -0.0008, |
| "num_tokens": 4484137.0, |
| "reward": -0.24521568268537522, |
| "reward_std": 0.1464571863412857, |
| "rewards/grpo_reward_function/mean": -0.24521567821502685, |
| "rewards/grpo_reward_function/std": 0.4808669984340668, |
| "sampling/importance_sampling_ratio/max": 0.33791280463337897, |
| "sampling/importance_sampling_ratio/mean": 0.015166364191100002, |
| "sampling/importance_sampling_ratio/min": 2.5371388141412726e-20, |
| "sampling/sampling_logp_difference/max": 1.3974398732185365, |
| "sampling/sampling_logp_difference/mean": 0.042889106273651126, |
| "step": 20, |
| "step_time": 109.47327837087214 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.01015625, |
| "completions/max_length": 3805.9, |
| "completions/max_terminated_length": 2067.3, |
| "completions/mean_length": 453.64921875, |
| "completions/mean_terminated_length": 416.3313415527344, |
| "completions/min_length": 110.3, |
| "completions/min_terminated_length": 110.3, |
| "entropy": 0.1974952444434166, |
| "epoch": 0.09615384615384616, |
| "frac_reward_zero_std": 0.034375, |
| "grad_norm": 8.499528216137695e-05, |
| "learning_rate": 4.603174603174604e-06, |
| "loss": -0.0002, |
| "num_tokens": 6759408.0, |
| "reward": -0.2948721319437027, |
| "reward_std": 0.15282833948731422, |
| "rewards/grpo_reward_function/mean": -0.29487212747335434, |
| "rewards/grpo_reward_function/std": 0.47507801949977874, |
| "sampling/importance_sampling_ratio/max": 0.13380551273003222, |
| "sampling/importance_sampling_ratio/mean": 0.005529391323216259, |
| "sampling/importance_sampling_ratio/min": 9.182118521107166e-23, |
| "sampling/sampling_logp_difference/max": 1.5520962238311768, |
| "sampling/sampling_logp_difference/mean": 0.043160675838589665, |
| "step": 30, |
| "step_time": 116.19033445902168 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0015625, |
| "completions/max_length": 2842.5, |
| "completions/max_terminated_length": 2502.3, |
| "completions/mean_length": 414.8203125, |
| "completions/mean_terminated_length": 409.07032470703126, |
| "completions/min_length": 104.5, |
| "completions/min_terminated_length": 104.5, |
| "entropy": 0.19993907809257508, |
| "epoch": 0.1282051282051282, |
| "frac_reward_zero_std": 0.040625, |
| "grad_norm": 0.0006745797790728429, |
| "learning_rate": 6.1904761904761914e-06, |
| "loss": -0.0002, |
| "num_tokens": 9021210.0, |
| "reward": -0.3394616931676865, |
| "reward_std": 0.18811093866825104, |
| "rewards/grpo_reward_function/mean": -0.3394616901874542, |
| "rewards/grpo_reward_function/std": 0.4581590205430984, |
| "sampling/importance_sampling_ratio/max": 0.13625358305871488, |
| "sampling/importance_sampling_ratio/mean": 0.006088873354019597, |
| "sampling/importance_sampling_ratio/min": 7.88066005403204e-16, |
| "sampling/sampling_logp_difference/max": 1.499702548980713, |
| "sampling/sampling_logp_difference/mean": 0.04408838003873825, |
| "step": 40, |
| "step_time": 108.74849968068301 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00546875, |
| "completions/max_length": 3194.0, |
| "completions/max_terminated_length": 2308.3, |
| "completions/mean_length": 425.12578125, |
| "completions/mean_terminated_length": 404.89239196777345, |
| "completions/min_length": 100.8, |
| "completions/min_terminated_length": 100.8, |
| "entropy": 0.1986594047397375, |
| "epoch": 0.16025641025641027, |
| "frac_reward_zero_std": 0.028125, |
| "grad_norm": 0.0010311153051996479, |
| "learning_rate": 7.77777777777778e-06, |
| "loss": -0.0001, |
| "num_tokens": 11280859.0, |
| "reward": -0.2975210875272751, |
| "reward_std": 0.1474170058965683, |
| "rewards/grpo_reward_function/mean": -0.29752107709646225, |
| "rewards/grpo_reward_function/std": 0.4570117324590683, |
| "sampling/importance_sampling_ratio/max": 0.29515470042824743, |
| "sampling/importance_sampling_ratio/mean": 0.0092799658421427, |
| "sampling/importance_sampling_ratio/min": 5.108517233618296e-19, |
| "sampling/sampling_logp_difference/max": 1.4333376169204712, |
| "sampling/sampling_logp_difference/mean": 0.04367562681436539, |
| "step": 50, |
| "step_time": 128.00819000415504 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0046875, |
| "completions/max_length": 3292.6, |
| "completions/max_terminated_length": 2423.1, |
| "completions/mean_length": 442.92421875, |
| "completions/mean_terminated_length": 425.74745788574216, |
| "completions/min_length": 114.5, |
| "completions/min_terminated_length": 114.5, |
| "entropy": 0.19451429434120654, |
| "epoch": 0.19230769230769232, |
| "frac_reward_zero_std": 0.03125, |
| "grad_norm": 0.00034187396484116245, |
| "learning_rate": 9.365079365079366e-06, |
| "loss": 0.0005, |
| "num_tokens": 13606514.0, |
| "reward": -0.291646933555603, |
| "reward_std": 0.18595448434352874, |
| "rewards/grpo_reward_function/mean": -0.2916469365358353, |
| "rewards/grpo_reward_function/std": 0.500500214099884, |
| "sampling/importance_sampling_ratio/max": 0.3140833295881748, |
| "sampling/importance_sampling_ratio/mean": 0.008379129139939323, |
| "sampling/importance_sampling_ratio/min": 1.013470538854885e-16, |
| "sampling/sampling_logp_difference/max": 1.369999098777771, |
| "sampling/sampling_logp_difference/mean": 0.04265468046069145, |
| "step": 60, |
| "step_time": 124.51363873668015 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00625, |
| "completions/max_length": 3630.2, |
| "completions/max_terminated_length": 2578.8, |
| "completions/mean_length": 446.13671875, |
| "completions/mean_terminated_length": 423.1569488525391, |
| "completions/min_length": 114.2, |
| "completions/min_terminated_length": 114.2, |
| "entropy": 0.186833830550313, |
| "epoch": 0.22435897435897437, |
| "frac_reward_zero_std": 0.05625, |
| "grad_norm": 0.0008922675305969227, |
| "learning_rate": 9.985680226398261e-06, |
| "loss": 0.0005, |
| "num_tokens": 15864709.0, |
| "reward": -0.23017083778977393, |
| "reward_std": 0.17412283048033714, |
| "rewards/grpo_reward_function/mean": -0.23017083778977393, |
| "rewards/grpo_reward_function/std": 0.5827802836894989, |
| "sampling/importance_sampling_ratio/max": 0.3794835552573204, |
| "sampling/importance_sampling_ratio/mean": 0.01778233496006578, |
| "sampling/importance_sampling_ratio/min": 3.8213540481902065e-23, |
| "sampling/sampling_logp_difference/max": 1.4418152451515198, |
| "sampling/sampling_logp_difference/mean": 0.04067011401057243, |
| "step": 70, |
| "step_time": 133.25019105784594 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00703125, |
| "completions/max_length": 2734.7, |
| "completions/max_terminated_length": 2040.6, |
| "completions/mean_length": 462.809375, |
| "completions/mean_terminated_length": 437.35008850097654, |
| "completions/min_length": 132.1, |
| "completions/min_terminated_length": 132.1, |
| "entropy": 0.19860588386654854, |
| "epoch": 0.2564102564102564, |
| "frac_reward_zero_std": 0.046875, |
| "grad_norm": 0.00016373889128419196, |
| "learning_rate": 9.89846735808731e-06, |
| "loss": -0.0002, |
| "num_tokens": 18216537.0, |
| "reward": -0.2901266008615494, |
| "reward_std": 0.17708385214209557, |
| "rewards/grpo_reward_function/mean": -0.2901266127824783, |
| "rewards/grpo_reward_function/std": 0.47634183168411254, |
| "sampling/importance_sampling_ratio/max": 0.21812310051172973, |
| "sampling/importance_sampling_ratio/mean": 0.008993787248618901, |
| "sampling/importance_sampling_ratio/min": 1.4824077907487356e-19, |
| "sampling/sampling_logp_difference/max": 1.4244856238365173, |
| "sampling/sampling_logp_difference/mean": 0.043855397030711174, |
| "step": 80, |
| "step_time": 124.79502142816781 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00625, |
| "completions/max_length": 3060.3, |
| "completions/max_terminated_length": 2223.8, |
| "completions/mean_length": 443.69140625, |
| "completions/mean_terminated_length": 420.85289306640624, |
| "completions/min_length": 99.7, |
| "completions/min_terminated_length": 99.7, |
| "entropy": 0.19390448927879333, |
| "epoch": 0.28846153846153844, |
| "frac_reward_zero_std": 0.034375, |
| "grad_norm": 0.0008450721128363838, |
| "learning_rate": 9.733381816303395e-06, |
| "loss": 0.0, |
| "num_tokens": 20505158.0, |
| "reward": -0.3168130427598953, |
| "reward_std": 0.17042916193604468, |
| "rewards/grpo_reward_function/mean": -0.3168130397796631, |
| "rewards/grpo_reward_function/std": 0.44108698666095736, |
| "sampling/importance_sampling_ratio/max": 0.15219400450587273, |
| "sampling/importance_sampling_ratio/mean": 0.006058020202908665, |
| "sampling/importance_sampling_ratio/min": 1.8109796575372013e-16, |
| "sampling/sampling_logp_difference/max": 1.789087176322937, |
| "sampling/sampling_logp_difference/mean": 0.04262695387005806, |
| "step": 90, |
| "step_time": 121.23764831908048 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.009375, |
| "completions/max_length": 3030.9, |
| "completions/max_terminated_length": 2092.8, |
| "completions/mean_length": 446.86796875, |
| "completions/mean_terminated_length": 412.3775573730469, |
| "completions/min_length": 129.0, |
| "completions/min_terminated_length": 129.0, |
| "entropy": 0.2053853865712881, |
| "epoch": 0.32051282051282054, |
| "frac_reward_zero_std": 0.0375, |
| "grad_norm": 0.0017788003099368768, |
| "learning_rate": 9.493048024473413e-06, |
| "loss": 0.0, |
| "num_tokens": 22735201.0, |
| "reward": -0.2926711246371269, |
| "reward_std": 0.13454800248146057, |
| "rewards/grpo_reward_function/mean": -0.29267111867666246, |
| "rewards/grpo_reward_function/std": 0.4685827106237411, |
| "sampling/importance_sampling_ratio/max": 0.3497620947659016, |
| "sampling/importance_sampling_ratio/mean": 0.010224268934689463, |
| "sampling/importance_sampling_ratio/min": 1.3665770084452744e-22, |
| "sampling/sampling_logp_difference/max": 1.469244658946991, |
| "sampling/sampling_logp_difference/mean": 0.045334973558783534, |
| "step": 100, |
| "step_time": 121.1782845430076 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0109375, |
| "completions/max_length": 3420.3, |
| "completions/max_terminated_length": 2431.0, |
| "completions/mean_length": 500.421875, |
| "completions/mean_terminated_length": 460.7878082275391, |
| "completions/min_length": 122.6, |
| "completions/min_terminated_length": 122.6, |
| "entropy": 0.20378807857632636, |
| "epoch": 0.3525641025641026, |
| "frac_reward_zero_std": 0.025, |
| "grad_norm": 1.6932425828206626e-05, |
| "learning_rate": 9.18128665415186e-06, |
| "loss": -0.0006, |
| "num_tokens": 25112061.0, |
| "reward": -0.29770597368478774, |
| "reward_std": 0.17742747887969018, |
| "rewards/grpo_reward_function/mean": -0.29770597964525225, |
| "rewards/grpo_reward_function/std": 0.4508321911096573, |
| "sampling/importance_sampling_ratio/max": 0.32459819614887236, |
| "sampling/importance_sampling_ratio/mean": 0.009665063163265587, |
| "sampling/importance_sampling_ratio/min": 1.5782984274003976e-25, |
| "sampling/sampling_logp_difference/max": 1.4484178304672242, |
| "sampling/sampling_logp_difference/mean": 0.0441218126565218, |
| "step": 110, |
| "step_time": 140.20600229538977 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.01328125, |
| "completions/max_length": 3741.9, |
| "completions/max_terminated_length": 2664.0, |
| "completions/mean_length": 498.21796875, |
| "completions/mean_terminated_length": 449.88751220703125, |
| "completions/min_length": 126.2, |
| "completions/min_terminated_length": 126.2, |
| "entropy": 0.20669106915593147, |
| "epoch": 0.38461538461538464, |
| "frac_reward_zero_std": 0.053125, |
| "grad_norm": 0.0012622156269559955, |
| "learning_rate": 8.803053886449644e-06, |
| "loss": -0.0001, |
| "num_tokens": 27466064.0, |
| "reward": -0.28830415159463885, |
| "reward_std": 0.1518045909702778, |
| "rewards/grpo_reward_function/mean": -0.28830414414405825, |
| "rewards/grpo_reward_function/std": 0.4679163545370102, |
| "sampling/importance_sampling_ratio/max": 0.3135061163455248, |
| "sampling/importance_sampling_ratio/mean": 0.011271886434406042, |
| "sampling/importance_sampling_ratio/min": 4.960477138108862e-24, |
| "sampling/sampling_logp_difference/max": 1.522000241279602, |
| "sampling/sampling_logp_difference/mean": 0.04500752612948418, |
| "step": 120, |
| "step_time": 139.78347074612975 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0125, |
| "completions/max_length": 3804.1, |
| "completions/max_terminated_length": 2468.0, |
| "completions/mean_length": 492.92265625, |
| "completions/mean_terminated_length": 447.16551513671874, |
| "completions/min_length": 119.7, |
| "completions/min_terminated_length": 119.7, |
| "entropy": 0.20307930894196033, |
| "epoch": 0.4166666666666667, |
| "frac_reward_zero_std": 0.040625, |
| "grad_norm": 0.0014322418381552205, |
| "learning_rate": 8.364362621864595e-06, |
| "loss": 0.0001, |
| "num_tokens": 29859033.0, |
| "reward": -0.30168216228485106, |
| "reward_std": 0.16489436179399491, |
| "rewards/grpo_reward_function/mean": -0.3016821652650833, |
| "rewards/grpo_reward_function/std": 0.4505169212818146, |
| "sampling/importance_sampling_ratio/max": 0.22667273543775082, |
| "sampling/importance_sampling_ratio/mean": 0.0076055907527916135, |
| "sampling/importance_sampling_ratio/min": 1.894551322523181e-19, |
| "sampling/sampling_logp_difference/max": 1.4547302842140197, |
| "sampling/sampling_logp_difference/mean": 0.043515678495168686, |
| "step": 130, |
| "step_time": 133.21774347238244 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00859375, |
| "completions/max_length": 3465.7, |
| "completions/max_terminated_length": 2434.9, |
| "completions/mean_length": 484.92421875, |
| "completions/mean_terminated_length": 453.5516418457031, |
| "completions/min_length": 119.5, |
| "completions/min_terminated_length": 119.5, |
| "entropy": 0.20659017004072666, |
| "epoch": 0.44871794871794873, |
| "frac_reward_zero_std": 0.0375, |
| "grad_norm": 0.0002746249901608735, |
| "learning_rate": 7.872186891068997e-06, |
| "loss": -0.0008, |
| "num_tokens": 32161808.0, |
| "reward": -0.2935184553265572, |
| "reward_std": 0.16844813525676727, |
| "rewards/grpo_reward_function/mean": -0.29351845383644104, |
| "rewards/grpo_reward_function/std": 0.4532062470912933, |
| "sampling/importance_sampling_ratio/max": 0.3149314503185451, |
| "sampling/importance_sampling_ratio/mean": 0.011943908949615435, |
| "sampling/importance_sampling_ratio/min": 3.97985328172928e-23, |
| "sampling/sampling_logp_difference/max": 1.4437684297561646, |
| "sampling/sampling_logp_difference/mean": 0.04485268816351891, |
| "step": 140, |
| "step_time": 142.53533354103564 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.009375, |
| "completions/max_length": 3581.1, |
| "completions/max_terminated_length": 2453.6, |
| "completions/mean_length": 461.7703125, |
| "completions/mean_terminated_length": 427.3534698486328, |
| "completions/min_length": 105.6, |
| "completions/min_terminated_length": 105.6, |
| "entropy": 0.19936109744012356, |
| "epoch": 0.4807692307692308, |
| "frac_reward_zero_std": 0.0375, |
| "grad_norm": 0.00022260655992366576, |
| "learning_rate": 7.3343509862697295e-06, |
| "loss": -0.0005, |
| "num_tokens": 34443498.0, |
| "reward": -0.2657527238130569, |
| "reward_std": 0.15306396633386612, |
| "rewards/grpo_reward_function/mean": -0.2657527312636375, |
| "rewards/grpo_reward_function/std": 0.4356934979557991, |
| "sampling/importance_sampling_ratio/max": 0.3762046877294779, |
| "sampling/importance_sampling_ratio/mean": 0.012305672373622656, |
| "sampling/importance_sampling_ratio/min": 5.441343613780519e-26, |
| "sampling/sampling_logp_difference/max": 1.4880928158760072, |
| "sampling/sampling_logp_difference/mean": 0.04350205473601818, |
| "step": 150, |
| "step_time": 124.75091602019965 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.01171875, |
| "completions/max_length": 3721.0, |
| "completions/max_terminated_length": 2522.3, |
| "completions/mean_length": 473.11796875, |
| "completions/mean_terminated_length": 430.37310180664065, |
| "completions/min_length": 126.0, |
| "completions/min_terminated_length": 126.0, |
| "entropy": 0.19293792247772218, |
| "epoch": 0.5128205128205128, |
| "frac_reward_zero_std": 0.03125, |
| "grad_norm": 0.0002647318948475709, |
| "learning_rate": 6.759405075659165e-06, |
| "loss": -0.0014, |
| "num_tokens": 36770557.0, |
| "reward": -0.2993745982646942, |
| "reward_std": 0.14573606997728347, |
| "rewards/grpo_reward_function/mean": -0.2993745982646942, |
| "rewards/grpo_reward_function/std": 0.4381078839302063, |
| "sampling/importance_sampling_ratio/max": 0.24595369808375836, |
| "sampling/importance_sampling_ratio/mean": 0.00787243063095957, |
| "sampling/importance_sampling_ratio/min": 6.3163868467048435e-22, |
| "sampling/sampling_logp_difference/max": 1.4162023305892943, |
| "sampling/sampling_logp_difference/mean": 0.042249183356761935, |
| "step": 160, |
| "step_time": 125.13825652077794 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0109375, |
| "completions/max_length": 3383.8, |
| "completions/max_terminated_length": 2437.6, |
| "completions/mean_length": 493.4984375, |
| "completions/mean_terminated_length": 453.9416534423828, |
| "completions/min_length": 125.1, |
| "completions/min_terminated_length": 125.1, |
| "entropy": 0.18732221610844135, |
| "epoch": 0.5448717948717948, |
| "frac_reward_zero_std": 0.053125, |
| "grad_norm": 4.303013523235627e-05, |
| "learning_rate": 6.156489278357967e-06, |
| "loss": 0.0003, |
| "num_tokens": 39161855.0, |
| "reward": -0.2639135167002678, |
| "reward_std": 0.14633900821208953, |
| "rewards/grpo_reward_function/mean": -0.2639135167002678, |
| "rewards/grpo_reward_function/std": 0.4198122411966324, |
| "sampling/importance_sampling_ratio/max": 0.31537908464670183, |
| "sampling/importance_sampling_ratio/mean": 0.016544956981670113, |
| "sampling/importance_sampling_ratio/min": 3.4869657448903444e-20, |
| "sampling/sampling_logp_difference/max": 1.4604986667633058, |
| "sampling/sampling_logp_difference/mean": 0.04075679816305637, |
| "step": 170, |
| "step_time": 129.28650456257165 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0109375, |
| "completions/max_length": 3635.7, |
| "completions/max_terminated_length": 2598.9, |
| "completions/mean_length": 502.08984375, |
| "completions/mean_terminated_length": 462.4349060058594, |
| "completions/min_length": 115.7, |
| "completions/min_terminated_length": 115.7, |
| "entropy": 0.20257815159857273, |
| "epoch": 0.5769230769230769, |
| "frac_reward_zero_std": 0.034375, |
| "grad_norm": 0.00110081427350525, |
| "learning_rate": 5.535188360698687e-06, |
| "loss": 0.0001, |
| "num_tokens": 41584806.0, |
| "reward": -0.2823033004999161, |
| "reward_std": 0.15869370326399804, |
| "rewards/grpo_reward_function/mean": -0.28230329751968386, |
| "rewards/grpo_reward_function/std": 0.42252050116658213, |
| "sampling/importance_sampling_ratio/max": 0.33239962328225375, |
| "sampling/importance_sampling_ratio/mean": 0.01243584465701133, |
| "sampling/importance_sampling_ratio/min": 4.5339507161993353e-20, |
| "sampling/sampling_logp_difference/max": 1.5050785779953002, |
| "sampling/sampling_logp_difference/mean": 0.04326198622584343, |
| "step": 180, |
| "step_time": 138.22612802386283 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0125, |
| "completions/max_length": 3394.0, |
| "completions/max_terminated_length": 2141.9, |
| "completions/mean_length": 499.40859375, |
| "completions/mean_terminated_length": 454.2876892089844, |
| "completions/min_length": 130.1, |
| "completions/min_terminated_length": 130.1, |
| "entropy": 0.20578568913042544, |
| "epoch": 0.6089743589743589, |
| "frac_reward_zero_std": 0.04375, |
| "grad_norm": 0.00043071462756502963, |
| "learning_rate": 4.905379363794907e-06, |
| "loss": -0.0004, |
| "num_tokens": 43902245.0, |
| "reward": -0.31633972823619844, |
| "reward_std": 0.15992135927081108, |
| "rewards/grpo_reward_function/mean": -0.31633972078561784, |
| "rewards/grpo_reward_function/std": 0.39677430093288424, |
| "sampling/importance_sampling_ratio/max": 0.27164736688137053, |
| "sampling/importance_sampling_ratio/mean": 0.011083718878217042, |
| "sampling/importance_sampling_ratio/min": 1.0615744153343375e-25, |
| "sampling/sampling_logp_difference/max": 1.5213817000389098, |
| "sampling/sampling_logp_difference/mean": 0.04432792365550995, |
| "step": 190, |
| "step_time": 128.48162812702358 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0078125, |
| "completions/max_length": 3664.2, |
| "completions/max_terminated_length": 2662.7, |
| "completions/mean_length": 472.95703125, |
| "completions/mean_terminated_length": 444.4935272216797, |
| "completions/min_length": 123.6, |
| "completions/min_terminated_length": 123.6, |
| "entropy": 0.2034649882465601, |
| "epoch": 0.6410256410256411, |
| "frac_reward_zero_std": 0.04375, |
| "grad_norm": 0.0005240383166085926, |
| "learning_rate": 4.277074584714447e-06, |
| "loss": -0.0011, |
| "num_tokens": 46234190.0, |
| "reward": -0.3208511009812355, |
| "reward_std": 0.15701716020703316, |
| "rewards/grpo_reward_function/mean": -0.3208511024713516, |
| "rewards/grpo_reward_function/std": 0.45050418078899385, |
| "sampling/importance_sampling_ratio/max": 0.328475890122354, |
| "sampling/importance_sampling_ratio/mean": 0.00983516314299777, |
| "sampling/importance_sampling_ratio/min": 5.287639308021113e-21, |
| "sampling/sampling_logp_difference/max": 1.5113076210021972, |
| "sampling/sampling_logp_difference/mean": 0.04435745738446713, |
| "step": 200, |
| "step_time": 129.30136452838778 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0109375, |
| "completions/max_length": 3303.2, |
| "completions/max_terminated_length": 2150.1, |
| "completions/mean_length": 471.20078125, |
| "completions/mean_terminated_length": 431.7250091552734, |
| "completions/min_length": 108.1, |
| "completions/min_terminated_length": 108.1, |
| "entropy": 0.2041917782276869, |
| "epoch": 0.6730769230769231, |
| "frac_reward_zero_std": 0.04375, |
| "grad_norm": 0.00012037056894249982, |
| "learning_rate": 3.6602624074407354e-06, |
| "loss": -0.0001, |
| "num_tokens": 48535751.0, |
| "reward": -0.30149365663528443, |
| "reward_std": 0.1691875860095024, |
| "rewards/grpo_reward_function/mean": -0.3014936536550522, |
| "rewards/grpo_reward_function/std": 0.47970321476459504, |
| "sampling/importance_sampling_ratio/max": 0.2858625270426273, |
| "sampling/importance_sampling_ratio/mean": 0.009694790339563043, |
| "sampling/importance_sampling_ratio/min": 2.2194460385427903e-23, |
| "sampling/sampling_logp_difference/max": 1.5242174863815308, |
| "sampling/sampling_logp_difference/mean": 0.04446354694664478, |
| "step": 210, |
| "step_time": 133.9577619012445 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0125, |
| "completions/max_length": 3120.7, |
| "completions/max_terminated_length": 2390.0, |
| "completions/mean_length": 491.26171875, |
| "completions/mean_terminated_length": 445.72156982421876, |
| "completions/min_length": 127.1, |
| "completions/min_terminated_length": 127.1, |
| "entropy": 0.19974812418222426, |
| "epoch": 0.7051282051282052, |
| "frac_reward_zero_std": 0.0375, |
| "grad_norm": 0.00018642290477094414, |
| "learning_rate": 3.0647485139889145e-06, |
| "loss": 0.0001, |
| "num_tokens": 50902950.0, |
| "reward": -0.2944794222712517, |
| "reward_std": 0.1628885895013809, |
| "rewards/grpo_reward_function/mean": -0.2944794237613678, |
| "rewards/grpo_reward_function/std": 0.4895193099975586, |
| "sampling/importance_sampling_ratio/max": 0.240592747554183, |
| "sampling/importance_sampling_ratio/mean": 0.008309419581200928, |
| "sampling/importance_sampling_ratio/min": 1.3308400815697736e-22, |
| "sampling/sampling_logp_difference/max": 1.5919574618339538, |
| "sampling/sampling_logp_difference/mean": 0.04345178902149201, |
| "step": 220, |
| "step_time": 126.08353825174272 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.01171875, |
| "completions/max_length": 3751.5, |
| "completions/max_terminated_length": 2523.9, |
| "completions/mean_length": 474.57265625, |
| "completions/mean_terminated_length": 431.8814666748047, |
| "completions/min_length": 114.4, |
| "completions/min_terminated_length": 114.4, |
| "entropy": 0.1985881496220827, |
| "epoch": 0.7371794871794872, |
| "frac_reward_zero_std": 0.0375, |
| "grad_norm": 0.0026658825070435413, |
| "learning_rate": 2.5000000000000015e-06, |
| "loss": -0.0012, |
| "num_tokens": 53179911.0, |
| "reward": -0.2606832191348076, |
| "reward_std": 0.14023807421326637, |
| "rewards/grpo_reward_function/mean": -0.2606832206249237, |
| "rewards/grpo_reward_function/std": 0.44046649634838103, |
| "sampling/importance_sampling_ratio/max": 0.4130809709429741, |
| "sampling/importance_sampling_ratio/mean": 0.01407341011799872, |
| "sampling/importance_sampling_ratio/min": 5.8113626282032196e-24, |
| "sampling/sampling_logp_difference/max": 1.5074282884597778, |
| "sampling/sampling_logp_difference/mean": 0.043073756620287895, |
| "step": 230, |
| "step_time": 135.25556658878924 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0078125, |
| "completions/max_length": 3329.5, |
| "completions/max_terminated_length": 1825.4, |
| "completions/mean_length": 442.0640625, |
| "completions/mean_terminated_length": 413.4823333740234, |
| "completions/min_length": 124.7, |
| "completions/min_terminated_length": 124.7, |
| "entropy": 0.1891279250383377, |
| "epoch": 0.7692307692307693, |
| "frac_reward_zero_std": 0.05, |
| "grad_norm": 0.000754428644898323, |
| "learning_rate": 1.9749948729627188e-06, |
| "loss": -0.0004, |
| "num_tokens": 55444105.0, |
| "reward": -0.25499732717871665, |
| "reward_std": 0.15963515415787696, |
| "rewards/grpo_reward_function/mean": -0.2549973249435425, |
| "rewards/grpo_reward_function/std": 0.45704702734947206, |
| "sampling/importance_sampling_ratio/max": 0.7155711248517036, |
| "sampling/importance_sampling_ratio/mean": 0.023246456822380423, |
| "sampling/importance_sampling_ratio/min": 3.208456082463682e-19, |
| "sampling/sampling_logp_difference/max": 1.402456557750702, |
| "sampling/sampling_logp_difference/mean": 0.041706265136599543, |
| "step": 240, |
| "step_time": 131.66669817045332 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.00859375, |
| "completions/max_length": 3810.7, |
| "completions/max_terminated_length": 2189.7, |
| "completions/mean_length": 461.63203125, |
| "completions/mean_terminated_length": 430.09210510253905, |
| "completions/min_length": 128.3, |
| "completions/min_terminated_length": 128.3, |
| "entropy": 0.2066630445420742, |
| "epoch": 0.8012820512820513, |
| "frac_reward_zero_std": 0.0375, |
| "grad_norm": 0.00013277689397189554, |
| "learning_rate": 1.4980793256432474e-06, |
| "loss": 0.0007, |
| "num_tokens": 57751202.0, |
| "reward": -0.2818883016705513, |
| "reward_std": 0.13387203291058541, |
| "rewards/grpo_reward_function/mean": -0.28188829869031906, |
| "rewards/grpo_reward_function/std": 0.4075877174735069, |
| "sampling/importance_sampling_ratio/max": 0.4012782000005245, |
| "sampling/importance_sampling_ratio/mean": 0.014056763611733913, |
| "sampling/importance_sampling_ratio/min": 3.294229030138832e-22, |
| "sampling/sampling_logp_difference/max": 1.4437347650527954, |
| "sampling/sampling_logp_difference/mean": 0.04458657465875149, |
| "step": 250, |
| "step_time": 136.0185191631317 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 312, |
| "num_input_tokens_seen": 58200744, |
| "num_train_epochs": 1, |
| "save_steps": 63, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|