{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.40384615384615385, "eval_steps": 500, "global_step": 126, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2993.0, "completions/mean_length": 842.1671875, "completions/mean_terminated_length": 625.3808288574219, "completions/min_length": 143.7, "completions/min_terminated_length": 143.7, "entropy": 0.18502369821071624, "epoch": 0.03205128205128205, "frac_reward_zero_std": 0.046875, "grad_norm": 0.00025154745623815123, "learning_rate": 1.4285714285714286e-06, "loss": -0.0002, "num_tokens": 2443746.0, "reward": -0.22845993265509607, "reward_std": 0.2702211543917656, "rewards/grpo_reward_function/mean": -0.2284599334001541, "rewards/grpo_reward_function/std": 0.7218511283397675, "sampling/importance_sampling_ratio/max": 0.12355739073827862, "sampling/importance_sampling_ratio/mean": 0.0029692337644519284, "sampling/importance_sampling_ratio/min": 5.253219159649809e-30, "sampling/sampling_logp_difference/max": 1.5544446587562561, "sampling/sampling_logp_difference/mean": 0.03719679862260818, "step": 10, "step_time": 152.91457959786058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2889.4, "completions/mean_length": 863.446875, "completions/mean_terminated_length": 671.0787475585937, "completions/min_length": 164.7, "completions/min_terminated_length": 164.7, "entropy": 0.1817332101985812, "epoch": 0.0641025641025641, "frac_reward_zero_std": 0.05, "grad_norm": 1.9538725158444555e-05, "learning_rate": 3.015873015873016e-06, "loss": -0.0001, "num_tokens": 4916654.0, "reward": -0.18899608142673968, "reward_std": 0.3027258589863777, "rewards/grpo_reward_function/mean": -0.18899608589708805, "rewards/grpo_reward_function/std": 0.7454370498657227, "sampling/importance_sampling_ratio/max": 0.0923810960026458, "sampling/importance_sampling_ratio/mean": 0.0023226177279866535, "sampling/importance_sampling_ratio/min": 6.460355799993298e-36, "sampling/sampling_logp_difference/max": 1.6125647306442261, "sampling/sampling_logp_difference/mean": 0.03694682139903307, "step": 20, "step_time": 159.88802388124168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2620.1, "completions/mean_length": 832.69296875, "completions/mean_terminated_length": 672.2784484863281, "completions/min_length": 167.4, "completions/min_terminated_length": 167.4, "entropy": 0.19116276763379575, "epoch": 0.09615384615384616, "frac_reward_zero_std": 0.03125, "grad_norm": 0.00031796028415218583, "learning_rate": 4.603174603174604e-06, "loss": -0.0005, "num_tokens": 7342561.0, "reward": -0.20435776934027672, "reward_std": 0.2867868050932884, "rewards/grpo_reward_function/mean": -0.20435777083039283, "rewards/grpo_reward_function/std": 0.7296508550643921, "sampling/importance_sampling_ratio/max": 0.06920737095642834, "sampling/importance_sampling_ratio/mean": 0.0011224002329981886, "sampling/importance_sampling_ratio/min": 5.1338851255929605e-34, "sampling/sampling_logp_difference/max": 1.7805282711982726, "sampling/sampling_logp_difference/mean": 0.03969721123576164, "step": 30, "step_time": 153.14357568509877 }, { "clip_ratio/high_max": 5.1770552090602e-06, "clip_ratio/high_mean": 6.47131901132525e-07, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.47131901132525e-07, "completions/clipped_ratio": 0.075, "completions/max_length": 4096.0, "completions/max_terminated_length": 2996.0, "completions/mean_length": 908.88203125, "completions/mean_terminated_length": 650.461572265625, "completions/min_length": 172.5, "completions/min_terminated_length": 172.5, "entropy": 0.18940101601183415, "epoch": 0.1282051282051282, "frac_reward_zero_std": 0.0375, "grad_norm": 0.00046653815964065254, "learning_rate": 6.1904761904761914e-06, "loss": -0.0007, "num_tokens": 9897290.0, "reward": -0.20948485173285009, "reward_std": 0.3274365648627281, "rewards/grpo_reward_function/mean": -0.20948485508561135, "rewards/grpo_reward_function/std": 0.7691417872905731, "sampling/importance_sampling_ratio/max": 0.12555439178831876, "sampling/importance_sampling_ratio/mean": 0.0029983414759044537, "sampling/importance_sampling_ratio/min": 7.312246668686746e-35, "sampling/sampling_logp_difference/max": 1.8727038741111754, "sampling/sampling_logp_difference/mean": 0.03728015031665564, "step": 40, "step_time": 146.35395600683987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0515625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2913.9, "completions/mean_length": 806.88984375, "completions/mean_terminated_length": 627.2792419433594, "completions/min_length": 123.4, "completions/min_terminated_length": 123.4, "entropy": 0.17744111455976963, "epoch": 0.16025641025641027, "frac_reward_zero_std": 0.05625, "grad_norm": 9.61458607191603e-05, "learning_rate": 7.77777777777778e-06, "loss": 0.0003, "num_tokens": 12302889.0, "reward": -0.18009744100272657, "reward_std": 0.28012245148420334, "rewards/grpo_reward_function/mean": -0.18009743914008142, "rewards/grpo_reward_function/std": 0.7488289535045624, "sampling/importance_sampling_ratio/max": 0.17218044362962245, "sampling/importance_sampling_ratio/mean": 0.004992885573301464, "sampling/importance_sampling_ratio/min": 1.0306885153000913e-30, "sampling/sampling_logp_difference/max": 1.6258580565452576, "sampling/sampling_logp_difference/mean": 0.03607845064252615, "step": 50, "step_time": 147.80184614248574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03984375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2814.3, "completions/mean_length": 809.65078125, "completions/mean_terminated_length": 673.7666534423828, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "entropy": 0.18477382734417916, "epoch": 0.19230769230769232, "frac_reward_zero_std": 0.034375, "grad_norm": 0.0005422528630453429, "learning_rate": 9.365079365079366e-06, "loss": -0.0002, "num_tokens": 14762678.0, "reward": -0.18410459933802487, "reward_std": 0.3047805741429329, "rewards/grpo_reward_function/mean": -0.18410460744053125, "rewards/grpo_reward_function/std": 0.7665571451187134, "sampling/importance_sampling_ratio/max": 0.10517687269020826, "sampling/importance_sampling_ratio/mean": 0.0030420748858887236, "sampling/importance_sampling_ratio/min": 3.9674455768173534e-27, "sampling/sampling_logp_difference/max": 1.571552813053131, "sampling/sampling_logp_difference/mean": 0.03778362385928631, "step": 60, "step_time": 144.8840892329812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.059375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2730.5, "completions/mean_length": 846.08828125, "completions/mean_terminated_length": 641.0483703613281, "completions/min_length": 134.2, "completions/min_terminated_length": 134.2, "entropy": 0.18578412756323814, "epoch": 0.22435897435897437, "frac_reward_zero_std": 0.03125, "grad_norm": 0.00012190250477276098, "learning_rate": 9.985680226398261e-06, "loss": -0.0002, "num_tokens": 17196835.0, "reward": -0.23204001784324646, "reward_std": 0.3367327839136124, "rewards/grpo_reward_function/mean": -0.23204001784324646, "rewards/grpo_reward_function/std": 0.7099093854427337, "sampling/importance_sampling_ratio/max": 0.04683403689414263, "sampling/importance_sampling_ratio/mean": 0.0009577752702170983, "sampling/importance_sampling_ratio/min": 1.4199457666368224e-36, "sampling/sampling_logp_difference/max": 1.5709129929542542, "sampling/sampling_logp_difference/mean": 0.037703079357743266, "step": 70, "step_time": 156.71906763464213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0421875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2464.4, "completions/mean_length": 817.75546875, "completions/mean_terminated_length": 674.2539978027344, "completions/min_length": 149.3, "completions/min_terminated_length": 149.3, "entropy": 0.18018180541694165, "epoch": 0.2564102564102564, "frac_reward_zero_std": 0.05, "grad_norm": 5.762501312773685e-05, "learning_rate": 9.89846735808731e-06, "loss": -0.0, "num_tokens": 19646058.0, "reward": -0.1727930422872305, "reward_std": 0.31547281742095945, "rewards/grpo_reward_function/mean": -0.1727930411696434, "rewards/grpo_reward_function/std": 0.7881518006324768, "sampling/importance_sampling_ratio/max": 0.13909520097076894, "sampling/importance_sampling_ratio/mean": 0.004160506054176949, "sampling/importance_sampling_ratio/min": 6.660712543136218e-25, "sampling/sampling_logp_difference/max": 1.5812078952789306, "sampling/sampling_logp_difference/mean": 0.03748476393520832, "step": 80, "step_time": 151.56825386658312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04609375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2902.4, "completions/mean_length": 803.97109375, "completions/mean_terminated_length": 644.9527221679688, "completions/min_length": 159.1, "completions/min_terminated_length": 159.1, "entropy": 0.18126422837376593, "epoch": 0.28846153846153844, "frac_reward_zero_std": 0.071875, "grad_norm": 1.7400093375084389e-06, "learning_rate": 9.733381816303395e-06, "loss": 0.0001, "num_tokens": 22062549.0, "reward": -0.1791908085346222, "reward_std": 0.283918160200119, "rewards/grpo_reward_function/mean": -0.1791908085346222, "rewards/grpo_reward_function/std": 0.7836591958999634, "sampling/importance_sampling_ratio/max": 0.15381892090663313, "sampling/importance_sampling_ratio/mean": 0.004421875764819561, "sampling/importance_sampling_ratio/min": 3.314493692446194e-30, "sampling/sampling_logp_difference/max": 1.59914892911911, "sampling/sampling_logp_difference/mean": 0.03814994990825653, "step": 90, "step_time": 143.37383015677332 }, { "clip_ratio/high_max": 2.9168124456191437e-06, "clip_ratio/high_mean": 3.6460155570239296e-07, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 3.6460155570239296e-07, "completions/clipped_ratio": 0.06015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3118.0, "completions/mean_length": 833.84453125, "completions/mean_terminated_length": 625.6610107421875, "completions/min_length": 146.9, "completions/min_terminated_length": 146.9, "entropy": 0.17976893596351146, "epoch": 0.32051282051282054, "frac_reward_zero_std": 0.053125, "grad_norm": 1.5939340167787748e-05, "learning_rate": 9.493048024473413e-06, "loss": 0.0002, "num_tokens": 24458486.0, "reward": -0.188871893286705, "reward_std": 0.31348400712013247, "rewards/grpo_reward_function/mean": -0.18887189254164696, "rewards/grpo_reward_function/std": 0.7829755127429963, "sampling/importance_sampling_ratio/max": 0.16449878164567053, "sampling/importance_sampling_ratio/mean": 0.005532666263025021, "sampling/importance_sampling_ratio/min": 3.4595329788052715e-35, "sampling/sampling_logp_difference/max": 1.61622656583786, "sampling/sampling_logp_difference/mean": 0.036300059966742994, "step": 100, "step_time": 150.7187235403806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.05, "completions/max_length": 4096.0, "completions/max_terminated_length": 2412.8, "completions/mean_length": 794.35859375, "completions/mean_terminated_length": 620.5320068359375, "completions/min_length": 128.8, "completions/min_terminated_length": 128.8, "entropy": 0.1772497620433569, "epoch": 0.3525641025641026, "frac_reward_zero_std": 0.046875, "grad_norm": 4.817197293953718e-06, "learning_rate": 9.18128665415186e-06, "loss": -0.0001, "num_tokens": 26866049.0, "reward": -0.22932868972420692, "reward_std": 0.31401748955249786, "rewards/grpo_reward_function/mean": -0.22932869493961333, "rewards/grpo_reward_function/std": 0.7706308960914612, "sampling/importance_sampling_ratio/max": 0.1050915487576276, "sampling/importance_sampling_ratio/mean": 0.0028572272043675185, "sampling/importance_sampling_ratio/min": 5.94164918536593e-23, "sampling/sampling_logp_difference/max": 1.725569200515747, "sampling/sampling_logp_difference/mean": 0.03586800619959831, "step": 110, "step_time": 141.95229457244278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.04921875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2462.2, "completions/mean_length": 793.1078125, "completions/mean_terminated_length": 622.2306030273437, "completions/min_length": 117.5, "completions/min_terminated_length": 117.5, "entropy": 0.17735922262072562, "epoch": 0.38461538461538464, "frac_reward_zero_std": 0.053125, "grad_norm": 9.213808267741939e-05, "learning_rate": 8.803053886449644e-06, "loss": -0.0001, "num_tokens": 29246979.0, "reward": -0.2611034892499447, "reward_std": 0.28704351782798765, "rewards/grpo_reward_function/mean": -0.26110349521040915, "rewards/grpo_reward_function/std": 0.7350896656513214, "sampling/importance_sampling_ratio/max": 0.084448746079579, "sampling/importance_sampling_ratio/mean": 0.001967475106357597, "sampling/importance_sampling_ratio/min": 3.0353923760337493e-31, "sampling/sampling_logp_difference/max": 1.6334303975105287, "sampling/sampling_logp_difference/mean": 0.03619050718843937, "step": 120, "step_time": 144.3923004835844 } ], "logging_steps": 10, "max_steps": 312, "num_input_tokens_seen": 30685228, "num_train_epochs": 1, "save_steps": 63, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }