| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.40384615384615385, |
| "eval_steps": 500, |
| "global_step": 126, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 4096.0, |
| "completions/max_terminated_length": 2993.0, |
| "completions/mean_length": 842.1671875, |
| "completions/mean_terminated_length": 625.3808288574219, |
| "completions/min_length": 143.7, |
| "completions/min_terminated_length": 143.7, |
| "entropy": 0.18502369821071624, |
| "epoch": 0.03205128205128205, |
| "frac_reward_zero_std": 0.046875, |
| "grad_norm": 0.00025154745623815123, |
| "learning_rate": 1.4285714285714286e-06, |
| "loss": -0.0002, |
| "num_tokens": 2443746.0, |
| "reward": -0.22845993265509607, |
| "reward_std": 0.2702211543917656, |
| "rewards/grpo_reward_function/mean": -0.2284599334001541, |
| "rewards/grpo_reward_function/std": 0.7218511283397675, |
| "sampling/importance_sampling_ratio/max": 0.12355739073827862, |
| "sampling/importance_sampling_ratio/mean": 0.0029692337644519284, |
| "sampling/importance_sampling_ratio/min": 5.253219159649809e-30, |
| "sampling/sampling_logp_difference/max": 1.5544446587562561, |
| "sampling/sampling_logp_difference/mean": 0.03719679862260818, |
| "step": 10, |
| "step_time": 152.91457959786058 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05625, |
| "completions/max_length": 4096.0, |
| "completions/max_terminated_length": 2889.4, |
| "completions/mean_length": 863.446875, |
| "completions/mean_terminated_length": 671.0787475585937, |
| "completions/min_length": 164.7, |
| "completions/min_terminated_length": 164.7, |
| "entropy": 0.1817332101985812, |
| "epoch": 0.0641025641025641, |
| "frac_reward_zero_std": 0.05, |
| "grad_norm": 1.9538725158444555e-05, |
| "learning_rate": 3.015873015873016e-06, |
| "loss": -0.0001, |
| "num_tokens": 4916654.0, |
| "reward": -0.18899608142673968, |
| "reward_std": 0.3027258589863777, |
| "rewards/grpo_reward_function/mean": -0.18899608589708805, |
| "rewards/grpo_reward_function/std": 0.7454370498657227, |
| "sampling/importance_sampling_ratio/max": 0.0923810960026458, |
| "sampling/importance_sampling_ratio/mean": 0.0023226177279866535, |
| "sampling/importance_sampling_ratio/min": 6.460355799993298e-36, |
| "sampling/sampling_logp_difference/max": 1.6125647306442261, |
| "sampling/sampling_logp_difference/mean": 0.03694682139903307, |
| "step": 20, |
| "step_time": 159.88802388124168 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 4096.0, |
| "completions/max_terminated_length": 2620.1, |
| "completions/mean_length": 832.69296875, |
| "completions/mean_terminated_length": 672.2784484863281, |
| "completions/min_length": 167.4, |
| "completions/min_terminated_length": 167.4, |
| "entropy": 0.19116276763379575, |
| "epoch": 0.09615384615384616, |
| "frac_reward_zero_std": 0.03125, |
| "grad_norm": 0.00031796028415218583, |
| "learning_rate": 4.603174603174604e-06, |
| "loss": -0.0005, |
| "num_tokens": 7342561.0, |
| "reward": -0.20435776934027672, |
| "reward_std": 0.2867868050932884, |
| "rewards/grpo_reward_function/mean": -0.20435777083039283, |
| "rewards/grpo_reward_function/std": 0.7296508550643921, |
| "sampling/importance_sampling_ratio/max": 0.06920737095642834, |
| "sampling/importance_sampling_ratio/mean": 0.0011224002329981886, |
| "sampling/importance_sampling_ratio/min": 5.1338851255929605e-34, |
| "sampling/sampling_logp_difference/max": 1.7805282711982726, |
| "sampling/sampling_logp_difference/mean": 0.03969721123576164, |
| "step": 30, |
| "step_time": 153.14357568509877 |
| }, |
| { |
| "clip_ratio/high_max": 5.1770552090602e-06, |
| "clip_ratio/high_mean": 6.47131901132525e-07, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 6.47131901132525e-07, |
| "completions/clipped_ratio": 0.075, |
| "completions/max_length": 4096.0, |
| "completions/max_terminated_length": 2996.0, |
| "completions/mean_length": 908.88203125, |
| "completions/mean_terminated_length": 650.461572265625, |
| "completions/min_length": 172.5, |
| "completions/min_terminated_length": 172.5, |
| "entropy": 0.18940101601183415, |
| "epoch": 0.1282051282051282, |
| "frac_reward_zero_std": 0.0375, |
| "grad_norm": 0.00046653815964065254, |
| "learning_rate": 6.1904761904761914e-06, |
| "loss": -0.0007, |
| "num_tokens": 9897290.0, |
| "reward": -0.20948485173285009, |
| "reward_std": 0.3274365648627281, |
| "rewards/grpo_reward_function/mean": -0.20948485508561135, |
| "rewards/grpo_reward_function/std": 0.7691417872905731, |
| "sampling/importance_sampling_ratio/max": 0.12555439178831876, |
| "sampling/importance_sampling_ratio/mean": 0.0029983414759044537, |
| "sampling/importance_sampling_ratio/min": 7.312246668686746e-35, |
| "sampling/sampling_logp_difference/max": 1.8727038741111754, |
| "sampling/sampling_logp_difference/mean": 0.03728015031665564, |
| "step": 40, |
| "step_time": 146.35395600683987 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0515625, |
| "completions/max_length": 4096.0, |
| "completions/max_terminated_length": 2913.9, |
| "completions/mean_length": 806.88984375, |
| "completions/mean_terminated_length": 627.2792419433594, |
| "completions/min_length": 123.4, |
| "completions/min_terminated_length": 123.4, |
| "entropy": 0.17744111455976963, |
| "epoch": 0.16025641025641027, |
| "frac_reward_zero_std": 0.05625, |
| "grad_norm": 9.61458607191603e-05, |
| "learning_rate": 7.77777777777778e-06, |
| "loss": 0.0003, |
| "num_tokens": 12302889.0, |
| "reward": -0.18009744100272657, |
| "reward_std": 0.28012245148420334, |
| "rewards/grpo_reward_function/mean": -0.18009743914008142, |
| "rewards/grpo_reward_function/std": 0.7488289535045624, |
| "sampling/importance_sampling_ratio/max": 0.17218044362962245, |
| "sampling/importance_sampling_ratio/mean": 0.004992885573301464, |
| "sampling/importance_sampling_ratio/min": 1.0306885153000913e-30, |
| "sampling/sampling_logp_difference/max": 1.6258580565452576, |
| "sampling/sampling_logp_difference/mean": 0.03607845064252615, |
| "step": 50, |
| "step_time": 147.80184614248574 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03984375, |
| "completions/max_length": 4096.0, |
| "completions/max_terminated_length": 2814.3, |
| "completions/mean_length": 809.65078125, |
| "completions/mean_terminated_length": 673.7666534423828, |
| "completions/min_length": 141.0, |
| "completions/min_terminated_length": 141.0, |
| "entropy": 0.18477382734417916, |
| "epoch": 0.19230769230769232, |
| "frac_reward_zero_std": 0.034375, |
| "grad_norm": 0.0005422528630453429, |
| "learning_rate": 9.365079365079366e-06, |
| "loss": -0.0002, |
| "num_tokens": 14762678.0, |
| "reward": -0.18410459933802487, |
| "reward_std": 0.3047805741429329, |
| "rewards/grpo_reward_function/mean": -0.18410460744053125, |
| "rewards/grpo_reward_function/std": 0.7665571451187134, |
| "sampling/importance_sampling_ratio/max": 0.10517687269020826, |
| "sampling/importance_sampling_ratio/mean": 0.0030420748858887236, |
| "sampling/importance_sampling_ratio/min": 3.9674455768173534e-27, |
| "sampling/sampling_logp_difference/max": 1.571552813053131, |
| "sampling/sampling_logp_difference/mean": 0.03778362385928631, |
| "step": 60, |
| "step_time": 144.8840892329812 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.059375, |
| "completions/max_length": 4096.0, |
| "completions/max_terminated_length": 2730.5, |
| "completions/mean_length": 846.08828125, |
| "completions/mean_terminated_length": 641.0483703613281, |
| "completions/min_length": 134.2, |
| "completions/min_terminated_length": 134.2, |
| "entropy": 0.18578412756323814, |
| "epoch": 0.22435897435897437, |
| "frac_reward_zero_std": 0.03125, |
| "grad_norm": 0.00012190250477276098, |
| "learning_rate": 9.985680226398261e-06, |
| "loss": -0.0002, |
| "num_tokens": 17196835.0, |
| "reward": -0.23204001784324646, |
| "reward_std": 0.3367327839136124, |
| "rewards/grpo_reward_function/mean": -0.23204001784324646, |
| "rewards/grpo_reward_function/std": 0.7099093854427337, |
| "sampling/importance_sampling_ratio/max": 0.04683403689414263, |
| "sampling/importance_sampling_ratio/mean": 0.0009577752702170983, |
| "sampling/importance_sampling_ratio/min": 1.4199457666368224e-36, |
| "sampling/sampling_logp_difference/max": 1.5709129929542542, |
| "sampling/sampling_logp_difference/mean": 0.037703079357743266, |
| "step": 70, |
| "step_time": 156.71906763464213 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0421875, |
| "completions/max_length": 4096.0, |
| "completions/max_terminated_length": 2464.4, |
| "completions/mean_length": 817.75546875, |
| "completions/mean_terminated_length": 674.2539978027344, |
| "completions/min_length": 149.3, |
| "completions/min_terminated_length": 149.3, |
| "entropy": 0.18018180541694165, |
| "epoch": 0.2564102564102564, |
| "frac_reward_zero_std": 0.05, |
| "grad_norm": 5.762501312773685e-05, |
| "learning_rate": 9.89846735808731e-06, |
| "loss": -0.0, |
| "num_tokens": 19646058.0, |
| "reward": -0.1727930422872305, |
| "reward_std": 0.31547281742095945, |
| "rewards/grpo_reward_function/mean": -0.1727930411696434, |
| "rewards/grpo_reward_function/std": 0.7881518006324768, |
| "sampling/importance_sampling_ratio/max": 0.13909520097076894, |
| "sampling/importance_sampling_ratio/mean": 0.004160506054176949, |
| "sampling/importance_sampling_ratio/min": 6.660712543136218e-25, |
| "sampling/sampling_logp_difference/max": 1.5812078952789306, |
| "sampling/sampling_logp_difference/mean": 0.03748476393520832, |
| "step": 80, |
| "step_time": 151.56825386658312 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.04609375, |
| "completions/max_length": 4096.0, |
| "completions/max_terminated_length": 2902.4, |
| "completions/mean_length": 803.97109375, |
| "completions/mean_terminated_length": 644.9527221679688, |
| "completions/min_length": 159.1, |
| "completions/min_terminated_length": 159.1, |
| "entropy": 0.18126422837376593, |
| "epoch": 0.28846153846153844, |
| "frac_reward_zero_std": 0.071875, |
| "grad_norm": 1.7400093375084389e-06, |
| "learning_rate": 9.733381816303395e-06, |
| "loss": 0.0001, |
| "num_tokens": 22062549.0, |
| "reward": -0.1791908085346222, |
| "reward_std": 0.283918160200119, |
| "rewards/grpo_reward_function/mean": -0.1791908085346222, |
| "rewards/grpo_reward_function/std": 0.7836591958999634, |
| "sampling/importance_sampling_ratio/max": 0.15381892090663313, |
| "sampling/importance_sampling_ratio/mean": 0.004421875764819561, |
| "sampling/importance_sampling_ratio/min": 3.314493692446194e-30, |
| "sampling/sampling_logp_difference/max": 1.59914892911911, |
| "sampling/sampling_logp_difference/mean": 0.03814994990825653, |
| "step": 90, |
| "step_time": 143.37383015677332 |
| }, |
| { |
| "clip_ratio/high_max": 2.9168124456191437e-06, |
| "clip_ratio/high_mean": 3.6460155570239296e-07, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 3.6460155570239296e-07, |
| "completions/clipped_ratio": 0.06015625, |
| "completions/max_length": 4096.0, |
| "completions/max_terminated_length": 3118.0, |
| "completions/mean_length": 833.84453125, |
| "completions/mean_terminated_length": 625.6610107421875, |
| "completions/min_length": 146.9, |
| "completions/min_terminated_length": 146.9, |
| "entropy": 0.17976893596351146, |
| "epoch": 0.32051282051282054, |
| "frac_reward_zero_std": 0.053125, |
| "grad_norm": 1.5939340167787748e-05, |
| "learning_rate": 9.493048024473413e-06, |
| "loss": 0.0002, |
| "num_tokens": 24458486.0, |
| "reward": -0.188871893286705, |
| "reward_std": 0.31348400712013247, |
| "rewards/grpo_reward_function/mean": -0.18887189254164696, |
| "rewards/grpo_reward_function/std": 0.7829755127429963, |
| "sampling/importance_sampling_ratio/max": 0.16449878164567053, |
| "sampling/importance_sampling_ratio/mean": 0.005532666263025021, |
| "sampling/importance_sampling_ratio/min": 3.4595329788052715e-35, |
| "sampling/sampling_logp_difference/max": 1.61622656583786, |
| "sampling/sampling_logp_difference/mean": 0.036300059966742994, |
| "step": 100, |
| "step_time": 150.7187235403806 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.05, |
| "completions/max_length": 4096.0, |
| "completions/max_terminated_length": 2412.8, |
| "completions/mean_length": 794.35859375, |
| "completions/mean_terminated_length": 620.5320068359375, |
| "completions/min_length": 128.8, |
| "completions/min_terminated_length": 128.8, |
| "entropy": 0.1772497620433569, |
| "epoch": 0.3525641025641026, |
| "frac_reward_zero_std": 0.046875, |
| "grad_norm": 4.817197293953718e-06, |
| "learning_rate": 9.18128665415186e-06, |
| "loss": -0.0001, |
| "num_tokens": 26866049.0, |
| "reward": -0.22932868972420692, |
| "reward_std": 0.31401748955249786, |
| "rewards/grpo_reward_function/mean": -0.22932869493961333, |
| "rewards/grpo_reward_function/std": 0.7706308960914612, |
| "sampling/importance_sampling_ratio/max": 0.1050915487576276, |
| "sampling/importance_sampling_ratio/mean": 0.0028572272043675185, |
| "sampling/importance_sampling_ratio/min": 5.94164918536593e-23, |
| "sampling/sampling_logp_difference/max": 1.725569200515747, |
| "sampling/sampling_logp_difference/mean": 0.03586800619959831, |
| "step": 110, |
| "step_time": 141.95229457244278 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.04921875, |
| "completions/max_length": 4096.0, |
| "completions/max_terminated_length": 2462.2, |
| "completions/mean_length": 793.1078125, |
| "completions/mean_terminated_length": 622.2306030273437, |
| "completions/min_length": 117.5, |
| "completions/min_terminated_length": 117.5, |
| "entropy": 0.17735922262072562, |
| "epoch": 0.38461538461538464, |
| "frac_reward_zero_std": 0.053125, |
| "grad_norm": 9.213808267741939e-05, |
| "learning_rate": 8.803053886449644e-06, |
| "loss": -0.0001, |
| "num_tokens": 29246979.0, |
| "reward": -0.2611034892499447, |
| "reward_std": 0.28704351782798765, |
| "rewards/grpo_reward_function/mean": -0.26110349521040915, |
| "rewards/grpo_reward_function/std": 0.7350896656513214, |
| "sampling/importance_sampling_ratio/max": 0.084448746079579, |
| "sampling/importance_sampling_ratio/mean": 0.001967475106357597, |
| "sampling/importance_sampling_ratio/min": 3.0353923760337493e-31, |
| "sampling/sampling_logp_difference/max": 1.6334303975105287, |
| "sampling/sampling_logp_difference/mean": 0.03619050718843937, |
| "step": 120, |
| "step_time": 144.3923004835844 |
| } |
| ], |
| "logging_steps": 10, |
| "max_steps": 312, |
| "num_input_tokens_seen": 30685228, |
| "num_train_epochs": 1, |
| "save_steps": 63, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|