{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.20048019207683074, "eval_steps": 500, "global_step": 167, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012500000186264515, "completions/max_length": 2954.7, "completions/max_terminated_length": 1951.6, "completions/mean_length": 545.102099609375, "completions/mean_terminated_length": 500.0404846191406, "completions/min_length": 117.8, "completions/min_terminated_length": 117.8, "entropy": 0.1633674878627062, "epoch": 0.012004801920768308, "frac_reward_zero_std": 0.05000000149011612, "grad_norm": 0.0819886103272438, "learning_rate": 5.389221556886228e-07, "loss": 0.0538, "num_tokens": 1060997.0, "reward": -0.349642014503479, "reward_std": 0.18912948295474052, "rewards/grpo_reward_function/mean": -0.3496420085430145, "rewards/grpo_reward_function/std": 0.4486300081014633, "sampling/importance_sampling_ratio/max": 2.3219674229621887, "sampling/importance_sampling_ratio/mean": 0.3698740124702454, "sampling/importance_sampling_ratio/min": 1.1996005980563495e-06, "sampling/sampling_logp_difference/max": 2.5826863408088685, "sampling/sampling_logp_difference/mean": 0.019079525023698807, "step": 10, "step_time": 591.6221411965787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01250000037252903, "completions/max_length": 2285.6, "completions/max_terminated_length": 1834.2, "completions/mean_length": 525.1979339599609, "completions/mean_terminated_length": 481.30433044433596, "completions/min_length": 111.4, "completions/min_terminated_length": 111.4, "entropy": 0.15604820642620326, "epoch": 0.024009603841536616, "frac_reward_zero_std": 0.0416666679084301, "grad_norm": 0.09551126509904861, "learning_rate": 1.1377245508982037e-06, "loss": -0.0139, "num_tokens": 2123212.0, "reward": -0.298923921585083, "reward_std": 0.21090517602860928, "rewards/grpo_reward_function/mean": -0.2989239178597927, "rewards/grpo_reward_function/std": 0.4665490254759789, "sampling/importance_sampling_ratio/max": 2.135484504699707, "sampling/importance_sampling_ratio/mean": 0.40152732133865354, "sampling/importance_sampling_ratio/min": 2.7301041336613706e-05, "sampling/sampling_logp_difference/max": 2.5806180000305177, "sampling/sampling_logp_difference/mean": 0.019163084402680396, "step": 20, "step_time": 554.5896356501617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01666666716337204, "completions/max_length": 3269.3, "completions/max_terminated_length": 2072.6, "completions/mean_length": 556.327099609375, "completions/mean_terminated_length": 496.14097595214844, "completions/min_length": 132.3, "completions/min_terminated_length": 132.3, "entropy": 0.18481314480304717, "epoch": 0.03601440576230492, "frac_reward_zero_std": 0.05000000074505806, "grad_norm": 0.11139781028032303, "learning_rate": 1.7365269461077847e-06, "loss": 0.0256, "num_tokens": 3227189.0, "reward": -0.409818297624588, "reward_std": 0.22780176997184753, "rewards/grpo_reward_function/mean": -0.4098182961344719, "rewards/grpo_reward_function/std": 0.5334153980016708, "sampling/importance_sampling_ratio/max": 2.277985179424286, "sampling/importance_sampling_ratio/mean": 0.3370798110961914, "sampling/importance_sampling_ratio/min": 2.6906073216806556e-05, "sampling/sampling_logp_difference/max": 2.5000483632087707, "sampling/sampling_logp_difference/mean": 0.02049510907381773, "step": 30, "step_time": 555.5971007851883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666679084301, "completions/max_length": 2135.6, "completions/max_terminated_length": 1478.5, "completions/mean_length": 508.5041778564453, "completions/mean_terminated_length": 472.1120574951172, "completions/min_length": 147.4, "completions/min_terminated_length": 147.4, "entropy": 0.1670758031308651, "epoch": 0.04801920768307323, "frac_reward_zero_std": 0.0416666679084301, "grad_norm": 0.06704321503639221, "learning_rate": 2.3353293413173654e-06, "loss": -0.0127, "num_tokens": 4318559.0, "reward": -0.2258751168847084, "reward_std": 0.16097248084843158, "rewards/grpo_reward_function/mean": -0.22587510757148266, "rewards/grpo_reward_function/std": 0.49552616477012634, "sampling/importance_sampling_ratio/max": 2.3126969814300535, "sampling/importance_sampling_ratio/mean": 0.35644740611314774, "sampling/importance_sampling_ratio/min": 2.7391963689638034e-05, "sampling/sampling_logp_difference/max": 2.8060175657272337, "sampling/sampling_logp_difference/mean": 0.01994446888566017, "step": 40, "step_time": 554.565231207572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00416666679084301, "completions/max_length": 1859.8, "completions/max_terminated_length": 1757.3, "completions/mean_length": 513.3979309082031, "completions/mean_terminated_length": 499.66423950195315, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "entropy": 0.17271990440785884, "epoch": 0.060024009603841535, "frac_reward_zero_std": 0.06666666865348816, "grad_norm": 0.09258268028497696, "learning_rate": 2.9341317365269463e-06, "loss": 0.0015, "num_tokens": 5370018.0, "reward": -0.3178896278142929, "reward_std": 0.1635244082659483, "rewards/grpo_reward_function/mean": -0.3178896352648735, "rewards/grpo_reward_function/std": 0.45689679607748984, "sampling/importance_sampling_ratio/max": 2.195555794239044, "sampling/importance_sampling_ratio/mean": 0.3522403955459595, "sampling/importance_sampling_ratio/min": 7.164277021729504e-05, "sampling/sampling_logp_difference/max": 2.4301879167556764, "sampling/sampling_logp_difference/mean": 0.02071673283353448, "step": 50, "step_time": 548.9367319711484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00416666679084301, "completions/max_length": 2202.6, "completions/max_terminated_length": 1812.6, "completions/mean_length": 506.99168395996094, "completions/mean_terminated_length": 492.1666778564453, "completions/min_length": 127.8, "completions/min_terminated_length": 127.8, "entropy": 0.160004629381001, "epoch": 0.07202881152460984, "frac_reward_zero_std": 0.0416666679084301, "grad_norm": 0.05922295153141022, "learning_rate": 3.5329341317365273e-06, "loss": -0.0033, "num_tokens": 6466162.0, "reward": -0.34021527171134947, "reward_std": 0.18639734461903573, "rewards/grpo_reward_function/mean": -0.34021526128053664, "rewards/grpo_reward_function/std": 0.5187867254018783, "sampling/importance_sampling_ratio/max": 2.010967791080475, "sampling/importance_sampling_ratio/mean": 0.30313637256622317, "sampling/importance_sampling_ratio/min": 3.7146345167826667e-06, "sampling/sampling_logp_difference/max": 2.6932833194732666, "sampling/sampling_logp_difference/mean": 0.02041825857013464, "step": 60, "step_time": 530.4414538932033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.016666666977107523, "completions/max_length": 2579.7, "completions/max_terminated_length": 2077.7, "completions/mean_length": 557.7854309082031, "completions/mean_terminated_length": 499.6593933105469, "completions/min_length": 121.5, "completions/min_terminated_length": 121.5, "entropy": 0.16313071362674236, "epoch": 0.08403361344537816, "frac_reward_zero_std": 0.10000000223517418, "grad_norm": 0.014785214327275753, "learning_rate": 4.131736526946108e-06, "loss": 0.0424, "num_tokens": 7608683.0, "reward": -0.33393135815858843, "reward_std": 0.19106332510709761, "rewards/grpo_reward_function/mean": -0.33393134772777555, "rewards/grpo_reward_function/std": 0.5677398703992367, "sampling/importance_sampling_ratio/max": 1.9730541229248046, "sampling/importance_sampling_ratio/mean": 0.3427995890378952, "sampling/importance_sampling_ratio/min": 1.0688633483368904e-05, "sampling/sampling_logp_difference/max": 2.8619158267974854, "sampling/sampling_logp_difference/mean": 0.0201931843534112, "step": 70, "step_time": 541.6015901661478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01666666716337204, "completions/max_length": 3292.7, "completions/max_terminated_length": 2336.3, "completions/mean_length": 605.0791870117188, "completions/mean_terminated_length": 546.5141571044921, "completions/min_length": 142.3, "completions/min_terminated_length": 142.3, "entropy": 0.16853776723146438, "epoch": 0.09603841536614646, "frac_reward_zero_std": 0.10000000298023223, "grad_norm": 0.0669359341263771, "learning_rate": 4.730538922155689e-06, "loss": 0.003, "num_tokens": 8693089.0, "reward": -0.36407424658536913, "reward_std": 0.15138040184974672, "rewards/grpo_reward_function/mean": -0.364074233174324, "rewards/grpo_reward_function/std": 0.4984104484319687, "sampling/importance_sampling_ratio/max": 2.417657721042633, "sampling/importance_sampling_ratio/mean": 0.3272848010063171, "sampling/importance_sampling_ratio/min": 6.19425904005766e-05, "sampling/sampling_logp_difference/max": 2.832179582118988, "sampling/sampling_logp_difference/mean": 0.019849142245948314, "step": 80, "step_time": 559.4207322074101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.018750000558793545, "completions/max_length": 3217.2, "completions/max_terminated_length": 2202.3, "completions/mean_length": 652.0937744140625, "completions/mean_terminated_length": 587.0378173828125, "completions/min_length": 164.3, "completions/min_terminated_length": 164.3, "entropy": 0.16202539429068566, "epoch": 0.10804321728691477, "frac_reward_zero_std": 0.10000000223517418, "grad_norm": 0.03904345631599426, "learning_rate": 5.32934131736527e-06, "loss": -0.0041, "num_tokens": 9849890.0, "reward": -0.3628114402294159, "reward_std": 0.2544385172426701, "rewards/grpo_reward_function/mean": -0.3628114327788353, "rewards/grpo_reward_function/std": 0.6255016416311264, "sampling/importance_sampling_ratio/max": 1.7914996325969696, "sampling/importance_sampling_ratio/mean": 0.3064177379012108, "sampling/importance_sampling_ratio/min": 1.5591655392199753e-05, "sampling/sampling_logp_difference/max": 2.840235471725464, "sampling/sampling_logp_difference/mean": 0.01814730800688267, "step": 90, "step_time": 555.9057860235683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010416666977107525, "completions/max_length": 2567.4, "completions/max_terminated_length": 2092.1, "completions/mean_length": 548.7250213623047, "completions/mean_terminated_length": 511.10838317871094, "completions/min_length": 116.3, "completions/min_terminated_length": 116.3, "entropy": 0.16799122765660285, "epoch": 0.12004801920768307, "frac_reward_zero_std": 0.10000000223517418, "grad_norm": 0.07849112898111343, "learning_rate": 5.928143712574851e-06, "loss": -0.0009, "num_tokens": 10951862.0, "reward": -0.37651871144771576, "reward_std": 0.17827629819512367, "rewards/grpo_reward_function/mean": -0.3765186980366707, "rewards/grpo_reward_function/std": 0.4988637834787369, "sampling/importance_sampling_ratio/max": 2.2177215456962585, "sampling/importance_sampling_ratio/mean": 0.3616850808262825, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.7038833022117617, "sampling/sampling_logp_difference/mean": 0.019389390759170056, "step": 100, "step_time": 539.0174913492053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.012500000186264515, "completions/max_length": 2678.3, "completions/max_terminated_length": 1888.8, "completions/mean_length": 535.3271057128907, "completions/mean_terminated_length": 491.43089904785154, "completions/min_length": 127.5, "completions/min_terminated_length": 127.5, "entropy": 0.1744688918814063, "epoch": 0.13205282112845138, "frac_reward_zero_std": 0.0833333358168602, "grad_norm": 0.043163955211639404, "learning_rate": 6.526946107784432e-06, "loss": -0.0173, "num_tokens": 12060171.0, "reward": -0.2666824638843536, "reward_std": 0.12587157338857652, "rewards/grpo_reward_function/mean": -0.26668245121836665, "rewards/grpo_reward_function/std": 0.40698017328977587, "sampling/importance_sampling_ratio/max": 1.899654006958008, "sampling/importance_sampling_ratio/mean": 0.37773958817124365, "sampling/importance_sampling_ratio/min": 3.46376573256979e-14, "sampling/sampling_logp_difference/max": 2.272650396823883, "sampling/sampling_logp_difference/mean": 0.01878545032814145, "step": 110, "step_time": 554.1334780954755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01666666716337204, "completions/max_length": 3077.6, "completions/max_terminated_length": 1915.6, "completions/mean_length": 565.0541870117188, "completions/mean_terminated_length": 504.5126983642578, "completions/min_length": 138.5, "completions/min_terminated_length": 138.5, "entropy": 0.17697170842438936, "epoch": 0.14405762304921968, "frac_reward_zero_std": 0.0833333358168602, "grad_norm": 0.07391675561666489, "learning_rate": 7.125748502994012e-06, "loss": 0.042, "num_tokens": 13168921.0, "reward": -0.3262443482875824, "reward_std": 0.20909521877765655, "rewards/grpo_reward_function/mean": -0.3262443423271179, "rewards/grpo_reward_function/std": 0.5003126785159111, "sampling/importance_sampling_ratio/max": 2.301289737224579, "sampling/importance_sampling_ratio/mean": 0.38998747766017916, "sampling/importance_sampling_ratio/min": 7.411608444201079e-05, "sampling/sampling_logp_difference/max": 2.7557363152503966, "sampling/sampling_logp_difference/mean": 0.01885297931730747, "step": 120, "step_time": 549.9578140962869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.00416666679084301, "completions/max_length": 2290.1, "completions/max_terminated_length": 1886.2, "completions/mean_length": 499.789599609375, "completions/mean_terminated_length": 484.77695617675784, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "entropy": 0.19473983831703662, "epoch": 0.15606242496998798, "frac_reward_zero_std": 0.03333333432674408, "grad_norm": 0.06443756073713303, "learning_rate": 7.724550898203594e-06, "loss": -0.0203, "num_tokens": 14212700.0, "reward": -0.26288305670022966, "reward_std": 0.1738448791205883, "rewards/grpo_reward_function/mean": -0.26288305073976515, "rewards/grpo_reward_function/std": 0.48268924951553344, "sampling/importance_sampling_ratio/max": 2.1470563650131225, "sampling/importance_sampling_ratio/mean": 0.35831653475761416, "sampling/importance_sampling_ratio/min": 1.968140890369341e-05, "sampling/sampling_logp_difference/max": 2.0650948524475097, "sampling/sampling_logp_difference/mean": 0.01968124657869339, "step": 130, "step_time": 535.0768479405903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01250000037252903, "completions/max_length": 3015.9, "completions/max_terminated_length": 1749.5, "completions/mean_length": 492.37709045410156, "completions/mean_terminated_length": 446.93010864257815, "completions/min_length": 114.7, "completions/min_terminated_length": 114.7, "entropy": 0.21128173358738422, "epoch": 0.16806722689075632, "frac_reward_zero_std": 0.05833333432674408, "grad_norm": 0.09337731450796127, "learning_rate": 8.323353293413174e-06, "loss": 0.0277, "num_tokens": 15293077.0, "reward": -0.26287811398506167, "reward_std": 0.14675465896725653, "rewards/grpo_reward_function/mean": -0.26287810802459716, "rewards/grpo_reward_function/std": 0.37649901360273363, "sampling/importance_sampling_ratio/max": 2.3814776659011843, "sampling/importance_sampling_ratio/mean": 0.46577124297618866, "sampling/importance_sampling_ratio/min": 1.3427511260960534e-06, "sampling/sampling_logp_difference/max": 2.155888545513153, "sampling/sampling_logp_difference/mean": 0.019245322328060865, "step": 140, "step_time": 552.9976656335406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 4.9971032422035935e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 4.9971032422035935e-05, "completions/clipped_ratio": 0.01041666679084301, "completions/max_length": 2447.6, "completions/max_terminated_length": 1785.5, "completions/mean_length": 552.295849609375, "completions/mean_terminated_length": 515.5673645019531, "completions/min_length": 144.7, "completions/min_terminated_length": 144.7, "entropy": 0.2773955374956131, "epoch": 0.18007202881152462, "frac_reward_zero_std": 0.02500000074505806, "grad_norm": 0.1007687970995903, "learning_rate": 8.922155688622756e-06, "loss": -0.0141, "num_tokens": 16425531.0, "reward": -0.26935882605612277, "reward_std": 0.12829533144831656, "rewards/grpo_reward_function/mean": -0.2693588202819228, "rewards/grpo_reward_function/std": 0.33063299730420115, "sampling/importance_sampling_ratio/max": 2.1382891178131103, "sampling/importance_sampling_ratio/mean": 0.4445547193288803, "sampling/importance_sampling_ratio/min": 7.412413807410812e-05, "sampling/sampling_logp_difference/max": 1.7894923090934753, "sampling/sampling_logp_difference/mean": 0.019091704115271568, "step": 150, "step_time": 547.844954107888 }, { "clip_ratio/high_max": 4.80769231216982e-05, "clip_ratio/high_mean": 8.012820762814954e-06, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 8.012820762814954e-06, "completions/clipped_ratio": 0.010416666977107525, "completions/max_length": 2388.9, "completions/max_terminated_length": 1982.4, "completions/mean_length": 544.4896026611328, "completions/mean_terminated_length": 507.44931945800784, "completions/min_length": 122.8, "completions/min_terminated_length": 122.8, "entropy": 0.3212181769311428, "epoch": 0.19207683073229292, "frac_reward_zero_std": 0.06666666865348816, "grad_norm": 0.06833557039499283, "learning_rate": 9.520958083832336e-06, "loss": 0.0134, "num_tokens": 17564686.0, "reward": -0.28017824441194533, "reward_std": 0.20728585943579675, "rewards/grpo_reward_function/mean": -0.28017824441194533, "rewards/grpo_reward_function/std": 0.5502792000770569, "sampling/importance_sampling_ratio/max": 2.1408735513687134, "sampling/importance_sampling_ratio/mean": 0.5016659319400787, "sampling/importance_sampling_ratio/min": 0.000778414961314411, "sampling/sampling_logp_difference/max": 1.89249027967453, "sampling/sampling_logp_difference/mean": 0.01902961954474449, "step": 160, "step_time": 536.2867619435303 } ], "logging_steps": 10, "max_steps": 833, "num_input_tokens_seen": 18381921, "num_train_epochs": 1, "save_steps": 167, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }