{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.20048019207683074, "eval_steps": 500, "global_step": 167, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01041666679084301, "completions/max_length": 2827.7, "completions/max_terminated_length": 2365.2, "completions/mean_length": 630.6958435058593, "completions/mean_terminated_length": 594.2115203857422, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "entropy": 0.11724737156182527, "epoch": 0.012004801920768308, "frac_reward_zero_std": 0.13333333507180214, "grad_norm": 0.038755763322114944, "learning_rate": 5.389221556886228e-07, "loss": 0.0113, "num_tokens": 799206.0, "reward": -0.18518302096053957, "reward_std": 0.20015475898981094, "rewards/grpo_reward_function/mean": -0.18518302938900888, "rewards/grpo_reward_function/std": 0.6885311886668205, "sampling/importance_sampling_ratio/max": 2.1990586280822755, "sampling/importance_sampling_ratio/mean": 0.4647279143333435, "sampling/importance_sampling_ratio/min": 0.0005068443759228102, "sampling/sampling_logp_difference/max": 2.5390082478523253, "sampling/sampling_logp_difference/mean": 0.013516949955374002, "step": 10, "step_time": 569.2043406252749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.010416666977107525, "completions/max_length": 2742.3, "completions/max_terminated_length": 2382.7, "completions/mean_length": 646.3521057128906, "completions/mean_terminated_length": 610.6208740234375, "completions/min_length": 131.6, "completions/min_terminated_length": 131.6, "entropy": 0.1267015876248479, "epoch": 0.024009603841536616, "frac_reward_zero_std": 0.1083333358168602, "grad_norm": 0.10266362875699997, "learning_rate": 1.1377245508982037e-06, "loss": -0.0225, "num_tokens": 1617099.0, "reward": 0.01770310625433922, "reward_std": 0.23654931634664536, "rewards/grpo_reward_function/mean": 0.0177031047642231, "rewards/grpo_reward_function/std": 0.8463600814342499, "sampling/importance_sampling_ratio/max": 1.9834328293800354, "sampling/importance_sampling_ratio/mean": 0.40317725837230683, "sampling/importance_sampling_ratio/min": 0.0032052009667828283, "sampling/sampling_logp_difference/max": 2.1022926926612855, "sampling/sampling_logp_difference/mean": 0.01352061601355672, "step": 20, "step_time": 548.7977518392727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.014583333767950535, "completions/max_length": 3068.7, "completions/max_terminated_length": 2151.3, "completions/mean_length": 682.6271087646485, "completions/mean_terminated_length": 632.6335662841797, "completions/min_length": 188.7, "completions/min_terminated_length": 188.7, "entropy": 0.125905223749578, "epoch": 0.03601440576230492, "frac_reward_zero_std": 0.11666666939854622, "grad_norm": 0.06394433230161667, "learning_rate": 1.7365269461077847e-06, "loss": 0.0229, "num_tokens": 2465988.0, "reward": -0.18962360136210918, "reward_std": 0.19849726594984532, "rewards/grpo_reward_function/mean": -0.18962358720600606, "rewards/grpo_reward_function/std": 0.6894359931349754, "sampling/importance_sampling_ratio/max": 2.460964298248291, "sampling/importance_sampling_ratio/mean": 0.4253284126520157, "sampling/importance_sampling_ratio/min": 1.070212653598215e-05, "sampling/sampling_logp_difference/max": 2.8448187589645384, "sampling/sampling_logp_difference/mean": 0.013953791093081236, "step": 30, "step_time": 554.4699578347615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01458333358168602, "completions/max_length": 2789.3, "completions/max_terminated_length": 1984.1, "completions/mean_length": 639.958349609375, "completions/mean_terminated_length": 589.7686553955078, "completions/min_length": 163.7, "completions/min_terminated_length": 163.7, "entropy": 0.11666738856583833, "epoch": 0.04801920768307323, "frac_reward_zero_std": 0.1166666679084301, "grad_norm": 0.08781701326370239, "learning_rate": 2.3353293413173654e-06, "loss": -0.0064, "num_tokens": 3297428.0, "reward": -0.03914917185902596, "reward_std": 0.21894535794854164, "rewards/grpo_reward_function/mean": -0.03914917148649692, "rewards/grpo_reward_function/std": 0.8605277180671692, "sampling/importance_sampling_ratio/max": 2.0170334696769716, "sampling/importance_sampling_ratio/mean": 0.4818507760763168, "sampling/importance_sampling_ratio/min": 0.0015486635098906688, "sampling/sampling_logp_difference/max": 2.52269823551178, "sampling/sampling_logp_difference/mean": 0.012881174683570862, "step": 40, "step_time": 541.3490906376392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 4.2163060425082224e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 4.2163060425082224e-05, "completions/clipped_ratio": 0.016666667349636555, "completions/max_length": 2619.1, "completions/max_terminated_length": 2081.8, "completions/mean_length": 660.1000122070312, "completions/mean_terminated_length": 602.3141662597657, "completions/min_length": 205.5, "completions/min_terminated_length": 205.5, "entropy": 0.12803181819617748, "epoch": 0.060024009603841535, "frac_reward_zero_std": 0.1416666701436043, "grad_norm": 0.03966222703456879, "learning_rate": 2.9341317365269463e-06, "loss": 0.0112, "num_tokens": 4129824.0, "reward": -0.11274411627091467, "reward_std": 0.2275936236605048, "rewards/grpo_reward_function/mean": -0.1127441140357405, "rewards/grpo_reward_function/std": 0.8841595828533173, "sampling/importance_sampling_ratio/max": 2.008124852180481, "sampling/importance_sampling_ratio/mean": 0.46366433799266815, "sampling/importance_sampling_ratio/min": 0.0006394427657710367, "sampling/sampling_logp_difference/max": 2.58479106426239, "sampling/sampling_logp_difference/mean": 0.013615725003182888, "step": 50, "step_time": 545.8774313618429 }, { "clip_ratio/high_max": 4.673766961786896e-05, "clip_ratio/high_mean": 7.7896114817122e-06, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 7.7896114817122e-06, "completions/clipped_ratio": 0.01250000037252903, "completions/max_length": 2304.9, "completions/max_terminated_length": 1602.8, "completions/mean_length": 596.5750213623047, "completions/mean_terminated_length": 552.8821624755859, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "entropy": 0.12017892487347126, "epoch": 0.07202881152460984, "frac_reward_zero_std": 0.14166666865348815, "grad_norm": 0.09313877671957016, "learning_rate": 3.5329341317365273e-06, "loss": -0.0307, "num_tokens": 4936176.0, "reward": -0.03057028874754906, "reward_std": 0.2686158835887909, "rewards/grpo_reward_function/mean": -0.03057028613984585, "rewards/grpo_reward_function/std": 0.8661522060632706, "sampling/importance_sampling_ratio/max": 2.2043559432029722, "sampling/importance_sampling_ratio/mean": 0.4847503274679184, "sampling/importance_sampling_ratio/min": 7.13271651690217e-05, "sampling/sampling_logp_difference/max": 2.4851160287857055, "sampling/sampling_logp_difference/mean": 0.013510057888925075, "step": 60, "step_time": 546.4311281181872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 6.860105058876797e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 6.860105058876797e-05, "completions/clipped_ratio": 0.018750000558793545, "completions/max_length": 3205.2, "completions/max_terminated_length": 2270.1, "completions/mean_length": 702.3604431152344, "completions/mean_terminated_length": 636.497216796875, "completions/min_length": 131.8, "completions/min_terminated_length": 131.8, "entropy": 0.11768119670450687, "epoch": 0.08403361344537816, "frac_reward_zero_std": 0.1083333358168602, "grad_norm": 0.0433771014213562, "learning_rate": 4.131736526946108e-06, "loss": 0.0553, "num_tokens": 5841149.0, "reward": -0.0784481130540371, "reward_std": 0.23132488708943127, "rewards/grpo_reward_function/mean": -0.07844811640679836, "rewards/grpo_reward_function/std": 0.8492624998092652, "sampling/importance_sampling_ratio/max": 2.233333742618561, "sampling/importance_sampling_ratio/mean": 0.4808308959007263, "sampling/importance_sampling_ratio/min": 0.0008302704439188347, "sampling/sampling_logp_difference/max": 2.9827078700065615, "sampling/sampling_logp_difference/mean": 0.012630783580243587, "step": 70, "step_time": 561.5568902881816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 8.41788569232449e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 8.41788569232449e-05, "completions/clipped_ratio": 0.01250000037252903, "completions/max_length": 2541.9, "completions/max_terminated_length": 2033.2, "completions/mean_length": 612.5875213623046, "completions/mean_terminated_length": 569.556509399414, "completions/min_length": 149.1, "completions/min_terminated_length": 149.1, "entropy": 0.13153507560491562, "epoch": 0.09603841536614646, "frac_reward_zero_std": 0.10833333656191826, "grad_norm": 0.08665835857391357, "learning_rate": 4.730538922155689e-06, "loss": 0.0701, "num_tokens": 6606395.0, "reward": -0.011540251970291137, "reward_std": 0.19073452726006507, "rewards/grpo_reward_function/mean": -0.011540257930755615, "rewards/grpo_reward_function/std": 0.784630474448204, "sampling/importance_sampling_ratio/max": 2.1984647274017335, "sampling/importance_sampling_ratio/mean": 0.5050391256809235, "sampling/importance_sampling_ratio/min": 0.00014755414913452113, "sampling/sampling_logp_difference/max": 1.8997669577598573, "sampling/sampling_logp_difference/mean": 0.013426258694380522, "step": 80, "step_time": 551.2647462010384 }, { "clip_ratio/high_max": 2.2563176753465086e-05, "clip_ratio/high_mean": 3.760529580176808e-06, "clip_ratio/low_mean": 1.3224284339230508e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 1.6984813919407314e-05, "completions/clipped_ratio": 0.01041666679084301, "completions/max_length": 2347.6, "completions/max_terminated_length": 2078.4, "completions/mean_length": 621.4354370117187, "completions/mean_terminated_length": 586.118701171875, "completions/min_length": 163.2, "completions/min_terminated_length": 163.2, "entropy": 0.1206895818002522, "epoch": 0.10804321728691477, "frac_reward_zero_std": 0.1083333358168602, "grad_norm": 0.034579165279865265, "learning_rate": 5.32934131736527e-06, "loss": 0.0011, "num_tokens": 7424828.0, "reward": 0.02708094713743776, "reward_std": 0.23181376457214356, "rewards/grpo_reward_function/mean": 0.027080959058366716, "rewards/grpo_reward_function/std": 0.8183064997196198, "sampling/importance_sampling_ratio/max": 2.499999237060547, "sampling/importance_sampling_ratio/mean": 0.486982923746109, "sampling/importance_sampling_ratio/min": 0.000999147113179788, "sampling/sampling_logp_difference/max": 2.079863798618317, "sampling/sampling_logp_difference/mean": 0.012986462097615004, "step": 90, "step_time": 550.7672496054322 }, { "clip_ratio/high_max": 0.00031043787457747386, "clip_ratio/high_mean": 5.173964618734317e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.173964618734317e-05, "completions/clipped_ratio": 0.01666666716337204, "completions/max_length": 2765.9, "completions/max_terminated_length": 1842.8, "completions/mean_length": 675.4458435058593, "completions/mean_terminated_length": 618.1135345458985, "completions/min_length": 178.2, "completions/min_terminated_length": 178.2, "entropy": 0.149181258212775, "epoch": 0.12004801920768307, "frac_reward_zero_std": 0.1416666701436043, "grad_norm": 0.13291294872760773, "learning_rate": 5.928143712574851e-06, "loss": 0.0212, "num_tokens": 8278282.0, "reward": 0.0703774506226182, "reward_std": 0.2336222641170025, "rewards/grpo_reward_function/mean": 0.07037745183333755, "rewards/grpo_reward_function/std": 0.8314530551433563, "sampling/importance_sampling_ratio/max": 2.2870466232299806, "sampling/importance_sampling_ratio/mean": 0.4643064886331558, "sampling/importance_sampling_ratio/min": 2.921815394074656e-05, "sampling/sampling_logp_difference/max": 1.9242668151855469, "sampling/sampling_logp_difference/mean": 0.014198462665081023, "step": 100, "step_time": 547.0480061549694 }, { "clip_ratio/high_max": 0.0003238706885895226, "clip_ratio/high_mean": 5.397844997787615e-05, "clip_ratio/low_mean": 7.069677012623287e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00012467522010410902, "completions/clipped_ratio": 0.020833334513008596, "completions/max_length": 2925.3, "completions/max_terminated_length": 2297.9, "completions/mean_length": 680.045849609375, "completions/mean_terminated_length": 605.8596954345703, "completions/min_length": 186.9, "completions/min_terminated_length": 186.9, "entropy": 0.1511568833142519, "epoch": 0.13205282112845138, "frac_reward_zero_std": 0.08333333507180214, "grad_norm": 0.0421764962375164, "learning_rate": 6.526946107784432e-06, "loss": -0.0031, "num_tokens": 9165800.0, "reward": 0.04289367534220219, "reward_std": 0.24053554534912108, "rewards/grpo_reward_function/mean": 0.042893677949905396, "rewards/grpo_reward_function/std": 0.835248938202858, "sampling/importance_sampling_ratio/max": 2.332168984413147, "sampling/importance_sampling_ratio/mean": 0.4403663039207458, "sampling/importance_sampling_ratio/min": 0.0001706225667930994, "sampling/sampling_logp_difference/max": 2.4483426809310913, "sampling/sampling_logp_difference/mean": 0.014532316662371158, "step": 110, "step_time": 548.8019280240871 }, { "clip_ratio/high_max": 0.00012998266611248256, "clip_ratio/high_mean": 2.16637781704776e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.16637781704776e-05, "completions/clipped_ratio": 0.002083333395421505, "completions/max_length": 1812.6, "completions/max_terminated_length": 1592.9, "completions/mean_length": 602.1333526611328, "completions/mean_terminated_length": 594.61220703125, "completions/min_length": 179.7, "completions/min_terminated_length": 179.7, "entropy": 0.16710406728088856, "epoch": 0.14405762304921968, "frac_reward_zero_std": 0.0833333358168602, "grad_norm": 0.07664494961500168, "learning_rate": 7.125748502994012e-06, "loss": -0.0309, "num_tokens": 9975204.0, "reward": 0.0826782912015915, "reward_std": 0.23934805542230606, "rewards/grpo_reward_function/mean": 0.0826782874763012, "rewards/grpo_reward_function/std": 0.8862796187400818, "sampling/importance_sampling_ratio/max": 2.2503564238548277, "sampling/importance_sampling_ratio/mean": 0.4635925680398941, "sampling/importance_sampling_ratio/min": 0.0009416027547558823, "sampling/sampling_logp_difference/max": 2.073215699195862, "sampling/sampling_logp_difference/mean": 0.01480921907350421, "step": 120, "step_time": 539.0031213279814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 1.9831826648442074e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 1.9831826648442074e-06, "completions/clipped_ratio": 0.006250000186264515, "completions/max_length": 2290.7, "completions/max_terminated_length": 1880.6, "completions/mean_length": 599.5916900634766, "completions/mean_terminated_length": 578.0104217529297, "completions/min_length": 168.6, "completions/min_terminated_length": 168.6, "entropy": 0.16012020353227854, "epoch": 0.15606242496998798, "frac_reward_zero_std": 0.11666667014360428, "grad_norm": 0.05220530927181244, "learning_rate": 7.724550898203594e-06, "loss": -0.0377, "num_tokens": 10768324.0, "reward": -0.0507307555526495, "reward_std": 0.18351687043905257, "rewards/grpo_reward_function/mean": -0.05073075201362372, "rewards/grpo_reward_function/std": 0.7544578343629837, "sampling/importance_sampling_ratio/max": 2.375898337364197, "sampling/importance_sampling_ratio/mean": 0.524286350607872, "sampling/importance_sampling_ratio/min": 0.00018855740054277704, "sampling/sampling_logp_difference/max": 2.009746181964874, "sampling/sampling_logp_difference/mean": 0.01376222250983119, "step": 130, "step_time": 548.6409472068772 }, { "clip_ratio/high_max": 0.00017211703816428782, "clip_ratio/high_mean": 2.86861730273813e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.86861730273813e-05, "completions/clipped_ratio": 0.00833333358168602, "completions/max_length": 3136.9, "completions/max_terminated_length": 2221.0, "completions/mean_length": 684.8500244140625, "completions/mean_terminated_length": 655.5515625, "completions/min_length": 168.7, "completions/min_terminated_length": 168.7, "entropy": 0.12020768839865922, "epoch": 0.16806722689075632, "frac_reward_zero_std": 0.09166666939854622, "grad_norm": 0.058197326958179474, "learning_rate": 8.323353293413174e-06, "loss": -0.0342, "num_tokens": 11642436.0, "reward": 0.04102597634773701, "reward_std": 0.2864942252635956, "rewards/grpo_reward_function/mean": 0.041025977826211604, "rewards/grpo_reward_function/std": 0.8844284832477569, "sampling/importance_sampling_ratio/max": 2.37525737285614, "sampling/importance_sampling_ratio/mean": 0.46323378682136535, "sampling/importance_sampling_ratio/min": 2.6157076149502247e-08, "sampling/sampling_logp_difference/max": 2.5657184720039368, "sampling/sampling_logp_difference/mean": 0.012760929018259048, "step": 140, "step_time": 550.6772611703724 }, { "clip_ratio/high_max": 0.00029233113455120476, "clip_ratio/high_mean": 4.872185563726816e-05, "clip_ratio/low_mean": 4.4254150270717216e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 9.297600590798538e-05, "completions/clipped_ratio": 0.01041666679084301, "completions/max_length": 2211.6, "completions/max_terminated_length": 1961.7, "completions/mean_length": 601.2291839599609, "completions/mean_terminated_length": 565.5705291748047, "completions/min_length": 143.6, "completions/min_terminated_length": 143.6, "entropy": 0.10819828314706684, "epoch": 0.18007202881152462, "frac_reward_zero_std": 0.10833333656191826, "grad_norm": 0.04749957472085953, "learning_rate": 8.922155688622756e-06, "loss": -0.0236, "num_tokens": 12486318.0, "reward": 0.03778684511780739, "reward_std": 0.25178585574030876, "rewards/grpo_reward_function/mean": 0.03778683394193649, "rewards/grpo_reward_function/std": 0.7447861909866333, "sampling/importance_sampling_ratio/max": 2.481464517116547, "sampling/importance_sampling_ratio/mean": 0.5163449585437775, "sampling/importance_sampling_ratio/min": 3.668112331070006e-05, "sampling/sampling_logp_difference/max": 2.379316544532776, "sampling/sampling_logp_difference/mean": 0.012185737490653992, "step": 150, "step_time": 551.4488250606694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 5.082125426270068e-06, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 5.082125426270068e-06, "completions/clipped_ratio": 0.002083333395421505, "completions/max_length": 2301.5, "completions/max_terminated_length": 2174.7, "completions/mean_length": 625.8979309082031, "completions/mean_terminated_length": 618.7978820800781, "completions/min_length": 170.2, "completions/min_terminated_length": 170.2, "entropy": 0.10213978644460439, "epoch": 0.19207683073229292, "frac_reward_zero_std": 0.08333333507180214, "grad_norm": 0.05615560710430145, "learning_rate": 9.520958083832336e-06, "loss": 0.0043, "num_tokens": 13325121.0, "reward": 0.06630225274711847, "reward_std": 0.18489644899964333, "rewards/grpo_reward_function/mean": 0.0663022572407499, "rewards/grpo_reward_function/std": 0.7666326016187668, "sampling/importance_sampling_ratio/max": 2.1301008343696592, "sampling/importance_sampling_ratio/mean": 0.44916791915893556, "sampling/importance_sampling_ratio/min": 7.07070047610614e-05, "sampling/sampling_logp_difference/max": 2.524372959136963, "sampling/sampling_logp_difference/mean": 0.013301923777908087, "step": 160, "step_time": 538.0738848904148 } ], "logging_steps": 10, "max_steps": 833, "num_input_tokens_seen": 13935881, "num_train_epochs": 1, "save_steps": 167, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }