{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.8076923076923077, "eval_steps": 500, "global_step": 84, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0229166672565043, "completions/max_length": 4096.0, "completions/max_terminated_length": 3560.9, "completions/mean_length": 522.7101684570313, "completions/mean_terminated_length": 438.9754211425781, "completions/min_length": 75.0, "completions/min_terminated_length": 75.0, "entropy": 0.23113284446299076, "epoch": 0.09615384615384616, "frac_reward_zero_std": 0.02291666707023978, "grad_norm": 0.0001292548916491907, "learning_rate": 4.2857142857142855e-06, "loss": -0.0001, "num_tokens": 7139751.0, "reward": -0.3463470071554184, "reward_std": 0.19510238319635392, "rewards/grpo_reward_function/mean": -0.34634698629379274, "rewards/grpo_reward_function/std": 0.45176688134670256, "sampling/importance_sampling_ratio/max": 0.2500600881874561, "sampling/importance_sampling_ratio/mean": 0.003153010329697281, "sampling/importance_sampling_ratio/min": 4.422263656305887e-39, "sampling/sampling_logp_difference/max": 1.7915182709693909, "sampling/sampling_logp_difference/mean": 0.04889127053320408, "step": 10, "step_time": 190.44577815867962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.023697917722165583, "completions/max_length": 4096.0, "completions/max_terminated_length": 3451.8, "completions/mean_length": 534.3229278564453, "completions/mean_terminated_length": 448.0358642578125, "completions/min_length": 88.7, "completions/min_terminated_length": 88.7, "entropy": 0.2307925283908844, "epoch": 0.19230769230769232, "frac_reward_zero_std": 0.030208333767950533, "grad_norm": 7.460668293122211e-05, "learning_rate": 9.047619047619049e-06, "loss": -0.0002, "num_tokens": 14396583.0, "reward": -0.37422587275505065, "reward_std": 0.20355914533138275, "rewards/grpo_reward_function/mean": -0.3742258608341217, "rewards/grpo_reward_function/std": 0.44767938256263734, "sampling/importance_sampling_ratio/max": 0.19214814556762577, "sampling/importance_sampling_ratio/mean": 0.0025246856122976167, "sampling/importance_sampling_ratio/min": 5.423524047837388e-32, "sampling/sampling_logp_difference/max": 1.7129023194313049, "sampling/sampling_logp_difference/mean": 0.04906127564609051, "step": 20, "step_time": 190.18596366122364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.021354167256504297, "completions/max_length": 4096.0, "completions/max_terminated_length": 3362.2, "completions/mean_length": 523.9229278564453, "completions/mean_terminated_length": 446.04356384277344, "completions/min_length": 94.6, "completions/min_terminated_length": 94.6, "entropy": 0.22211343944072723, "epoch": 0.28846153846153844, "frac_reward_zero_std": 0.027083334140479566, "grad_norm": 0.00013486736422447032, "learning_rate": 9.772520313857777e-06, "loss": -0.0001, "num_tokens": 21575715.0, "reward": -0.3505605816841125, "reward_std": 0.18663191050291061, "rewards/grpo_reward_function/mean": -0.35056057274341584, "rewards/grpo_reward_function/std": 0.43719949424266813, "sampling/importance_sampling_ratio/max": 0.27146717831492423, "sampling/importance_sampling_ratio/mean": 0.0029611586302053182, "sampling/importance_sampling_ratio/min": 3.801722733713229e-43, "sampling/sampling_logp_difference/max": 1.808135986328125, "sampling/sampling_logp_difference/mean": 0.04732150360941887, "step": 30, "step_time": 188.15189309306442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.01718750074505806, "completions/max_length": 4096.0, "completions/max_terminated_length": 3561.5, "completions/mean_length": 516.6395965576172, "completions/mean_terminated_length": 454.0846740722656, "completions/min_length": 95.4, "completions/min_terminated_length": 95.4, "entropy": 0.23415493555366992, "epoch": 0.38461538461538464, "frac_reward_zero_std": 0.03125000027939677, "grad_norm": 0.000136481436864881, "learning_rate": 8.883744025880429e-06, "loss": -0.0001, "num_tokens": 28670267.0, "reward": -0.3659424602985382, "reward_std": 0.17396533936262132, "rewards/grpo_reward_function/mean": -0.36594244837760925, "rewards/grpo_reward_function/std": 0.4192140996456146, "sampling/importance_sampling_ratio/max": 0.15043866001069545, "sampling/importance_sampling_ratio/mean": 0.0019588641240261494, "sampling/importance_sampling_ratio/min": 4.154415544199142e-41, "sampling/sampling_logp_difference/max": 1.778145396709442, "sampling/sampling_logp_difference/mean": 0.050116677582263944, "step": 40, "step_time": 187.47059447690845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02057291716337204, "completions/max_length": 4096.0, "completions/max_terminated_length": 3445.9, "completions/mean_length": 533.0474151611328, "completions/mean_terminated_length": 458.3273406982422, "completions/min_length": 89.6, "completions/min_terminated_length": 89.6, "entropy": 0.21524659655988215, "epoch": 0.4807692307692308, "frac_reward_zero_std": 0.03437500102445483, "grad_norm": 0.0005527222513477307, "learning_rate": 7.445169960349167e-06, "loss": 0.0001, "num_tokens": 35851893.0, "reward": -0.3463290870189667, "reward_std": 0.19029797241091728, "rewards/grpo_reward_function/mean": -0.34632907509803773, "rewards/grpo_reward_function/std": 0.468786346912384, "sampling/importance_sampling_ratio/max": 0.3124311394989491, "sampling/importance_sampling_ratio/mean": 0.003457725653424859, "sampling/importance_sampling_ratio/min": 6.522424396341872e-34, "sampling/sampling_logp_difference/max": 1.7508255124092102, "sampling/sampling_logp_difference/mean": 0.04619445130228996, "step": 50, "step_time": 187.6461409341544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.019531250465661287, "completions/max_length": 4096.0, "completions/max_terminated_length": 3405.2, "completions/mean_length": 567.5502746582031, "completions/mean_terminated_length": 497.2075164794922, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "entropy": 0.21499812975525856, "epoch": 0.5769230769230769, "frac_reward_zero_std": 0.023958333674818276, "grad_norm": 0.0002768893576569048, "learning_rate": 5.660448208208513e-06, "loss": -0.0, "num_tokens": 43292650.0, "reward": -0.3658351093530655, "reward_std": 0.2153971642255783, "rewards/grpo_reward_function/mean": -0.36583509147167204, "rewards/grpo_reward_function/std": 0.46224400103092195, "sampling/importance_sampling_ratio/max": 0.2361154653131962, "sampling/importance_sampling_ratio/mean": 0.0022639297298155726, "sampling/importance_sampling_ratio/min": 3.015787716454037e-38, "sampling/sampling_logp_difference/max": 1.769980216026306, "sampling/sampling_logp_difference/mean": 0.04610508680343628, "step": 60, "step_time": 187.66385944783687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.027604167629033328, "completions/max_length": 4096.0, "completions/max_terminated_length": 3738.3, "completions/mean_length": 598.5659057617188, "completions/mean_terminated_length": 499.3739959716797, "completions/min_length": 111.5, "completions/min_terminated_length": 111.5, "entropy": 0.21177774332463742, "epoch": 0.6730769230769231, "frac_reward_zero_std": 0.025000000931322576, "grad_norm": 7.337733368660454e-05, "learning_rate": 3.782230861445041e-06, "loss": 0.0, "num_tokens": 50694323.0, "reward": -0.4116409093141556, "reward_std": 0.22881191819906235, "rewards/grpo_reward_function/mean": -0.4116408973932266, "rewards/grpo_reward_function/std": 0.4689377576112747, "sampling/importance_sampling_ratio/max": 0.12385215684771538, "sampling/importance_sampling_ratio/mean": 0.0014503307553241029, "sampling/importance_sampling_ratio/min": 5.254869241218064e-41, "sampling/sampling_logp_difference/max": 1.805948269367218, "sampling/sampling_logp_difference/mean": 0.04531049989163875, "step": 70, "step_time": 189.22959316521883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.02161458395421505, "completions/max_length": 4096.0, "completions/max_terminated_length": 3813.9, "completions/mean_length": 575.7130401611328, "completions/mean_terminated_length": 497.99981689453125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "entropy": 0.211932235956192, "epoch": 0.7692307692307693, "frac_reward_zero_std": 0.03229166744276881, "grad_norm": 6.826014804673302e-05, "learning_rate": 2.0764056088797646e-06, "loss": -0.0001, "num_tokens": 58011305.0, "reward": -0.3900724709033966, "reward_std": 0.21758214086294175, "rewards/grpo_reward_function/mean": -0.3900724589824677, "rewards/grpo_reward_function/std": 0.4689822793006897, "sampling/importance_sampling_ratio/max": 0.2649656251072884, "sampling/importance_sampling_ratio/mean": 0.0030477925203740595, "sampling/importance_sampling_ratio/min": 1.6902078646393643e-36, "sampling/sampling_logp_difference/max": 1.7864648818969726, "sampling/sampling_logp_difference/mean": 0.04543661251664162, "step": 80, "step_time": 187.76453976780175 } ], "logging_steps": 10, "max_steps": 104, "num_input_tokens_seen": 60954036, "num_train_epochs": 1, "save_steps": 21, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 24, "trial_name": null, "trial_params": null }