Hallucination-R1-warmup / trainer_state.json
Yuwh07's picture
Upload folder using huggingface_hub
151deae verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.8333333333333334,
"eval_steps": 500,
"global_step": 80,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio": 0.0,
"completion_length": 153.1875,
"epoch": 0.010416666666666666,
"grad_norm": 2.2964696301344203,
"kl": 0.0008754730224609375,
"learning_rate": 1e-06,
"loss": -0.0104,
"num_tokens": 44706.0,
"reward": 0.46666670590639114,
"reward_std": 0.7099685594439507,
"rewards/warm_up_reward/mean": 0.3888888955116272,
"rewards/warm_up_reward/std": 0.7168056517839432,
"step": 1
},
{
"clip_ratio": 0.0,
"completion_length": 147.95833587646484,
"epoch": 0.020833333333333332,
"grad_norm": 2.332332033549255,
"kl": 0.0011844635009765625,
"learning_rate": 1e-06,
"loss": 0.0071,
"num_tokens": 88904.0,
"reward": 0.5250000506639481,
"reward_std": 0.748512014746666,
"rewards/warm_up_reward/mean": 0.4375,
"rewards/warm_up_reward/std": 0.6997176110744476,
"step": 2
},
{
"clip_ratio": 0.0,
"completion_length": 154.00000381469727,
"epoch": 0.03125,
"grad_norm": 2.04671487874671,
"kl": 0.0013523101806640625,
"learning_rate": 1e-06,
"loss": 0.0124,
"num_tokens": 133634.0,
"reward": 0.500000037252903,
"reward_std": 0.667382538318634,
"rewards/warm_up_reward/mean": 0.416666679084301,
"rewards/warm_up_reward/std": 0.7197580486536026,
"step": 3
},
{
"clip_ratio": 0.0,
"completion_length": 151.69792556762695,
"epoch": 0.041666666666666664,
"grad_norm": 2.2153093232246226,
"kl": 0.00243377685546875,
"learning_rate": 1e-06,
"loss": 0.0232,
"num_tokens": 178089.0,
"reward": 0.43125002086162567,
"reward_std": 0.662388876080513,
"rewards/warm_up_reward/mean": 0.359375,
"rewards/warm_up_reward/std": 0.6697845309972763,
"step": 4
},
{
"clip_ratio": 0.0,
"completion_length": 148.84375762939453,
"epoch": 0.052083333333333336,
"grad_norm": 2.4723944423129782,
"kl": 0.004669189453125,
"learning_rate": 1e-06,
"loss": 0.0002,
"num_tokens": 222360.0,
"reward": 0.6250000596046448,
"reward_std": 0.8341160118579865,
"rewards/warm_up_reward/mean": 0.5208333358168602,
"rewards/warm_up_reward/std": 0.7749323397874832,
"step": 5
},
{
"clip_ratio": 0.0,
"completion_length": 147.4791717529297,
"epoch": 0.0625,
"grad_norm": 2.452268566541841,
"kl": 0.00980377197265625,
"learning_rate": 1e-06,
"loss": 0.05,
"num_tokens": 266434.0,
"reward": 0.5625000298023224,
"reward_std": 0.8674589395523071,
"rewards/warm_up_reward/mean": 0.46875,
"rewards/warm_up_reward/std": 0.7468476742506027,
"step": 6
},
{
"clip_ratio": 0.0,
"completion_length": 147.79166793823242,
"epoch": 0.07291666666666667,
"grad_norm": 55.84129867431237,
"kl": 0.194091796875,
"learning_rate": 1e-06,
"loss": 0.0511,
"num_tokens": 310532.0,
"reward": 0.9333333820104599,
"reward_std": 0.9551109671592712,
"rewards/warm_up_reward/mean": 0.7777777910232544,
"rewards/warm_up_reward/std": 0.8151216059923172,
"step": 7
},
{
"clip_ratio": 0.0,
"completion_length": 157.65625381469727,
"epoch": 0.08333333333333333,
"grad_norm": 29.072228981795064,
"kl": 0.1204376220703125,
"learning_rate": 1e-06,
"loss": 0.0653,
"num_tokens": 355673.0,
"reward": 0.8281250894069672,
"reward_std": 0.9019797444343567,
"rewards/warm_up_reward/mean": 0.6901041716337204,
"rewards/warm_up_reward/std": 0.800490528345108,
"step": 8
},
{
"clip_ratio": 0.0,
"completion_length": 147.62500381469727,
"epoch": 0.09375,
"grad_norm": 6.698990666563141,
"kl": 0.04058837890625,
"learning_rate": 1e-06,
"loss": 0.0742,
"num_tokens": 399929.0,
"reward": 0.6916667073965073,
"reward_std": 0.7857400476932526,
"rewards/warm_up_reward/mean": 0.5763888955116272,
"rewards/warm_up_reward/std": 0.7836765646934509,
"step": 9
},
{
"clip_ratio": 0.0,
"completion_length": 150.2291717529297,
"epoch": 0.10416666666666667,
"grad_norm": 5.758187436239667,
"kl": 0.1109619140625,
"learning_rate": 1e-06,
"loss": 0.0303,
"num_tokens": 444279.0,
"reward": 0.9000000357627869,
"reward_std": 0.9381224364042282,
"rewards/warm_up_reward/mean": 0.75,
"rewards/warm_up_reward/std": 0.820982426404953,
"step": 10
},
{
"clip_ratio": 0.0,
"completion_length": 150.67708587646484,
"epoch": 0.11458333333333333,
"grad_norm": 24.363046434997802,
"kl": 0.2352294921875,
"learning_rate": 1e-06,
"loss": 0.0567,
"num_tokens": 488708.0,
"reward": 0.8645834177732468,
"reward_std": 0.9693308770656586,
"rewards/warm_up_reward/mean": 0.720486119389534,
"rewards/warm_up_reward/std": 0.8197668194770813,
"step": 11
},
{
"clip_ratio": 0.0,
"completion_length": 155.2916717529297,
"epoch": 0.125,
"grad_norm": 3.1714199544174053,
"kl": 0.08984375,
"learning_rate": 1e-06,
"loss": 0.0481,
"num_tokens": 533526.0,
"reward": 1.0781250596046448,
"reward_std": 0.9790745824575424,
"rewards/warm_up_reward/mean": 0.8984375,
"rewards/warm_up_reward/std": 0.8148495256900787,
"step": 12
},
{
"clip_ratio": 0.0,
"completion_length": 140.2395896911621,
"epoch": 0.13541666666666666,
"grad_norm": 5.687529927842251,
"kl": 0.12872314453125,
"learning_rate": 1e-06,
"loss": 0.0633,
"num_tokens": 576965.0,
"reward": 0.9635417610406876,
"reward_std": 0.9496043026447296,
"rewards/warm_up_reward/mean": 0.802951380610466,
"rewards/warm_up_reward/std": 0.7969174236059189,
"step": 13
},
{
"clip_ratio": 0.0,
"completion_length": 137.87500762939453,
"epoch": 0.14583333333333334,
"grad_norm": 4.785248899065895,
"kl": 0.09088134765625,
"learning_rate": 1e-06,
"loss": 0.0956,
"num_tokens": 620003.0,
"reward": 0.8687500655651093,
"reward_std": 0.8496406525373459,
"rewards/warm_up_reward/mean": 0.7239583283662796,
"rewards/warm_up_reward/std": 0.7836050242185593,
"step": 14
},
{
"clip_ratio": 0.0,
"completion_length": 140.87500381469727,
"epoch": 0.15625,
"grad_norm": 3.043938278706536,
"kl": 0.06243896484375,
"learning_rate": 1e-06,
"loss": 0.048,
"num_tokens": 663347.0,
"reward": 0.8843750804662704,
"reward_std": 0.9135490357875824,
"rewards/warm_up_reward/mean": 0.7369791716337204,
"rewards/warm_up_reward/std": 0.8226732462644577,
"step": 15
},
{
"clip_ratio": 0.0,
"completion_length": 145.0833396911621,
"epoch": 0.16666666666666666,
"grad_norm": 4.964409464782977,
"kl": 0.0794677734375,
"learning_rate": 1e-06,
"loss": 0.0574,
"num_tokens": 707341.0,
"reward": 0.9437500536441803,
"reward_std": 0.8649309277534485,
"rewards/warm_up_reward/mean": 0.7864583283662796,
"rewards/warm_up_reward/std": 0.795333594083786,
"step": 16
},
{
"clip_ratio": 0.0,
"completion_length": 143.76041793823242,
"epoch": 0.17708333333333334,
"grad_norm": 2.5933725635254965,
"kl": 0.0601806640625,
"learning_rate": 1e-06,
"loss": 0.0293,
"num_tokens": 751028.0,
"reward": 0.7000000327825546,
"reward_std": 0.9639299660921097,
"rewards/warm_up_reward/mean": 0.5833333283662796,
"rewards/warm_up_reward/std": 0.7868598401546478,
"step": 17
},
{
"clip_ratio": 0.0,
"completion_length": 140.90625381469727,
"epoch": 0.1875,
"grad_norm": 2.560263758795748,
"kl": 0.0513916015625,
"learning_rate": 1e-06,
"loss": 0.0333,
"num_tokens": 794597.0,
"reward": 0.947916716337204,
"reward_std": 0.9378172904253006,
"rewards/warm_up_reward/mean": 0.7899305373430252,
"rewards/warm_up_reward/std": 0.8166805952787399,
"step": 18
},
{
"clip_ratio": 0.0,
"completion_length": 134.42708587646484,
"epoch": 0.19791666666666666,
"grad_norm": 2.827678010439988,
"kl": 0.05694580078125,
"learning_rate": 1e-06,
"loss": 0.0819,
"num_tokens": 837364.0,
"reward": 0.9250000715255737,
"reward_std": 0.9560818523168564,
"rewards/warm_up_reward/mean": 0.7708333283662796,
"rewards/warm_up_reward/std": 0.8091117739677429,
"step": 19
},
{
"clip_ratio": 0.0,
"completion_length": 137.16666793823242,
"epoch": 0.20833333333333334,
"grad_norm": 2.92256682920272,
"kl": 0.05963134765625,
"learning_rate": 1e-06,
"loss": 0.0522,
"num_tokens": 880442.0,
"reward": 0.9791667610406876,
"reward_std": 0.9076904356479645,
"rewards/warm_up_reward/mean": 0.8159722089767456,
"rewards/warm_up_reward/std": 0.8102934062480927,
"step": 20
},
{
"clip_ratio": 0.0,
"completion_length": 137.84375381469727,
"epoch": 0.21875,
"grad_norm": 2.864931852177023,
"kl": 0.04974365234375,
"learning_rate": 1e-06,
"loss": 0.0248,
"num_tokens": 923615.0,
"reward": 0.9281250536441803,
"reward_std": 0.9918985664844513,
"rewards/warm_up_reward/mean": 0.7734375,
"rewards/warm_up_reward/std": 0.8185379058122635,
"step": 21
},
{
"clip_ratio": 0.0,
"completion_length": 130.8750057220459,
"epoch": 0.22916666666666666,
"grad_norm": 2.8348309851740456,
"kl": 0.05242919921875,
"learning_rate": 1e-06,
"loss": 0.0427,
"num_tokens": 966107.0,
"reward": 1.0531250834465027,
"reward_std": 0.9989801347255707,
"rewards/warm_up_reward/mean": 0.8776041567325592,
"rewards/warm_up_reward/std": 0.8102044314146042,
"step": 22
},
{
"clip_ratio": 0.0,
"completion_length": 144.26041793823242,
"epoch": 0.23958333333333334,
"grad_norm": 2.6071380741149004,
"kl": 0.06549072265625,
"learning_rate": 1e-06,
"loss": 0.0523,
"num_tokens": 1009998.0,
"reward": 0.9625000357627869,
"reward_std": 0.9378929734230042,
"rewards/warm_up_reward/mean": 0.8020833432674408,
"rewards/warm_up_reward/std": 0.8023640215396881,
"step": 23
},
{
"clip_ratio": 0.0,
"completion_length": 139.73958587646484,
"epoch": 0.25,
"grad_norm": 2.697375275936146,
"kl": 0.05670166015625,
"learning_rate": 1e-06,
"loss": 0.0705,
"num_tokens": 1053401.0,
"reward": 1.031250074505806,
"reward_std": 0.9829376488924026,
"rewards/warm_up_reward/mean": 0.859375,
"rewards/warm_up_reward/std": 0.813440352678299,
"step": 24
},
{
"clip_ratio": 0.0,
"completion_length": 138.8854217529297,
"epoch": 0.2604166666666667,
"grad_norm": 85.52835351864486,
"kl": 0.198486328125,
"learning_rate": 1e-06,
"loss": 0.0333,
"num_tokens": 1096650.0,
"reward": 1.089583471417427,
"reward_std": 0.9481612741947174,
"rewards/warm_up_reward/mean": 0.9079861044883728,
"rewards/warm_up_reward/std": 0.7854074388742447,
"step": 25
},
{
"clip_ratio": 0.0,
"completion_length": 130.62500381469727,
"epoch": 0.2708333333333333,
"grad_norm": 3.4064495832019297,
"kl": 0.07513427734375,
"learning_rate": 1e-06,
"loss": 0.0544,
"num_tokens": 1139058.0,
"reward": 0.9885417520999908,
"reward_std": 0.990489736199379,
"rewards/warm_up_reward/mean": 0.8237847238779068,
"rewards/warm_up_reward/std": 0.8235566318035126,
"step": 26
},
{
"clip_ratio": 0.0,
"completion_length": 134.50000381469727,
"epoch": 0.28125,
"grad_norm": 118.32186343464612,
"kl": 0.2930908203125,
"learning_rate": 1e-06,
"loss": 0.011,
"num_tokens": 1181802.0,
"reward": 1.0552084296941757,
"reward_std": 0.868858814239502,
"rewards/warm_up_reward/mean": 0.8793402910232544,
"rewards/warm_up_reward/std": 0.8186527788639069,
"step": 27
},
{
"clip_ratio": 0.0,
"completion_length": 140.8541717529297,
"epoch": 0.2916666666666667,
"grad_norm": 6.711195724360143,
"kl": 0.1092529296875,
"learning_rate": 1e-06,
"loss": 0.0187,
"num_tokens": 1225408.0,
"reward": 0.9666667431592941,
"reward_std": 0.9556048065423965,
"rewards/warm_up_reward/mean": 0.8055555671453476,
"rewards/warm_up_reward/std": 0.8192583322525024,
"step": 28
},
{
"clip_ratio": 0.0,
"completion_length": 137.48958778381348,
"epoch": 0.3020833333333333,
"grad_norm": 4.712858469896548,
"kl": 0.13214111328125,
"learning_rate": 1e-06,
"loss": 0.0356,
"num_tokens": 1268481.0,
"reward": 0.9000000655651093,
"reward_std": 1.0170713812112808,
"rewards/warm_up_reward/mean": 0.7500000149011612,
"rewards/warm_up_reward/std": 0.8409183472394943,
"step": 29
},
{
"clip_ratio": 0.0,
"completion_length": 142.09375381469727,
"epoch": 0.3125,
"grad_norm": 2.604660112727859,
"kl": 0.063720703125,
"learning_rate": 1e-06,
"loss": -0.0023,
"num_tokens": 1312008.0,
"reward": 0.9000000059604645,
"reward_std": 0.9869166016578674,
"rewards/warm_up_reward/mean": 0.75,
"rewards/warm_up_reward/std": 0.8337783664464951,
"step": 30
},
{
"clip_ratio": 0.0,
"completion_length": 145.59375381469727,
"epoch": 0.3229166666666667,
"grad_norm": 6.185404235919849,
"kl": 0.157470703125,
"learning_rate": 1e-06,
"loss": 0.0097,
"num_tokens": 1356039.0,
"reward": 1.0770834237337112,
"reward_std": 0.9777155965566635,
"rewards/warm_up_reward/mean": 0.8975694477558136,
"rewards/warm_up_reward/std": 0.7948237210512161,
"step": 31
},
{
"clip_ratio": 0.0,
"completion_length": 146.37500762939453,
"epoch": 0.3333333333333333,
"grad_norm": 2.4537091049295956,
"kl": 0.06231689453125,
"learning_rate": 1e-06,
"loss": 0.0046,
"num_tokens": 1400007.0,
"reward": 0.9812500327825546,
"reward_std": 0.911426916718483,
"rewards/warm_up_reward/mean": 0.8177083283662796,
"rewards/warm_up_reward/std": 0.7836276739835739,
"step": 32
},
{
"clip_ratio": 0.0,
"completion_length": 145.19792556762695,
"epoch": 0.34375,
"grad_norm": 3.0265092714690702,
"kl": 0.061767578125,
"learning_rate": 1e-06,
"loss": 0.0423,
"num_tokens": 1443874.0,
"reward": 1.0354167222976685,
"reward_std": 0.9364243745803833,
"rewards/warm_up_reward/mean": 0.8628472238779068,
"rewards/warm_up_reward/std": 0.8118415027856827,
"step": 33
},
{
"clip_ratio": 0.0,
"completion_length": 143.8645896911621,
"epoch": 0.3541666666666667,
"grad_norm": 2.4770416085087477,
"kl": 0.05682373046875,
"learning_rate": 1e-06,
"loss": 0.0242,
"num_tokens": 1487601.0,
"reward": 0.9906250834465027,
"reward_std": 1.0336193144321442,
"rewards/warm_up_reward/mean": 0.8255208432674408,
"rewards/warm_up_reward/std": 0.8268236815929413,
"step": 34
},
{
"clip_ratio": 0.0,
"completion_length": 137.34375762939453,
"epoch": 0.3645833333333333,
"grad_norm": 2.787024849258649,
"kl": 0.051513671875,
"learning_rate": 1e-06,
"loss": 0.0409,
"num_tokens": 1530594.0,
"reward": 1.0822917073965073,
"reward_std": 0.8545732349157333,
"rewards/warm_up_reward/mean": 0.9019097238779068,
"rewards/warm_up_reward/std": 0.7748938798904419,
"step": 35
},
{
"clip_ratio": 0.0,
"completion_length": 145.71875381469727,
"epoch": 0.375,
"grad_norm": 2.515335560097886,
"kl": 0.0687255859375,
"learning_rate": 1e-06,
"loss": 0.0284,
"num_tokens": 1574409.0,
"reward": 0.9875000715255737,
"reward_std": 0.9168877303600311,
"rewards/warm_up_reward/mean": 0.8229166567325592,
"rewards/warm_up_reward/std": 0.8267286717891693,
"step": 36
},
{
"clip_ratio": 0.0,
"completion_length": 136.58333778381348,
"epoch": 0.3854166666666667,
"grad_norm": 11.790654099301383,
"kl": 0.1207275390625,
"learning_rate": 1e-06,
"loss": 0.0652,
"num_tokens": 1617503.0,
"reward": 1.031250074505806,
"reward_std": 0.9142753481864929,
"rewards/warm_up_reward/mean": 0.859375,
"rewards/warm_up_reward/std": 0.7878352403640747,
"step": 37
},
{
"clip_ratio": 0.0,
"completion_length": 151.8645896911621,
"epoch": 0.3958333333333333,
"grad_norm": 3.4202024786499643,
"kl": 0.08099365234375,
"learning_rate": 1e-06,
"loss": 0.0212,
"num_tokens": 1661980.0,
"reward": 0.6229167133569717,
"reward_std": 0.8097958564758301,
"rewards/warm_up_reward/mean": 0.5190972313284874,
"rewards/warm_up_reward/std": 0.7617596387863159,
"step": 38
},
{
"clip_ratio": 0.0,
"completion_length": 142.8854217529297,
"epoch": 0.40625,
"grad_norm": 2.4994048524913364,
"kl": 0.06414794921875,
"learning_rate": 1e-06,
"loss": 0.0569,
"num_tokens": 1705637.0,
"reward": 1.0656251087784767,
"reward_std": 0.8034301698207855,
"rewards/warm_up_reward/mean": 0.8880208432674408,
"rewards/warm_up_reward/std": 0.7390912175178528,
"step": 39
},
{
"clip_ratio": 0.0,
"completion_length": 146.18750762939453,
"epoch": 0.4166666666666667,
"grad_norm": 2.4999575203414186,
"kl": 0.0628662109375,
"learning_rate": 1e-06,
"loss": 0.0137,
"num_tokens": 1749719.0,
"reward": 1.0104167312383652,
"reward_std": 0.9736887365579605,
"rewards/warm_up_reward/mean": 0.8420138955116272,
"rewards/warm_up_reward/std": 0.822531133890152,
"step": 40
},
{
"clip_ratio": 0.0,
"completion_length": 143.9479217529297,
"epoch": 0.4270833333333333,
"grad_norm": 2.2639443093591205,
"kl": 0.0675048828125,
"learning_rate": 1e-06,
"loss": 0.0087,
"num_tokens": 1793436.0,
"reward": 0.9229167401790619,
"reward_std": 0.9153347015380859,
"rewards/warm_up_reward/mean": 0.7690972089767456,
"rewards/warm_up_reward/std": 0.8183709383010864,
"step": 41
},
{
"clip_ratio": 0.0,
"completion_length": 147.58333587646484,
"epoch": 0.4375,
"grad_norm": 2.5193978009218965,
"kl": 0.07177734375,
"learning_rate": 1e-06,
"loss": 0.0392,
"num_tokens": 1837562.0,
"reward": 1.006250038743019,
"reward_std": 0.8919505327939987,
"rewards/warm_up_reward/mean": 0.8385416716337204,
"rewards/warm_up_reward/std": 0.8046689182519913,
"step": 42
},
{
"clip_ratio": 0.0,
"completion_length": 141.37500381469727,
"epoch": 0.4479166666666667,
"grad_norm": 2.5496207567669034,
"kl": 0.06622314453125,
"learning_rate": 1e-06,
"loss": 0.0452,
"num_tokens": 1880972.0,
"reward": 1.068750038743019,
"reward_std": 0.9111972749233246,
"rewards/warm_up_reward/mean": 0.890625,
"rewards/warm_up_reward/std": 0.7989336252212524,
"step": 43
},
{
"clip_ratio": 0.0,
"completion_length": 130.83333587646484,
"epoch": 0.4583333333333333,
"grad_norm": 5.245713134644536,
"kl": 0.0999755859375,
"learning_rate": 1e-06,
"loss": -0.0051,
"num_tokens": 1923460.0,
"reward": 0.929166704416275,
"reward_std": 0.8137651234865189,
"rewards/warm_up_reward/mean": 0.7743055373430252,
"rewards/warm_up_reward/std": 0.8136637955904007,
"step": 44
},
{
"clip_ratio": 0.0,
"completion_length": 144.2604217529297,
"epoch": 0.46875,
"grad_norm": 6.317144379422631,
"kl": 0.10107421875,
"learning_rate": 1e-06,
"loss": -0.0175,
"num_tokens": 1967315.0,
"reward": 1.1012500673532486,
"reward_std": 0.7948171943426132,
"rewards/warm_up_reward/mean": 0.9177083224058151,
"rewards/warm_up_reward/std": 0.7791551500558853,
"step": 45
},
{
"clip_ratio": 0.0,
"completion_length": 141.5104217529297,
"epoch": 0.4791666666666667,
"grad_norm": 2.8932825846308172,
"kl": 0.0859375,
"learning_rate": 1e-06,
"loss": 0.0029,
"num_tokens": 2010882.0,
"reward": 1.0656251162290573,
"reward_std": 0.9644656330347061,
"rewards/warm_up_reward/mean": 0.8880208432674408,
"rewards/warm_up_reward/std": 0.8194199502468109,
"step": 46
},
{
"clip_ratio": 0.0,
"completion_length": 140.21875762939453,
"epoch": 0.4895833333333333,
"grad_norm": 2.9093719889125174,
"kl": 0.1007080078125,
"learning_rate": 1e-06,
"loss": -0.0275,
"num_tokens": 2054307.0,
"reward": 0.991666704416275,
"reward_std": 0.9324973523616791,
"rewards/warm_up_reward/mean": 0.8263888955116272,
"rewards/warm_up_reward/std": 0.8247981667518616,
"step": 47
},
{
"clip_ratio": 0.0,
"completion_length": 136.5729217529297,
"epoch": 0.5,
"grad_norm": 2.3209209113606146,
"kl": 0.08441162109375,
"learning_rate": 1e-06,
"loss": 0.001,
"num_tokens": 2097424.0,
"reward": 1.1010417491197586,
"reward_std": 0.8305719494819641,
"rewards/warm_up_reward/mean": 0.9175347238779068,
"rewards/warm_up_reward/std": 0.7910451591014862,
"step": 48
},
{
"clip_ratio": 0.0,
"completion_length": 142.50000762939453,
"epoch": 0.5104166666666666,
"grad_norm": 2.470827147583925,
"kl": 0.08355712890625,
"learning_rate": 1e-06,
"loss": 0.0433,
"num_tokens": 2141092.0,
"reward": 1.0208334028720856,
"reward_std": 0.9052031934261322,
"rewards/warm_up_reward/mean": 0.8506944477558136,
"rewards/warm_up_reward/std": 0.8133653849363327,
"step": 49
},
{
"clip_ratio": 0.0,
"completion_length": 138.58333587646484,
"epoch": 0.5208333333333334,
"grad_norm": 2.314915736053297,
"kl": 0.0765380859375,
"learning_rate": 1e-06,
"loss": 0.0188,
"num_tokens": 2184306.0,
"reward": 1.050000086426735,
"reward_std": 0.9242848604917526,
"rewards/warm_up_reward/mean": 0.8749999850988388,
"rewards/warm_up_reward/std": 0.7892495840787888,
"step": 50
},
{
"clip_ratio": 0.0,
"completion_length": 138.92708778381348,
"epoch": 0.53125,
"grad_norm": 16.28250872141934,
"kl": 0.26171875,
"learning_rate": 1e-06,
"loss": 0.0328,
"num_tokens": 2227673.0,
"reward": 1.1312500685453415,
"reward_std": 1.0009342432022095,
"rewards/warm_up_reward/mean": 0.9427083432674408,
"rewards/warm_up_reward/std": 0.8114291131496429,
"step": 51
},
{
"clip_ratio": 0.0,
"completion_length": 140.92708587646484,
"epoch": 0.5416666666666666,
"grad_norm": 2.4052567403889364,
"kl": 0.0528564453125,
"learning_rate": 1e-06,
"loss": 0.0201,
"num_tokens": 2271064.0,
"reward": 1.2375000417232513,
"reward_std": 0.8487301468849182,
"rewards/warm_up_reward/mean": 1.03125,
"rewards/warm_up_reward/std": 0.7739475220441818,
"step": 52
},
{
"clip_ratio": 0.0,
"completion_length": 146.1458396911621,
"epoch": 0.5520833333333334,
"grad_norm": 2.4582483635661467,
"kl": 0.05413818359375,
"learning_rate": 1e-06,
"loss": 0.0429,
"num_tokens": 2315052.0,
"reward": 1.1104167699813843,
"reward_std": 0.9700468927621841,
"rewards/warm_up_reward/mean": 0.9253472238779068,
"rewards/warm_up_reward/std": 0.7955830246210098,
"step": 53
},
{
"clip_ratio": 0.0,
"completion_length": 145.2083396911621,
"epoch": 0.5625,
"grad_norm": 2.4577305981193165,
"kl": 0.0728759765625,
"learning_rate": 1e-06,
"loss": -0.0131,
"num_tokens": 2358962.0,
"reward": 1.1750001013278961,
"reward_std": 0.9326367676258087,
"rewards/warm_up_reward/mean": 0.9791666716337204,
"rewards/warm_up_reward/std": 0.7897387892007828,
"step": 54
},
{
"clip_ratio": 0.0,
"completion_length": 145.7916717529297,
"epoch": 0.5729166666666666,
"grad_norm": 2.302326467124225,
"kl": 0.0810546875,
"learning_rate": 1e-06,
"loss": 0.0291,
"num_tokens": 2402922.0,
"reward": 0.9875000566244125,
"reward_std": 1.0013651847839355,
"rewards/warm_up_reward/mean": 0.8229166716337204,
"rewards/warm_up_reward/std": 0.8247637003660202,
"step": 55
},
{
"clip_ratio": 0.0,
"completion_length": 147.9479217529297,
"epoch": 0.5833333333333334,
"grad_norm": 2.580316913788417,
"kl": 0.068115234375,
"learning_rate": 1e-06,
"loss": 0.0509,
"num_tokens": 2447065.0,
"reward": 1.183750033378601,
"reward_std": 0.9039967954158783,
"rewards/warm_up_reward/mean": 0.9864583313465118,
"rewards/warm_up_reward/std": 0.787983849644661,
"step": 56
},
{
"clip_ratio": 0.0,
"completion_length": 142.05208587646484,
"epoch": 0.59375,
"grad_norm": 2.748934001160214,
"kl": 0.06414794921875,
"learning_rate": 1e-06,
"loss": -0.0078,
"num_tokens": 2490552.0,
"reward": 1.07750004529953,
"reward_std": 0.9763932228088379,
"rewards/warm_up_reward/mean": 0.8979166746139526,
"rewards/warm_up_reward/std": 0.8105349242687225,
"step": 57
},
{
"clip_ratio": 0.0,
"completion_length": 146.0729217529297,
"epoch": 0.6041666666666666,
"grad_norm": 2.279728805159882,
"kl": 0.05682373046875,
"learning_rate": 1e-06,
"loss": 0.0054,
"num_tokens": 2534503.0,
"reward": 1.2562500685453415,
"reward_std": 0.8728772848844528,
"rewards/warm_up_reward/mean": 1.046875,
"rewards/warm_up_reward/std": 0.715716764330864,
"step": 58
},
{
"clip_ratio": 0.0,
"completion_length": 136.77083587646484,
"epoch": 0.6145833333333334,
"grad_norm": 2.301940905359112,
"kl": 0.0634765625,
"learning_rate": 1e-06,
"loss": 0.0198,
"num_tokens": 2577603.0,
"reward": 1.2200001031160355,
"reward_std": 0.9238942861557007,
"rewards/warm_up_reward/mean": 1.0166666805744171,
"rewards/warm_up_reward/std": 0.7837828695774078,
"step": 59
},
{
"clip_ratio": 0.0,
"completion_length": 149.3645896911621,
"epoch": 0.625,
"grad_norm": 2.3088797777576535,
"kl": 0.0701904296875,
"learning_rate": 1e-06,
"loss": 0.0668,
"num_tokens": 2621852.0,
"reward": 1.0885417461395264,
"reward_std": 0.9921838045120239,
"rewards/warm_up_reward/mean": 0.9071180820465088,
"rewards/warm_up_reward/std": 0.8187949508428574,
"step": 60
},
{
"clip_ratio": 0.0,
"completion_length": 144.9791717529297,
"epoch": 0.6354166666666666,
"grad_norm": 2.3531788106957063,
"kl": 0.0830078125,
"learning_rate": 1e-06,
"loss": 0.0715,
"num_tokens": 2665584.0,
"reward": 1.103541761636734,
"reward_std": 0.8904776722192764,
"rewards/warm_up_reward/mean": 0.9196180552244186,
"rewards/warm_up_reward/std": 0.7996000051498413,
"step": 61
},
{
"clip_ratio": 0.0,
"completion_length": 140.5416717529297,
"epoch": 0.6458333333333334,
"grad_norm": 2.3780245058749894,
"kl": 0.0545654296875,
"learning_rate": 1e-06,
"loss": 0.0144,
"num_tokens": 2709058.0,
"reward": 1.2650001347064972,
"reward_std": 0.8583473563194275,
"rewards/warm_up_reward/mean": 1.0541666597127914,
"rewards/warm_up_reward/std": 0.753357321023941,
"step": 62
},
{
"clip_ratio": 0.0,
"completion_length": 150.5208396911621,
"epoch": 0.65625,
"grad_norm": 2.291724896507479,
"kl": 0.05914306640625,
"learning_rate": 1e-06,
"loss": 0.0102,
"num_tokens": 2753538.0,
"reward": 1.2431251406669617,
"reward_std": 0.8321643471717834,
"rewards/warm_up_reward/mean": 1.0359375476837158,
"rewards/warm_up_reward/std": 0.74222831428051,
"step": 63
},
{
"clip_ratio": 0.0,
"completion_length": 143.2395896911621,
"epoch": 0.6666666666666666,
"grad_norm": 3.018275289311917,
"kl": 0.085693359375,
"learning_rate": 1e-06,
"loss": 0.0078,
"num_tokens": 2797253.0,
"reward": 1.1802085041999817,
"reward_std": 0.948539987206459,
"rewards/warm_up_reward/mean": 0.9835069626569748,
"rewards/warm_up_reward/std": 0.7748740911483765,
"step": 64
},
{
"clip_ratio": 0.0,
"completion_length": 138.9791717529297,
"epoch": 0.6770833333333334,
"grad_norm": 2.219343433713507,
"kl": 0.06298828125,
"learning_rate": 1e-06,
"loss": 0.0203,
"num_tokens": 2840571.0,
"reward": 1.2229167520999908,
"reward_std": 0.783911868929863,
"rewards/warm_up_reward/mean": 1.0190972089767456,
"rewards/warm_up_reward/std": 0.7600821256637573,
"step": 65
},
{
"clip_ratio": 0.0,
"completion_length": 152.8958396911621,
"epoch": 0.6875,
"grad_norm": 2.2126312475293877,
"kl": 0.0704345703125,
"learning_rate": 1e-06,
"loss": 0.0425,
"num_tokens": 2885201.0,
"reward": 1.1691668182611465,
"reward_std": 0.8853475451469421,
"rewards/warm_up_reward/mean": 0.9743055552244186,
"rewards/warm_up_reward/std": 0.7726792246103287,
"step": 66
},
{
"clip_ratio": 0.0,
"completion_length": 147.61459350585938,
"epoch": 0.6979166666666666,
"grad_norm": 2.3729988144079748,
"kl": 0.05316162109375,
"learning_rate": 1e-06,
"loss": 0.0235,
"num_tokens": 2929396.0,
"reward": 1.3312501311302185,
"reward_std": 0.7987091541290283,
"rewards/warm_up_reward/mean": 1.109375,
"rewards/warm_up_reward/std": 0.7549550831317902,
"step": 67
},
{
"clip_ratio": 0.0,
"completion_length": 144.1041717529297,
"epoch": 0.7083333333333334,
"grad_norm": 2.268946790543885,
"kl": 0.04766845703125,
"learning_rate": 1e-06,
"loss": -0.0027,
"num_tokens": 2973200.0,
"reward": 1.325416773557663,
"reward_std": 0.8379913568496704,
"rewards/warm_up_reward/mean": 1.1045138835906982,
"rewards/warm_up_reward/std": 0.7519785463809967,
"step": 68
},
{
"clip_ratio": 0.0,
"completion_length": 140.4791717529297,
"epoch": 0.71875,
"grad_norm": 2.40410834715932,
"kl": 0.0625,
"learning_rate": 1e-06,
"loss": 0.0508,
"num_tokens": 3016728.0,
"reward": 1.143750011920929,
"reward_std": 0.8826231509447098,
"rewards/warm_up_reward/mean": 0.953125,
"rewards/warm_up_reward/std": 0.7572390139102936,
"step": 69
},
{
"clip_ratio": 0.0,
"completion_length": 145.9791717529297,
"epoch": 0.7291666666666666,
"grad_norm": 2.377991017189631,
"kl": 0.06512451171875,
"learning_rate": 1e-06,
"loss": 0.0505,
"num_tokens": 3060790.0,
"reward": 1.2854167819023132,
"reward_std": 0.9141092300415039,
"rewards/warm_up_reward/mean": 1.0711805671453476,
"rewards/warm_up_reward/std": 0.7661919444799423,
"step": 70
},
{
"clip_ratio": 0.0,
"completion_length": 144.7916717529297,
"epoch": 0.7395833333333334,
"grad_norm": 5.490360584080418,
"kl": 0.068115234375,
"learning_rate": 1e-06,
"loss": 0.0061,
"num_tokens": 3104672.0,
"reward": 1.2937501072883606,
"reward_std": 0.9385685622692108,
"rewards/warm_up_reward/mean": 1.078125,
"rewards/warm_up_reward/std": 0.7564428001642227,
"step": 71
},
{
"clip_ratio": 0.0,
"completion_length": 132.94792366027832,
"epoch": 0.75,
"grad_norm": 2.705450393270237,
"kl": 0.0748291015625,
"learning_rate": 1e-06,
"loss": 0.0032,
"num_tokens": 3147441.0,
"reward": 1.2687500715255737,
"reward_std": 0.9319685697555542,
"rewards/warm_up_reward/mean": 1.0572916567325592,
"rewards/warm_up_reward/std": 0.7771977633237839,
"step": 72
},
{
"clip_ratio": 0.0,
"completion_length": 151.91667556762695,
"epoch": 0.7604166666666666,
"grad_norm": 2.254735299608175,
"kl": 0.07171630859375,
"learning_rate": 1e-06,
"loss": 0.0105,
"num_tokens": 3192121.0,
"reward": 1.0045834183692932,
"reward_std": 0.8630332052707672,
"rewards/warm_up_reward/mean": 0.8371527940034866,
"rewards/warm_up_reward/std": 0.7969614416360855,
"step": 73
},
{
"clip_ratio": 0.0,
"completion_length": 140.31250190734863,
"epoch": 0.7708333333333334,
"grad_norm": 2.4523220833578345,
"kl": 0.05816650390625,
"learning_rate": 1e-06,
"loss": 0.0158,
"num_tokens": 3235453.0,
"reward": 1.1837501227855682,
"reward_std": 0.9126418828964233,
"rewards/warm_up_reward/mean": 0.9864583015441895,
"rewards/warm_up_reward/std": 0.7863775044679642,
"step": 74
},
{
"clip_ratio": 0.0,
"completion_length": 141.7395896911621,
"epoch": 0.78125,
"grad_norm": 2.2478331336635513,
"kl": 0.0609130859375,
"learning_rate": 1e-06,
"loss": 0.0284,
"num_tokens": 3278964.0,
"reward": 1.2054167687892914,
"reward_std": 0.9118891954421997,
"rewards/warm_up_reward/mean": 1.0045138746500015,
"rewards/warm_up_reward/std": 0.7829048186540604,
"step": 75
},
{
"clip_ratio": 0.0,
"completion_length": 139.71875381469727,
"epoch": 0.7916666666666666,
"grad_norm": 2.6147718994418203,
"kl": 0.0904541015625,
"learning_rate": 1e-06,
"loss": 0.0143,
"num_tokens": 3322407.0,
"reward": 1.2660417556762695,
"reward_std": 0.9197860509157181,
"rewards/warm_up_reward/mean": 1.0550346970558167,
"rewards/warm_up_reward/std": 0.7622723281383514,
"step": 76
},
{
"clip_ratio": 0.0,
"completion_length": 141.53125762939453,
"epoch": 0.8020833333333334,
"grad_norm": 2.530432648354832,
"kl": 0.093505859375,
"learning_rate": 1e-06,
"loss": -0.0104,
"num_tokens": 3366036.0,
"reward": 1.1020834147930145,
"reward_std": 0.902558371424675,
"rewards/warm_up_reward/mean": 0.9184028059244156,
"rewards/warm_up_reward/std": 0.7975014746189117,
"step": 77
},
{
"clip_ratio": 0.0,
"completion_length": 146.0416717529297,
"epoch": 0.8125,
"grad_norm": 2.2386503327740015,
"kl": 0.0811767578125,
"learning_rate": 1e-06,
"loss": 0.0184,
"num_tokens": 3410026.0,
"reward": 1.3406251072883606,
"reward_std": 0.8862900286912918,
"rewards/warm_up_reward/mean": 1.1171874850988388,
"rewards/warm_up_reward/std": 0.7199237793684006,
"step": 78
},
{
"clip_ratio": 0.0,
"completion_length": 133.6041717529297,
"epoch": 0.8229166666666666,
"grad_norm": 2.9736551256376744,
"kl": 0.0992431640625,
"learning_rate": 1e-06,
"loss": 0.0029,
"num_tokens": 3452750.0,
"reward": 1.3556251227855682,
"reward_std": 0.8310635536909103,
"rewards/warm_up_reward/mean": 1.129687488079071,
"rewards/warm_up_reward/std": 0.7288718819618225,
"step": 79
},
{
"clip_ratio": 0.0,
"completion_length": 139.45834350585938,
"epoch": 0.8333333333333334,
"grad_norm": 4.081234898460602,
"kl": 0.15234375,
"learning_rate": 1e-06,
"loss": -0.0154,
"num_tokens": 3496018.0,
"reward": 1.0906250923871994,
"reward_std": 0.8768025040626526,
"rewards/warm_up_reward/mean": 0.9088541716337204,
"rewards/warm_up_reward/std": 0.8029628545045853,
"step": 80
}
],
"logging_steps": 1.0,
"max_steps": 96,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 16,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 6,
"trial_name": null,
"trial_params": null
}