File size: 23,357 Bytes
4f94f6f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
{
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 0.20048019207683074,
  "eval_steps": 500,
  "global_step": 167,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.012500000186264515,
      "completions/max_length": 2954.7,
      "completions/max_terminated_length": 1951.6,
      "completions/mean_length": 545.102099609375,
      "completions/mean_terminated_length": 500.0404846191406,
      "completions/min_length": 117.8,
      "completions/min_terminated_length": 117.8,
      "entropy": 0.1633674878627062,
      "epoch": 0.012004801920768308,
      "frac_reward_zero_std": 0.05000000149011612,
      "grad_norm": 0.0819886103272438,
      "learning_rate": 5.389221556886228e-07,
      "loss": 0.0538,
      "num_tokens": 1060997.0,
      "reward": -0.349642014503479,
      "reward_std": 0.18912948295474052,
      "rewards/grpo_reward_function/mean": -0.3496420085430145,
      "rewards/grpo_reward_function/std": 0.4486300081014633,
      "sampling/importance_sampling_ratio/max": 2.3219674229621887,
      "sampling/importance_sampling_ratio/mean": 0.3698740124702454,
      "sampling/importance_sampling_ratio/min": 1.1996005980563495e-06,
      "sampling/sampling_logp_difference/max": 2.5826863408088685,
      "sampling/sampling_logp_difference/mean": 0.019079525023698807,
      "step": 10,
      "step_time": 591.6221411965787
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01250000037252903,
      "completions/max_length": 2285.6,
      "completions/max_terminated_length": 1834.2,
      "completions/mean_length": 525.1979339599609,
      "completions/mean_terminated_length": 481.30433044433596,
      "completions/min_length": 111.4,
      "completions/min_terminated_length": 111.4,
      "entropy": 0.15604820642620326,
      "epoch": 0.024009603841536616,
      "frac_reward_zero_std": 0.0416666679084301,
      "grad_norm": 0.09551126509904861,
      "learning_rate": 1.1377245508982037e-06,
      "loss": -0.0139,
      "num_tokens": 2123212.0,
      "reward": -0.298923921585083,
      "reward_std": 0.21090517602860928,
      "rewards/grpo_reward_function/mean": -0.2989239178597927,
      "rewards/grpo_reward_function/std": 0.4665490254759789,
      "sampling/importance_sampling_ratio/max": 2.135484504699707,
      "sampling/importance_sampling_ratio/mean": 0.40152732133865354,
      "sampling/importance_sampling_ratio/min": 2.7301041336613706e-05,
      "sampling/sampling_logp_difference/max": 2.5806180000305177,
      "sampling/sampling_logp_difference/mean": 0.019163084402680396,
      "step": 20,
      "step_time": 554.5896356501617
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01666666716337204,
      "completions/max_length": 3269.3,
      "completions/max_terminated_length": 2072.6,
      "completions/mean_length": 556.327099609375,
      "completions/mean_terminated_length": 496.14097595214844,
      "completions/min_length": 132.3,
      "completions/min_terminated_length": 132.3,
      "entropy": 0.18481314480304717,
      "epoch": 0.03601440576230492,
      "frac_reward_zero_std": 0.05000000074505806,
      "grad_norm": 0.11139781028032303,
      "learning_rate": 1.7365269461077847e-06,
      "loss": 0.0256,
      "num_tokens": 3227189.0,
      "reward": -0.409818297624588,
      "reward_std": 0.22780176997184753,
      "rewards/grpo_reward_function/mean": -0.4098182961344719,
      "rewards/grpo_reward_function/std": 0.5334153980016708,
      "sampling/importance_sampling_ratio/max": 2.277985179424286,
      "sampling/importance_sampling_ratio/mean": 0.3370798110961914,
      "sampling/importance_sampling_ratio/min": 2.6906073216806556e-05,
      "sampling/sampling_logp_difference/max": 2.5000483632087707,
      "sampling/sampling_logp_difference/mean": 0.02049510907381773,
      "step": 30,
      "step_time": 555.5971007851883
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01041666679084301,
      "completions/max_length": 2135.6,
      "completions/max_terminated_length": 1478.5,
      "completions/mean_length": 508.5041778564453,
      "completions/mean_terminated_length": 472.1120574951172,
      "completions/min_length": 147.4,
      "completions/min_terminated_length": 147.4,
      "entropy": 0.1670758031308651,
      "epoch": 0.04801920768307323,
      "frac_reward_zero_std": 0.0416666679084301,
      "grad_norm": 0.06704321503639221,
      "learning_rate": 2.3353293413173654e-06,
      "loss": -0.0127,
      "num_tokens": 4318559.0,
      "reward": -0.2258751168847084,
      "reward_std": 0.16097248084843158,
      "rewards/grpo_reward_function/mean": -0.22587510757148266,
      "rewards/grpo_reward_function/std": 0.49552616477012634,
      "sampling/importance_sampling_ratio/max": 2.3126969814300535,
      "sampling/importance_sampling_ratio/mean": 0.35644740611314774,
      "sampling/importance_sampling_ratio/min": 2.7391963689638034e-05,
      "sampling/sampling_logp_difference/max": 2.8060175657272337,
      "sampling/sampling_logp_difference/mean": 0.01994446888566017,
      "step": 40,
      "step_time": 554.565231207572
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00416666679084301,
      "completions/max_length": 1859.8,
      "completions/max_terminated_length": 1757.3,
      "completions/mean_length": 513.3979309082031,
      "completions/mean_terminated_length": 499.66423950195315,
      "completions/min_length": 117.0,
      "completions/min_terminated_length": 117.0,
      "entropy": 0.17271990440785884,
      "epoch": 0.060024009603841535,
      "frac_reward_zero_std": 0.06666666865348816,
      "grad_norm": 0.09258268028497696,
      "learning_rate": 2.9341317365269463e-06,
      "loss": 0.0015,
      "num_tokens": 5370018.0,
      "reward": -0.3178896278142929,
      "reward_std": 0.1635244082659483,
      "rewards/grpo_reward_function/mean": -0.3178896352648735,
      "rewards/grpo_reward_function/std": 0.45689679607748984,
      "sampling/importance_sampling_ratio/max": 2.195555794239044,
      "sampling/importance_sampling_ratio/mean": 0.3522403955459595,
      "sampling/importance_sampling_ratio/min": 7.164277021729504e-05,
      "sampling/sampling_logp_difference/max": 2.4301879167556764,
      "sampling/sampling_logp_difference/mean": 0.02071673283353448,
      "step": 50,
      "step_time": 548.9367319711484
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00416666679084301,
      "completions/max_length": 2202.6,
      "completions/max_terminated_length": 1812.6,
      "completions/mean_length": 506.99168395996094,
      "completions/mean_terminated_length": 492.1666778564453,
      "completions/min_length": 127.8,
      "completions/min_terminated_length": 127.8,
      "entropy": 0.160004629381001,
      "epoch": 0.07202881152460984,
      "frac_reward_zero_std": 0.0416666679084301,
      "grad_norm": 0.05922295153141022,
      "learning_rate": 3.5329341317365273e-06,
      "loss": -0.0033,
      "num_tokens": 6466162.0,
      "reward": -0.34021527171134947,
      "reward_std": 0.18639734461903573,
      "rewards/grpo_reward_function/mean": -0.34021526128053664,
      "rewards/grpo_reward_function/std": 0.5187867254018783,
      "sampling/importance_sampling_ratio/max": 2.010967791080475,
      "sampling/importance_sampling_ratio/mean": 0.30313637256622317,
      "sampling/importance_sampling_ratio/min": 3.7146345167826667e-06,
      "sampling/sampling_logp_difference/max": 2.6932833194732666,
      "sampling/sampling_logp_difference/mean": 0.02041825857013464,
      "step": 60,
      "step_time": 530.4414538932033
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.016666666977107523,
      "completions/max_length": 2579.7,
      "completions/max_terminated_length": 2077.7,
      "completions/mean_length": 557.7854309082031,
      "completions/mean_terminated_length": 499.6593933105469,
      "completions/min_length": 121.5,
      "completions/min_terminated_length": 121.5,
      "entropy": 0.16313071362674236,
      "epoch": 0.08403361344537816,
      "frac_reward_zero_std": 0.10000000223517418,
      "grad_norm": 0.014785214327275753,
      "learning_rate": 4.131736526946108e-06,
      "loss": 0.0424,
      "num_tokens": 7608683.0,
      "reward": -0.33393135815858843,
      "reward_std": 0.19106332510709761,
      "rewards/grpo_reward_function/mean": -0.33393134772777555,
      "rewards/grpo_reward_function/std": 0.5677398703992367,
      "sampling/importance_sampling_ratio/max": 1.9730541229248046,
      "sampling/importance_sampling_ratio/mean": 0.3427995890378952,
      "sampling/importance_sampling_ratio/min": 1.0688633483368904e-05,
      "sampling/sampling_logp_difference/max": 2.8619158267974854,
      "sampling/sampling_logp_difference/mean": 0.0201931843534112,
      "step": 70,
      "step_time": 541.6015901661478
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01666666716337204,
      "completions/max_length": 3292.7,
      "completions/max_terminated_length": 2336.3,
      "completions/mean_length": 605.0791870117188,
      "completions/mean_terminated_length": 546.5141571044921,
      "completions/min_length": 142.3,
      "completions/min_terminated_length": 142.3,
      "entropy": 0.16853776723146438,
      "epoch": 0.09603841536614646,
      "frac_reward_zero_std": 0.10000000298023223,
      "grad_norm": 0.0669359341263771,
      "learning_rate": 4.730538922155689e-06,
      "loss": 0.003,
      "num_tokens": 8693089.0,
      "reward": -0.36407424658536913,
      "reward_std": 0.15138040184974672,
      "rewards/grpo_reward_function/mean": -0.364074233174324,
      "rewards/grpo_reward_function/std": 0.4984104484319687,
      "sampling/importance_sampling_ratio/max": 2.417657721042633,
      "sampling/importance_sampling_ratio/mean": 0.3272848010063171,
      "sampling/importance_sampling_ratio/min": 6.19425904005766e-05,
      "sampling/sampling_logp_difference/max": 2.832179582118988,
      "sampling/sampling_logp_difference/mean": 0.019849142245948314,
      "step": 80,
      "step_time": 559.4207322074101
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.018750000558793545,
      "completions/max_length": 3217.2,
      "completions/max_terminated_length": 2202.3,
      "completions/mean_length": 652.0937744140625,
      "completions/mean_terminated_length": 587.0378173828125,
      "completions/min_length": 164.3,
      "completions/min_terminated_length": 164.3,
      "entropy": 0.16202539429068566,
      "epoch": 0.10804321728691477,
      "frac_reward_zero_std": 0.10000000223517418,
      "grad_norm": 0.03904345631599426,
      "learning_rate": 5.32934131736527e-06,
      "loss": -0.0041,
      "num_tokens": 9849890.0,
      "reward": -0.3628114402294159,
      "reward_std": 0.2544385172426701,
      "rewards/grpo_reward_function/mean": -0.3628114327788353,
      "rewards/grpo_reward_function/std": 0.6255016416311264,
      "sampling/importance_sampling_ratio/max": 1.7914996325969696,
      "sampling/importance_sampling_ratio/mean": 0.3064177379012108,
      "sampling/importance_sampling_ratio/min": 1.5591655392199753e-05,
      "sampling/sampling_logp_difference/max": 2.840235471725464,
      "sampling/sampling_logp_difference/mean": 0.01814730800688267,
      "step": 90,
      "step_time": 555.9057860235683
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.010416666977107525,
      "completions/max_length": 2567.4,
      "completions/max_terminated_length": 2092.1,
      "completions/mean_length": 548.7250213623047,
      "completions/mean_terminated_length": 511.10838317871094,
      "completions/min_length": 116.3,
      "completions/min_terminated_length": 116.3,
      "entropy": 0.16799122765660285,
      "epoch": 0.12004801920768307,
      "frac_reward_zero_std": 0.10000000223517418,
      "grad_norm": 0.07849112898111343,
      "learning_rate": 5.928143712574851e-06,
      "loss": -0.0009,
      "num_tokens": 10951862.0,
      "reward": -0.37651871144771576,
      "reward_std": 0.17827629819512367,
      "rewards/grpo_reward_function/mean": -0.3765186980366707,
      "rewards/grpo_reward_function/std": 0.4988637834787369,
      "sampling/importance_sampling_ratio/max": 2.2177215456962585,
      "sampling/importance_sampling_ratio/mean": 0.3616850808262825,
      "sampling/importance_sampling_ratio/min": 0.0,
      "sampling/sampling_logp_difference/max": 2.7038833022117617,
      "sampling/sampling_logp_difference/mean": 0.019389390759170056,
      "step": 100,
      "step_time": 539.0174913492053
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.012500000186264515,
      "completions/max_length": 2678.3,
      "completions/max_terminated_length": 1888.8,
      "completions/mean_length": 535.3271057128907,
      "completions/mean_terminated_length": 491.43089904785154,
      "completions/min_length": 127.5,
      "completions/min_terminated_length": 127.5,
      "entropy": 0.1744688918814063,
      "epoch": 0.13205282112845138,
      "frac_reward_zero_std": 0.0833333358168602,
      "grad_norm": 0.043163955211639404,
      "learning_rate": 6.526946107784432e-06,
      "loss": -0.0173,
      "num_tokens": 12060171.0,
      "reward": -0.2666824638843536,
      "reward_std": 0.12587157338857652,
      "rewards/grpo_reward_function/mean": -0.26668245121836665,
      "rewards/grpo_reward_function/std": 0.40698017328977587,
      "sampling/importance_sampling_ratio/max": 1.899654006958008,
      "sampling/importance_sampling_ratio/mean": 0.37773958817124365,
      "sampling/importance_sampling_ratio/min": 3.46376573256979e-14,
      "sampling/sampling_logp_difference/max": 2.272650396823883,
      "sampling/sampling_logp_difference/mean": 0.01878545032814145,
      "step": 110,
      "step_time": 554.1334780954755
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01666666716337204,
      "completions/max_length": 3077.6,
      "completions/max_terminated_length": 1915.6,
      "completions/mean_length": 565.0541870117188,
      "completions/mean_terminated_length": 504.5126983642578,
      "completions/min_length": 138.5,
      "completions/min_terminated_length": 138.5,
      "entropy": 0.17697170842438936,
      "epoch": 0.14405762304921968,
      "frac_reward_zero_std": 0.0833333358168602,
      "grad_norm": 0.07391675561666489,
      "learning_rate": 7.125748502994012e-06,
      "loss": 0.042,
      "num_tokens": 13168921.0,
      "reward": -0.3262443482875824,
      "reward_std": 0.20909521877765655,
      "rewards/grpo_reward_function/mean": -0.3262443423271179,
      "rewards/grpo_reward_function/std": 0.5003126785159111,
      "sampling/importance_sampling_ratio/max": 2.301289737224579,
      "sampling/importance_sampling_ratio/mean": 0.38998747766017916,
      "sampling/importance_sampling_ratio/min": 7.411608444201079e-05,
      "sampling/sampling_logp_difference/max": 2.7557363152503966,
      "sampling/sampling_logp_difference/mean": 0.01885297931730747,
      "step": 120,
      "step_time": 549.9578140962869
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.00416666679084301,
      "completions/max_length": 2290.1,
      "completions/max_terminated_length": 1886.2,
      "completions/mean_length": 499.789599609375,
      "completions/mean_terminated_length": 484.77695617675784,
      "completions/min_length": 127.0,
      "completions/min_terminated_length": 127.0,
      "entropy": 0.19473983831703662,
      "epoch": 0.15606242496998798,
      "frac_reward_zero_std": 0.03333333432674408,
      "grad_norm": 0.06443756073713303,
      "learning_rate": 7.724550898203594e-06,
      "loss": -0.0203,
      "num_tokens": 14212700.0,
      "reward": -0.26288305670022966,
      "reward_std": 0.1738448791205883,
      "rewards/grpo_reward_function/mean": -0.26288305073976515,
      "rewards/grpo_reward_function/std": 0.48268924951553344,
      "sampling/importance_sampling_ratio/max": 2.1470563650131225,
      "sampling/importance_sampling_ratio/mean": 0.35831653475761416,
      "sampling/importance_sampling_ratio/min": 1.968140890369341e-05,
      "sampling/sampling_logp_difference/max": 2.0650948524475097,
      "sampling/sampling_logp_difference/mean": 0.01968124657869339,
      "step": 130,
      "step_time": 535.0768479405903
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 0.0,
      "completions/clipped_ratio": 0.01250000037252903,
      "completions/max_length": 3015.9,
      "completions/max_terminated_length": 1749.5,
      "completions/mean_length": 492.37709045410156,
      "completions/mean_terminated_length": 446.93010864257815,
      "completions/min_length": 114.7,
      "completions/min_terminated_length": 114.7,
      "entropy": 0.21128173358738422,
      "epoch": 0.16806722689075632,
      "frac_reward_zero_std": 0.05833333432674408,
      "grad_norm": 0.09337731450796127,
      "learning_rate": 8.323353293413174e-06,
      "loss": 0.0277,
      "num_tokens": 15293077.0,
      "reward": -0.26287811398506167,
      "reward_std": 0.14675465896725653,
      "rewards/grpo_reward_function/mean": -0.26287810802459716,
      "rewards/grpo_reward_function/std": 0.37649901360273363,
      "sampling/importance_sampling_ratio/max": 2.3814776659011843,
      "sampling/importance_sampling_ratio/mean": 0.46577124297618866,
      "sampling/importance_sampling_ratio/min": 1.3427511260960534e-06,
      "sampling/sampling_logp_difference/max": 2.155888545513153,
      "sampling/sampling_logp_difference/mean": 0.019245322328060865,
      "step": 140,
      "step_time": 552.9976656335406
    },
    {
      "clip_ratio/high_max": 0.0,
      "clip_ratio/high_mean": 0.0,
      "clip_ratio/low_mean": 4.9971032422035935e-05,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 4.9971032422035935e-05,
      "completions/clipped_ratio": 0.01041666679084301,
      "completions/max_length": 2447.6,
      "completions/max_terminated_length": 1785.5,
      "completions/mean_length": 552.295849609375,
      "completions/mean_terminated_length": 515.5673645019531,
      "completions/min_length": 144.7,
      "completions/min_terminated_length": 144.7,
      "entropy": 0.2773955374956131,
      "epoch": 0.18007202881152462,
      "frac_reward_zero_std": 0.02500000074505806,
      "grad_norm": 0.1007687970995903,
      "learning_rate": 8.922155688622756e-06,
      "loss": -0.0141,
      "num_tokens": 16425531.0,
      "reward": -0.26935882605612277,
      "reward_std": 0.12829533144831656,
      "rewards/grpo_reward_function/mean": -0.2693588202819228,
      "rewards/grpo_reward_function/std": 0.33063299730420115,
      "sampling/importance_sampling_ratio/max": 2.1382891178131103,
      "sampling/importance_sampling_ratio/mean": 0.4445547193288803,
      "sampling/importance_sampling_ratio/min": 7.412413807410812e-05,
      "sampling/sampling_logp_difference/max": 1.7894923090934753,
      "sampling/sampling_logp_difference/mean": 0.019091704115271568,
      "step": 150,
      "step_time": 547.844954107888
    },
    {
      "clip_ratio/high_max": 4.80769231216982e-05,
      "clip_ratio/high_mean": 8.012820762814954e-06,
      "clip_ratio/low_mean": 0.0,
      "clip_ratio/low_min": 0.0,
      "clip_ratio/region_mean": 8.012820762814954e-06,
      "completions/clipped_ratio": 0.010416666977107525,
      "completions/max_length": 2388.9,
      "completions/max_terminated_length": 1982.4,
      "completions/mean_length": 544.4896026611328,
      "completions/mean_terminated_length": 507.44931945800784,
      "completions/min_length": 122.8,
      "completions/min_terminated_length": 122.8,
      "entropy": 0.3212181769311428,
      "epoch": 0.19207683073229292,
      "frac_reward_zero_std": 0.06666666865348816,
      "grad_norm": 0.06833557039499283,
      "learning_rate": 9.520958083832336e-06,
      "loss": 0.0134,
      "num_tokens": 17564686.0,
      "reward": -0.28017824441194533,
      "reward_std": 0.20728585943579675,
      "rewards/grpo_reward_function/mean": -0.28017824441194533,
      "rewards/grpo_reward_function/std": 0.5502792000770569,
      "sampling/importance_sampling_ratio/max": 2.1408735513687134,
      "sampling/importance_sampling_ratio/mean": 0.5016659319400787,
      "sampling/importance_sampling_ratio/min": 0.000778414961314411,
      "sampling/sampling_logp_difference/max": 1.89249027967453,
      "sampling/sampling_logp_difference/mean": 0.01902961954474449,
      "step": 160,
      "step_time": 536.2867619435303
    }
  ],
  "logging_steps": 10,
  "max_steps": 833,
  "num_input_tokens_seen": 18381921,
  "num_train_epochs": 1,
  "save_steps": 167,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 0.0,
  "train_batch_size": 2,
  "trial_name": null,
  "trial_params": null
}