ChiefTheLord commited on
Commit
e585be0
·
verified ·
1 Parent(s): c335738

Delete checkpoints-v3.0-discrete-conditional/checkpoint-7936

Browse files
checkpoints-v3.0-discrete-conditional/checkpoint-7936/model.safetensors DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e887f1b6e879d16aeeb945ac33cb90023f09063ee9794f96a646dfddd37a7cad
3
- size 24272320
 
 
 
 
checkpoints-v3.0-discrete-conditional/checkpoint-7936/optimizer.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:128a1fddbb7b77e2d5768ecdbfd209a56dc26dc9c1c229df5be3fb2e35853375
3
- size 519947
 
 
 
 
checkpoints-v3.0-discrete-conditional/checkpoint-7936/rng_state.pth DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a0b4c40d315b7b8d9f17bcfcf4edb1280a4a07504377c1268d285373fb8811ca
3
- size 14645
 
 
 
 
checkpoints-v3.0-discrete-conditional/checkpoint-7936/scaler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1dd6dea7a37283e26a2e3ea2e85617aaaf31092e6ed791a6d2f45d466c879233
3
- size 1383
 
 
 
 
checkpoints-v3.0-discrete-conditional/checkpoint-7936/scheduler.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6e1898551e4594b5be5f763f4c93d36a51e8cefc331809d6bf24c41b291641ab
3
- size 1465
 
 
 
 
checkpoints-v3.0-discrete-conditional/checkpoint-7936/trainer_state.json DELETED
@@ -1,995 +0,0 @@
1
- {
2
- "best_global_step": null,
3
- "best_metric": null,
4
- "best_model_checkpoint": null,
5
- "epoch": 3.9190123456790125,
6
- "eval_steps": 256,
7
- "global_step": 7936,
8
- "is_hyper_param_search": false,
9
- "is_local_process_zero": true,
10
- "is_world_process_zero": true,
11
- "log_history": [
12
- {
13
- "epoch": 0.06320987654320988,
14
- "grad_norm": 0.23172101378440857,
15
- "learning_rate": 0.000248046875,
16
- "loss": 0.7997208833694458,
17
- "step": 128
18
- },
19
- {
20
- "epoch": 0.12641975308641976,
21
- "grad_norm": 0.21966563165187836,
22
- "learning_rate": 0.000498046875,
23
- "loss": 0.7981692552566528,
24
- "step": 256
25
- },
26
- {
27
- "epoch": 0.12641975308641976,
28
- "eval_cos_loss": 0.2296824600638413,
29
- "eval_loss": 0.7970464796554751,
30
- "eval_mse_loss": 0.7740782310323018,
31
- "step": 256
32
- },
33
- {
34
- "epoch": 0.12641975308641976,
35
- "eval_cos_loss": 0.2296824600638413,
36
- "eval_loss": 0.7970464796554751,
37
- "eval_mse_loss": 0.7740782310323018,
38
- "eval_runtime": 3.972,
39
- "eval_samples_per_second": 659.115,
40
- "eval_steps_per_second": 10.322,
41
- "step": 256
42
- },
43
- {
44
- "epoch": 0.18962962962962962,
45
- "grad_norm": 0.2943345010280609,
46
- "learning_rate": 0.000748046875,
47
- "loss": 0.7972801327705383,
48
- "step": 384
49
- },
50
- {
51
- "epoch": 0.2528395061728395,
52
- "grad_norm": 0.2626311182975769,
53
- "learning_rate": 0.000998046875,
54
- "loss": 0.7905202507972717,
55
- "step": 512
56
- },
57
- {
58
- "epoch": 0.2528395061728395,
59
- "eval_cos_loss": 0.22880334061820332,
60
- "eval_loss": 0.7939348089985732,
61
- "eval_mse_loss": 0.7710544714113561,
62
- "step": 512
63
- },
64
- {
65
- "epoch": 0.2528395061728395,
66
- "eval_cos_loss": 0.22880334061820332,
67
- "eval_loss": 0.7939348089985732,
68
- "eval_mse_loss": 0.7710544714113561,
69
- "eval_runtime": 3.8814,
70
- "eval_samples_per_second": 674.492,
71
- "eval_steps_per_second": 10.563,
72
- "step": 512
73
- },
74
- {
75
- "epoch": 0.3160493827160494,
76
- "grad_norm": 0.22872641682624817,
77
- "learning_rate": 0.0009993089770195807,
78
- "loss": 0.8048543334007263,
79
- "step": 640
80
- },
81
- {
82
- "epoch": 0.37925925925925924,
83
- "grad_norm": 0.25428205728530884,
84
- "learning_rate": 0.0009972160460972733,
85
- "loss": 0.797626793384552,
86
- "step": 768
87
- },
88
- {
89
- "epoch": 0.37925925925925924,
90
- "eval_cos_loss": 0.23112982111733135,
91
- "eval_loss": 0.8032825109435291,
92
- "eval_mse_loss": 0.7801695291588946,
93
- "step": 768
94
- },
95
- {
96
- "epoch": 0.37925925925925924,
97
- "eval_cos_loss": 0.23112982111733135,
98
- "eval_loss": 0.8032825109435291,
99
- "eval_mse_loss": 0.7801695291588946,
100
- "eval_runtime": 3.0002,
101
- "eval_samples_per_second": 872.605,
102
- "eval_steps_per_second": 13.666,
103
- "step": 768
104
- },
105
- {
106
- "epoch": 0.44246913580246916,
107
- "grad_norm": 0.28274211287498474,
108
- "learning_rate": 0.0009937270408736224,
109
- "loss": 0.793694794178009,
110
- "step": 896
111
- },
112
- {
113
- "epoch": 0.505679012345679,
114
- "grad_norm": 0.24109429121017456,
115
- "learning_rate": 0.0009888517577149526,
116
- "loss": 0.7927389740943909,
117
- "step": 1024
118
- },
119
- {
120
- "epoch": 0.505679012345679,
121
- "eval_cos_loss": 0.22725565804214012,
122
- "eval_loss": 0.7899522345240523,
123
- "eval_mse_loss": 0.7672266640314241,
124
- "step": 1024
125
- },
126
- {
127
- "epoch": 0.505679012345679,
128
- "eval_cos_loss": 0.22725565804214012,
129
- "eval_loss": 0.7899522345240523,
130
- "eval_mse_loss": 0.7672266640314241,
131
- "eval_runtime": 2.9714,
132
- "eval_samples_per_second": 881.052,
133
- "eval_steps_per_second": 13.798,
134
- "step": 1024
135
- },
136
- {
137
- "epoch": 0.5688888888888889,
138
- "grad_norm": 0.20911817252635956,
139
- "learning_rate": 0.0009826038853539248,
140
- "loss": 0.7938523292541504,
141
- "step": 1152
142
- },
143
- {
144
- "epoch": 0.6320987654320988,
145
- "grad_norm": 0.4241364896297455,
146
- "learning_rate": 0.0009750009664545572,
147
- "loss": 0.7987790107727051,
148
- "step": 1280
149
- },
150
- {
151
- "epoch": 0.6320987654320988,
152
- "eval_cos_loss": 0.22850282671974925,
153
- "eval_loss": 0.7918573341718534,
154
- "eval_mse_loss": 0.7690070562246369,
155
- "step": 1280
156
- },
157
- {
158
- "epoch": 0.6320987654320988,
159
- "eval_cos_loss": 0.22850282671974925,
160
- "eval_loss": 0.7918573341718534,
161
- "eval_mse_loss": 0.7690070562246369,
162
- "eval_runtime": 2.9523,
163
- "eval_samples_per_second": 886.781,
164
- "eval_steps_per_second": 13.888,
165
- "step": 1280
166
- },
167
- {
168
- "epoch": 0.6953086419753086,
169
- "grad_norm": 0.1920246183872223,
170
- "learning_rate": 0.0009660643483562486,
171
- "loss": 0.7907214760780334,
172
- "step": 1408
173
- },
174
- {
175
- "epoch": 0.7585185185185185,
176
- "grad_norm": 0.2991013526916504,
177
- "learning_rate": 0.0009558191231351013,
178
- "loss": 0.795460045337677,
179
- "step": 1536
180
- },
181
- {
182
- "epoch": 0.7585185185185185,
183
- "eval_cos_loss": 0.22320823516787552,
184
- "eval_loss": 0.780292726144558,
185
- "eval_mse_loss": 0.7579719002653913,
186
- "step": 1536
187
- },
188
- {
189
- "epoch": 0.7585185185185185,
190
- "eval_cos_loss": 0.22320823516787552,
191
- "eval_loss": 0.780292726144558,
192
- "eval_mse_loss": 0.7579719002653913,
193
- "eval_runtime": 2.8726,
194
- "eval_samples_per_second": 911.377,
195
- "eval_steps_per_second": 14.273,
196
- "step": 1536
197
- },
198
- {
199
- "epoch": 0.8217283950617283,
200
- "grad_norm": 0.2981093227863312,
201
- "learning_rate": 0.0009442940571508399,
202
- "loss": 0.7922792434692383,
203
- "step": 1664
204
- },
205
- {
206
- "epoch": 0.8849382716049383,
207
- "grad_norm": 0.3410004675388336,
208
- "learning_rate": 0.0009315215102771411,
209
- "loss": 0.7970292568206787,
210
- "step": 1792
211
- },
212
- {
213
- "epoch": 0.8849382716049383,
214
- "eval_cos_loss": 0.2277334343369414,
215
- "eval_loss": 0.7910735476307753,
216
- "eval_mse_loss": 0.7683002018347019,
217
- "step": 1792
218
- },
219
- {
220
- "epoch": 0.8849382716049383,
221
- "eval_cos_loss": 0.2277334343369414,
222
- "eval_loss": 0.7910735476307753,
223
- "eval_mse_loss": 0.7683002018347019,
224
- "eval_runtime": 2.8417,
225
- "eval_samples_per_second": 921.294,
226
- "eval_steps_per_second": 14.428,
227
- "step": 1792
228
- },
229
- {
230
- "epoch": 0.9481481481481482,
231
- "grad_norm": 0.21089138090610504,
232
- "learning_rate": 0.0009175373450421618,
233
- "loss": 0.791707456111908,
234
- "step": 1920
235
- },
236
- {
237
- "epoch": 1.011358024691358,
238
- "grad_norm": 0.28280070424079895,
239
- "learning_rate": 0.0009023808259343743,
240
- "loss": 0.7900460958480835,
241
- "step": 2048
242
- },
243
- {
244
- "epoch": 1.011358024691358,
245
- "eval_cos_loss": 0.22489730068823185,
246
- "eval_loss": 0.7806248490403338,
247
- "eval_mse_loss": 0.7581351210431355,
248
- "step": 2048
249
- },
250
- {
251
- "epoch": 1.011358024691358,
252
- "eval_cos_loss": 0.22489730068823185,
253
- "eval_loss": 0.7806248490403338,
254
- "eval_mse_loss": 0.7581351210431355,
255
- "eval_runtime": 2.9074,
256
- "eval_samples_per_second": 900.46,
257
- "eval_steps_per_second": 14.102,
258
- "step": 2048
259
- },
260
- {
261
- "epoch": 1.074567901234568,
262
- "grad_norm": 0.2784762382507324,
263
- "learning_rate": 0.0008860945091564399,
264
- "loss": 0.79193115234375,
265
- "step": 2176
266
- },
267
- {
268
- "epoch": 1.1377777777777778,
269
- "grad_norm": 0.1766098588705063,
270
- "learning_rate": 0.0008687241231366662,
271
- "loss": 0.7897006273269653,
272
- "step": 2304
273
- },
274
- {
275
- "epoch": 1.1377777777777778,
276
- "eval_cos_loss": 0.2275779985073136,
277
- "eval_loss": 0.7958266517011131,
278
- "eval_mse_loss": 0.7730688496333796,
279
- "step": 2304
280
- },
281
- {
282
- "epoch": 1.1377777777777778,
283
- "eval_cos_loss": 0.2275779985073136,
284
- "eval_loss": 0.7958266517011131,
285
- "eval_mse_loss": 0.7730688496333796,
286
- "eval_runtime": 2.8954,
287
- "eval_samples_per_second": 904.193,
288
- "eval_steps_per_second": 14.16,
289
- "step": 2304
290
- },
291
- {
292
- "epoch": 1.2009876543209876,
293
- "grad_norm": 0.19487471878528595,
294
- "learning_rate": 0.0008503184401335448,
295
- "loss": 0.7908310294151306,
296
- "step": 2432
297
- },
298
- {
299
- "epoch": 1.2641975308641975,
300
- "grad_norm": 0.2605104446411133,
301
- "learning_rate": 0.0008309291392938795,
302
- "loss": 0.7922152280807495,
303
- "step": 2560
304
- },
305
- {
306
- "epoch": 1.2641975308641975,
307
- "eval_cos_loss": 0.22808007823257911,
308
- "eval_loss": 0.7948527496035506,
309
- "eval_mse_loss": 0.7720447400721108,
310
- "step": 2560
311
- },
312
- {
313
- "epoch": 1.2641975308641975,
314
- "eval_cos_loss": 0.22808007823257911,
315
- "eval_loss": 0.7948527496035506,
316
- "eval_mse_loss": 0.7720447400721108,
317
- "eval_runtime": 3.0225,
318
- "eval_samples_per_second": 866.174,
319
- "eval_steps_per_second": 13.565,
320
- "step": 2560
321
- },
322
- {
323
- "epoch": 1.3274074074074074,
324
- "grad_norm": 0.18549929559230804,
325
- "learning_rate": 0.0008106106615490032,
326
- "loss": 0.7880870699882507,
327
- "step": 2688
328
- },
329
- {
330
- "epoch": 1.3906172839506172,
331
- "grad_norm": 0.19430062174797058,
332
- "learning_rate": 0.0007894200567565075,
333
- "loss": 0.7790378332138062,
334
- "step": 2816
335
- },
336
- {
337
- "epoch": 1.3906172839506172,
338
- "eval_cos_loss": 0.22629769473541073,
339
- "eval_loss": 0.7880184243365032,
340
- "eval_mse_loss": 0.765388655953291,
341
- "step": 2816
342
- },
343
- {
344
- "epoch": 1.3906172839506172,
345
- "eval_cos_loss": 0.22629769473541073,
346
- "eval_loss": 0.7880184243365032,
347
- "eval_mse_loss": 0.765388655953291,
348
- "eval_runtime": 2.9958,
349
- "eval_samples_per_second": 873.889,
350
- "eval_steps_per_second": 13.686,
351
- "step": 2816
352
- },
353
- {
354
- "epoch": 1.453827160493827,
355
- "grad_norm": 0.193603977560997,
356
- "learning_rate": 0.0007674168235166747,
357
- "loss": 0.7834969758987427,
358
- "step": 2944
359
- },
360
- {
361
- "epoch": 1.5170370370370372,
362
- "grad_norm": 0.25885820388793945,
363
- "learning_rate": 0.000744662742113375,
364
- "loss": 0.7839168906211853,
365
- "step": 3072
366
- },
367
- {
368
- "epoch": 1.5170370370370372,
369
- "eval_cos_loss": 0.22938175281373466,
370
- "eval_loss": 0.7993329240054619,
371
- "eval_mse_loss": 0.7763947539213227,
372
- "step": 3072
373
- },
374
- {
375
- "epoch": 1.5170370370370372,
376
- "eval_cos_loss": 0.22938175281373466,
377
- "eval_loss": 0.7993329240054619,
378
- "eval_mse_loss": 0.7763947539213227,
379
- "eval_runtime": 3.8739,
380
- "eval_samples_per_second": 675.802,
381
- "eval_steps_per_second": 10.584,
382
- "step": 3072
383
- },
384
- {
385
- "epoch": 1.5802469135802468,
386
- "grad_norm": 0.31186336278915405,
387
- "learning_rate": 0.0007212217010484917,
388
- "loss": 0.7857620716094971,
389
- "step": 3200
390
- },
391
- {
392
- "epoch": 1.643456790123457,
393
- "grad_norm": 0.2609359323978424,
394
- "learning_rate": 0.0006971595176569332,
395
- "loss": 0.7916860580444336,
396
- "step": 3328
397
- },
398
- {
399
- "epoch": 1.643456790123457,
400
- "eval_cos_loss": 0.22667416457722828,
401
- "eval_loss": 0.7874437686873645,
402
- "eval_mse_loss": 0.7647763519752316,
403
- "step": 3328
404
- },
405
- {
406
- "epoch": 1.643456790123457,
407
- "eval_cos_loss": 0.22667416457722828,
408
- "eval_loss": 0.7874437686873645,
409
- "eval_mse_loss": 0.7647763519752316,
410
- "eval_runtime": 3.9867,
411
- "eval_samples_per_second": 656.685,
412
- "eval_steps_per_second": 10.284,
413
- "step": 3328
414
- },
415
- {
416
- "epoch": 1.7066666666666666,
417
- "grad_norm": 0.2544788718223572,
418
- "learning_rate": 0.0006725437533058971,
419
- "loss": 0.7838412523269653,
420
- "step": 3456
421
- },
422
- {
423
- "epoch": 1.7698765432098766,
424
- "grad_norm": 0.14100751280784607,
425
- "learning_rate": 0.0006474435236972767,
426
- "loss": 0.7874654531478882,
427
- "step": 3584
428
- },
429
- {
430
- "epoch": 1.7698765432098766,
431
- "eval_cos_loss": 0.22428179041641513,
432
- "eval_loss": 0.7828184845970898,
433
- "eval_mse_loss": 0.760390307845139,
434
- "step": 3584
435
- },
436
- {
437
- "epoch": 1.7698765432098766,
438
- "eval_cos_loss": 0.22428179041641513,
439
- "eval_loss": 0.7828184845970898,
440
- "eval_mse_loss": 0.760390307845139,
441
- "eval_runtime": 3.4413,
442
- "eval_samples_per_second": 760.764,
443
- "eval_steps_per_second": 11.914,
444
- "step": 3584
445
- },
446
- {
447
- "epoch": 1.8330864197530863,
448
- "grad_norm": 0.19871363043785095,
449
- "learning_rate": 0.0006219293048058301,
450
- "loss": 0.7843989729881287,
451
- "step": 3712
452
- },
453
- {
454
- "epoch": 1.8962962962962964,
455
- "grad_norm": 0.1531391590833664,
456
- "learning_rate": 0.0005960727349980042,
457
- "loss": 0.7820008993148804,
458
- "step": 3840
459
- },
460
- {
461
- "epoch": 1.8962962962962964,
462
- "eval_cos_loss": 0.2254106565946486,
463
- "eval_loss": 0.7854831189644046,
464
- "eval_mse_loss": 0.7629420510152491,
465
- "step": 3840
466
- },
467
- {
468
- "epoch": 1.8962962962962964,
469
- "eval_cos_loss": 0.2254106565946486,
470
- "eval_loss": 0.7854831189644046,
471
- "eval_mse_loss": 0.7629420510152491,
472
- "eval_runtime": 3.0265,
473
- "eval_samples_per_second": 865.028,
474
- "eval_steps_per_second": 13.547,
475
- "step": 3840
476
- },
477
- {
478
- "epoch": 1.959506172839506,
479
- "grad_norm": 0.30385342240333557,
480
- "learning_rate": 0.0005699464138870106,
481
- "loss": 0.7758321762084961,
482
- "step": 3968
483
- },
484
- {
485
- "epoch": 2.022716049382716,
486
- "grad_norm": 0.3034447133541107,
487
- "learning_rate": 0.0005436236984889356,
488
- "loss": 0.7818131446838379,
489
- "step": 4096
490
- },
491
- {
492
- "epoch": 2.022716049382716,
493
- "eval_cos_loss": 0.22541342421275815,
494
- "eval_loss": 0.7809779222418622,
495
- "eval_mse_loss": 0.758436579529832,
496
- "step": 4096
497
- },
498
- {
499
- "epoch": 2.022716049382716,
500
- "eval_cos_loss": 0.22541342421275815,
501
- "eval_loss": 0.7809779222418622,
502
- "eval_mse_loss": 0.758436579529832,
503
- "eval_runtime": 3.01,
504
- "eval_samples_per_second": 869.76,
505
- "eval_steps_per_second": 13.621,
506
- "step": 4096
507
- },
508
- {
509
- "epoch": 2.0859259259259257,
510
- "grad_norm": 0.1902398020029068,
511
- "learning_rate": 0.0005171784972522235,
512
- "loss": 0.7848505973815918,
513
- "step": 4224
514
- },
515
- {
516
- "epoch": 2.149135802469136,
517
- "grad_norm": 0.18901203572750092,
518
- "learning_rate": 0.0004906850625388614,
519
- "loss": 0.7853664755821228,
520
- "step": 4352
521
- },
522
- {
523
- "epoch": 2.149135802469136,
524
- "eval_cos_loss": 0.2236430724946464,
525
- "eval_loss": 0.7772586345672607,
526
- "eval_mse_loss": 0.7548943234653007,
527
- "step": 4352
528
- },
529
- {
530
- "epoch": 2.149135802469136,
531
- "eval_cos_loss": 0.2236430724946464,
532
- "eval_loss": 0.7772586345672607,
533
- "eval_mse_loss": 0.7548943234653007,
534
- "eval_runtime": 2.898,
535
- "eval_samples_per_second": 903.388,
536
- "eval_steps_per_second": 14.148,
537
- "step": 4352
538
- },
539
- {
540
- "epoch": 2.212345679012346,
541
- "grad_norm": 0.24396322667598724,
542
- "learning_rate": 0.0004642177821399269,
543
- "loss": 0.773828387260437,
544
- "step": 4480
545
- },
546
- {
547
- "epoch": 2.2755555555555556,
548
- "grad_norm": 0.2060992270708084,
549
- "learning_rate": 0.00043785097041088293,
550
- "loss": 0.7780256271362305,
551
- "step": 4608
552
- },
553
- {
554
- "epoch": 2.2755555555555556,
555
- "eval_cos_loss": 0.22296342435406474,
556
- "eval_loss": 0.7785176024204348,
557
- "eval_mse_loss": 0.7562212595125524,
558
- "step": 4608
559
- },
560
- {
561
- "epoch": 2.2755555555555556,
562
- "eval_cos_loss": 0.22296342435406474,
563
- "eval_loss": 0.7785176024204348,
564
- "eval_mse_loss": 0.7562212595125524,
565
- "eval_runtime": 2.9308,
566
- "eval_samples_per_second": 893.271,
567
- "eval_steps_per_second": 13.989,
568
- "step": 4608
569
- },
570
- {
571
- "epoch": 2.338765432098765,
572
- "grad_norm": 0.3069784641265869,
573
- "learning_rate": 0.00041165865961306135,
574
- "loss": 0.7798830270767212,
575
- "step": 4736
576
- },
577
- {
578
- "epoch": 2.4019753086419753,
579
- "grad_norm": 0.2529975175857544,
580
- "learning_rate": 0.000385714392047208,
581
- "loss": 0.7820331454277039,
582
- "step": 4864
583
- },
584
- {
585
- "epoch": 2.4019753086419753,
586
- "eval_cos_loss": 0.22622913849063037,
587
- "eval_loss": 0.7879578732862705,
588
- "eval_mse_loss": 0.7653349579834356,
589
- "step": 4864
590
- },
591
- {
592
- "epoch": 2.4019753086419753,
593
- "eval_cos_loss": 0.22622913849063037,
594
- "eval_loss": 0.7879578732862705,
595
- "eval_mse_loss": 0.7653349579834356,
596
- "eval_runtime": 2.8737,
597
- "eval_samples_per_second": 911.028,
598
- "eval_steps_per_second": 14.267,
599
- "step": 4864
600
- },
601
- {
602
- "epoch": 2.4651851851851854,
603
- "grad_norm": 0.1531030684709549,
604
- "learning_rate": 0.00036009101356272814,
605
- "loss": 0.7813113331794739,
606
- "step": 4992
607
- },
608
- {
609
- "epoch": 2.528395061728395,
610
- "grad_norm": 0.28510212898254395,
611
- "learning_rate": 0.00033486046902241664,
612
- "loss": 0.7822959423065186,
613
- "step": 5120
614
- },
615
- {
616
- "epoch": 2.528395061728395,
617
- "eval_cos_loss": 0.22616689961131026,
618
- "eval_loss": 0.7882982463371463,
619
- "eval_mse_loss": 0.7656815560852609,
620
- "step": 5120
621
- },
622
- {
623
- "epoch": 2.528395061728395,
624
- "eval_cos_loss": 0.22616689961131026,
625
- "eval_loss": 0.7882982463371463,
626
- "eval_mse_loss": 0.7656815560852609,
627
- "eval_runtime": 3.4221,
628
- "eval_samples_per_second": 765.035,
629
- "eval_steps_per_second": 11.981,
630
- "step": 5120
631
- },
632
- {
633
- "epoch": 2.591604938271605,
634
- "grad_norm": 0.3471938371658325,
635
- "learning_rate": 0.00031009360029696107,
636
- "loss": 0.7768149971961975,
637
- "step": 5248
638
- },
639
- {
640
- "epoch": 2.6548148148148147,
641
- "grad_norm": 0.30711060762405396,
642
- "learning_rate": 0.00028585994735640823,
643
- "loss": 0.7821690440177917,
644
- "step": 5376
645
- },
646
- {
647
- "epoch": 2.6548148148148147,
648
- "eval_cos_loss": 0.22484926334241542,
649
- "eval_loss": 0.7831642933008147,
650
- "eval_mse_loss": 0.7606793656581785,
651
- "step": 5376
652
- },
653
- {
654
- "epoch": 2.6548148148148147,
655
- "eval_cos_loss": 0.22484926334241542,
656
- "eval_loss": 0.7831642933008147,
657
- "eval_mse_loss": 0.7606793656581785,
658
- "eval_runtime": 3.9797,
659
- "eval_samples_per_second": 657.845,
660
- "eval_steps_per_second": 10.302,
661
- "step": 5376
662
- },
663
- {
664
- "epoch": 2.718024691358025,
665
- "grad_norm": 0.3152957558631897,
666
- "learning_rate": 0.0002622275530170825,
667
- "loss": 0.780761182308197,
668
- "step": 5504
669
- },
670
- {
671
- "epoch": 2.7812345679012345,
672
- "grad_norm": 0.20758290588855743,
673
- "learning_rate": 0.0002392627718921852,
674
- "loss": 0.7799718976020813,
675
- "step": 5632
676
- },
677
- {
678
- "epoch": 2.7812345679012345,
679
- "eval_cos_loss": 0.22278807185045102,
680
- "eval_loss": 0.7812715899653551,
681
- "eval_mse_loss": 0.758992785360755,
682
- "step": 5632
683
- },
684
- {
685
- "epoch": 2.7812345679012345,
686
- "eval_cos_loss": 0.22278807185045102,
687
- "eval_loss": 0.7812715899653551,
688
- "eval_mse_loss": 0.758992785360755,
689
- "eval_runtime": 3.9125,
690
- "eval_samples_per_second": 669.129,
691
- "eval_steps_per_second": 10.479,
692
- "step": 5632
693
- },
694
- {
695
- "epoch": 2.8444444444444446,
696
- "grad_norm": 0.12054615467786789,
697
- "learning_rate": 0.00021703008408250187,
698
- "loss": 0.7806150913238525,
699
- "step": 5760
700
- },
701
- {
702
- "epoch": 2.907654320987654,
703
- "grad_norm": 0.2542758584022522,
704
- "learning_rate": 0.00019559191413033017,
705
- "loss": 0.7816909551620483,
706
- "step": 5888
707
- },
708
- {
709
- "epoch": 2.907654320987654,
710
- "eval_cos_loss": 0.2219865827298746,
711
- "eval_loss": 0.7693439911051494,
712
- "eval_mse_loss": 0.7471453373025103,
713
- "step": 5888
714
- },
715
- {
716
- "epoch": 2.907654320987654,
717
- "eval_cos_loss": 0.2219865827298746,
718
- "eval_loss": 0.7693439911051494,
719
- "eval_mse_loss": 0.7471453373025103,
720
- "eval_runtime": 3.1964,
721
- "eval_samples_per_second": 819.045,
722
- "eval_steps_per_second": 12.827,
723
- "step": 5888
724
- },
725
- {
726
- "epoch": 2.9708641975308643,
727
- "grad_norm": 0.18909312784671783,
728
- "learning_rate": 0.00017500845574497032,
729
- "loss": 0.7817902565002441,
730
- "step": 6016
731
- },
732
- {
733
- "epoch": 3.034074074074074,
734
- "grad_norm": 0.22536472976207733,
735
- "learning_rate": 0.00015533750279190617,
736
- "loss": 0.7782471179962158,
737
- "step": 6144
738
- },
739
- {
740
- "epoch": 3.034074074074074,
741
- "eval_cos_loss": 0.2228206588727672,
742
- "eval_loss": 0.7775220769207652,
743
- "eval_mse_loss": 0.7552400100521925,
744
- "step": 6144
745
- },
746
- {
747
- "epoch": 3.034074074074074,
748
- "eval_cos_loss": 0.2228206588727672,
749
- "eval_loss": 0.7775220769207652,
750
- "eval_mse_loss": 0.7552400100521925,
751
- "eval_runtime": 2.9098,
752
- "eval_samples_per_second": 899.731,
753
- "eval_steps_per_second": 14.091,
754
- "step": 6144
755
- },
756
- {
757
- "epoch": 3.097283950617284,
758
- "grad_norm": 0.1673809289932251,
759
- "learning_rate": 0.000136634287020226,
760
- "loss": 0.7807482481002808,
761
- "step": 6272
762
- },
763
- {
764
- "epoch": 3.1604938271604937,
765
- "grad_norm": 0.3716048300266266,
766
- "learning_rate": 0.00011895132298390743,
767
- "loss": 0.7781609296798706,
768
- "step": 6400
769
- },
770
- {
771
- "epoch": 3.1604938271604937,
772
- "eval_cos_loss": 0.2247363121771231,
773
- "eval_loss": 0.7763603416884818,
774
- "eval_mse_loss": 0.7538867098529164,
775
- "step": 6400
776
- },
777
- {
778
- "epoch": 3.1604938271604937,
779
- "eval_cos_loss": 0.2247363121771231,
780
- "eval_loss": 0.7763603416884818,
781
- "eval_mse_loss": 0.7538867098529164,
782
- "eval_runtime": 2.9575,
783
- "eval_samples_per_second": 885.206,
784
- "eval_steps_per_second": 13.863,
785
- "step": 6400
786
- },
787
- {
788
- "epoch": 3.2237037037037037,
789
- "grad_norm": 0.23243093490600586,
790
- "learning_rate": 0.00010233826059239426,
791
- "loss": 0.7826894521713257,
792
- "step": 6528
793
- },
794
- {
795
- "epoch": 3.286913580246914,
796
- "grad_norm": 0.20378775894641876,
797
- "learning_rate": 8.68417457044704e-05,
798
- "loss": 0.7802485823631287,
799
- "step": 6656
800
- },
801
- {
802
- "epoch": 3.286913580246914,
803
- "eval_cos_loss": 0.22500916079777042,
804
- "eval_loss": 0.7808182821041201,
805
- "eval_mse_loss": 0.7583173658789658,
806
- "step": 6656
807
- },
808
- {
809
- "epoch": 3.286913580246914,
810
- "eval_cos_loss": 0.22500916079777042,
811
- "eval_loss": 0.7808182821041201,
812
- "eval_mse_loss": 0.7583173658789658,
813
- "eval_runtime": 2.8855,
814
- "eval_samples_per_second": 907.28,
815
- "eval_steps_per_second": 14.209,
816
- "step": 6656
817
- },
818
- {
819
- "epoch": 3.3501234567901235,
820
- "grad_norm": 0.12901104986667633,
821
- "learning_rate": 7.250528915685422e-05,
822
- "loss": 0.7797313332557678,
823
- "step": 6784
824
- },
825
- {
826
- "epoch": 3.413333333333333,
827
- "grad_norm": 0.1314738541841507,
828
- "learning_rate": 5.93691445952505e-05,
829
- "loss": 0.7752676606178284,
830
- "step": 6912
831
- },
832
- {
833
- "epoch": 3.413333333333333,
834
- "eval_cos_loss": 0.22601125480198278,
835
- "eval_loss": 0.7906920822655282,
836
- "eval_mse_loss": 0.7680909546410165,
837
- "step": 6912
838
- },
839
- {
840
- "epoch": 3.413333333333333,
841
- "eval_cos_loss": 0.22601125480198278,
842
- "eval_loss": 0.7906920822655282,
843
- "eval_mse_loss": 0.7680909546410165,
844
- "eval_runtime": 2.9757,
845
- "eval_samples_per_second": 879.801,
846
- "eval_steps_per_second": 13.778,
847
- "step": 6912
848
- },
849
- {
850
- "epoch": 3.476543209876543,
851
- "grad_norm": 0.48443037271499634,
852
- "learning_rate": 4.7470195450886376e-05,
853
- "loss": 0.7797003388404846,
854
- "step": 7040
855
- },
856
- {
857
- "epoch": 3.5397530864197533,
858
- "grad_norm": 0.2399303913116455,
859
- "learning_rate": 3.684185137987378e-05,
860
- "loss": 0.7735744714736938,
861
- "step": 7168
862
- },
863
- {
864
- "epoch": 3.5397530864197533,
865
- "eval_cos_loss": 0.22420256748432066,
866
- "eval_loss": 0.784631618639318,
867
- "eval_mse_loss": 0.7622113591287194,
868
- "step": 7168
869
- },
870
- {
871
- "epoch": 3.5397530864197533,
872
- "eval_cos_loss": 0.22420256748432066,
873
- "eval_loss": 0.784631618639318,
874
- "eval_mse_loss": 0.7622113591287194,
875
- "eval_runtime": 2.8474,
876
- "eval_samples_per_second": 919.424,
877
- "eval_steps_per_second": 14.399,
878
- "step": 7168
879
- },
880
- {
881
- "epoch": 3.602962962962963,
882
- "grad_norm": 0.2262357771396637,
883
- "learning_rate": 2.751395445617594e-05,
884
- "loss": 0.7803007364273071,
885
- "step": 7296
886
- },
887
- {
888
- "epoch": 3.6661728395061726,
889
- "grad_norm": 0.12137070298194885,
890
- "learning_rate": 1.9512695381567302e-05,
891
- "loss": 0.776282548904419,
892
- "step": 7424
893
- },
894
- {
895
- "epoch": 3.6661728395061726,
896
- "eval_cos_loss": 0.22227415696876804,
897
- "eval_loss": 0.7707422797272845,
898
- "eval_mse_loss": 0.7485148630491117,
899
- "step": 7424
900
- },
901
- {
902
- "epoch": 3.6661728395061726,
903
- "eval_cos_loss": 0.22227415696876804,
904
- "eval_loss": 0.7707422797272845,
905
- "eval_mse_loss": 0.7485148630491117,
906
- "eval_runtime": 2.937,
907
- "eval_samples_per_second": 891.381,
908
- "eval_steps_per_second": 13.96,
909
- "step": 7424
910
- },
911
- {
912
- "epoch": 3.7293827160493827,
913
- "grad_norm": 0.18187913298606873,
914
- "learning_rate": 1.2860539947850303e-05,
915
- "loss": 0.780539870262146,
916
- "step": 7552
917
- },
918
- {
919
- "epoch": 3.7925925925925927,
920
- "grad_norm": 0.17140205204486847,
921
- "learning_rate": 7.5761659578078565e-06,
922
- "loss": 0.7720261812210083,
923
- "step": 7680
924
- },
925
- {
926
- "epoch": 3.7925925925925927,
927
- "eval_cos_loss": 0.22543459512838504,
928
- "eval_loss": 0.7825955297888779,
929
- "eval_mse_loss": 0.7600520703850723,
930
- "step": 7680
931
- },
932
- {
933
- "epoch": 3.7925925925925927,
934
- "eval_cos_loss": 0.22543459512838504,
935
- "eval_loss": 0.7825955297888779,
936
- "eval_mse_loss": 0.7600520703850723,
937
- "eval_runtime": 3.0241,
938
- "eval_samples_per_second": 865.716,
939
- "eval_steps_per_second": 13.558,
940
- "step": 7680
941
- },
942
- {
943
- "epoch": 3.8558024691358024,
944
- "grad_norm": 0.2181410789489746,
945
- "learning_rate": 3.674410782003812e-06,
946
- "loss": 0.7785236835479736,
947
- "step": 7808
948
- },
949
- {
950
- "epoch": 3.9190123456790125,
951
- "grad_norm": 0.1741209775209427,
952
- "learning_rate": 1.1662296986795217e-06,
953
- "loss": 0.7693132162094116,
954
- "step": 7936
955
- },
956
- {
957
- "epoch": 3.9190123456790125,
958
- "eval_cos_loss": 0.22448242837336005,
959
- "eval_loss": 0.7836573894430952,
960
- "eval_mse_loss": 0.7612091462786604,
961
- "step": 7936
962
- },
963
- {
964
- "epoch": 3.9190123456790125,
965
- "eval_cos_loss": 0.22448242837336005,
966
- "eval_loss": 0.7836573894430952,
967
- "eval_mse_loss": 0.7612091462786604,
968
- "eval_runtime": 3.6021,
969
- "eval_samples_per_second": 726.789,
970
- "eval_steps_per_second": 11.382,
971
- "step": 7936
972
- }
973
- ],
974
- "logging_steps": 128,
975
- "max_steps": 8100,
976
- "num_input_tokens_seen": 0,
977
- "num_train_epochs": 4,
978
- "save_steps": 256,
979
- "stateful_callbacks": {
980
- "TrainerControl": {
981
- "args": {
982
- "should_epoch_stop": false,
983
- "should_evaluate": false,
984
- "should_log": false,
985
- "should_save": true,
986
- "should_training_stop": false
987
- },
988
- "attributes": {}
989
- }
990
- },
991
- "total_flos": 0.0,
992
- "train_batch_size": 64,
993
- "trial_name": null,
994
- "trial_params": null
995
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
checkpoints-v3.0-discrete-conditional/checkpoint-7936/training_args.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c6c0fa495a5924736d222e27760d060d662f12f7b3a2b9af86925d4d1f405f6c
3
- size 5137