Attila1011 commited on
Commit
00b6eba
·
verified ·
1 Parent(s): 3d27857

Upload folder using huggingface_hub

Browse files
checkpoints-v3.2.1-discrete-conditional/checkpoint-8100/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:94aee3109ec017ff4fa956c66618a861668dabc8c4cac4cc1f9443872b26837a
3
+ size 45167832
checkpoints-v3.2.1-discrete-conditional/checkpoint-8100/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a53e14e82cbb90183b17a6db69be50f86112169fd99bf58584941f9bafa66d14
3
+ size 42312267
checkpoints-v3.2.1-discrete-conditional/checkpoint-8100/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ea11996454b5587fcf33ae0ab5cf14b2031bf5f53f8c2ed5a48e87de31e29c84
3
+ size 14645
checkpoints-v3.2.1-discrete-conditional/checkpoint-8100/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f52db02d1ef343c718b20949e8402af29ccf7d4ae00897235ad12dfc91f027cb
3
+ size 1383
checkpoints-v3.2.1-discrete-conditional/checkpoint-8100/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:71760c942926f7fa3025971e1a39a433eb072d72b2a0429a47879487d0239308
3
+ size 1465
checkpoints-v3.2.1-discrete-conditional/checkpoint-8100/trainer_state.json ADDED
@@ -0,0 +1,608 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 4.0,
6
+ "eval_steps": 1024,
7
+ "global_step": 8100,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.06320987654320988,
14
+ "grad_norm": 1.8168452978134155,
15
+ "learning_rate": 0.000248046875,
16
+ "loss": 3.4849891662597656,
17
+ "step": 128
18
+ },
19
+ {
20
+ "epoch": 0.12641975308641976,
21
+ "grad_norm": 3.4383368492126465,
22
+ "learning_rate": 0.000498046875,
23
+ "loss": 1.282984972000122,
24
+ "step": 256
25
+ },
26
+ {
27
+ "epoch": 0.18962962962962962,
28
+ "grad_norm": 1.2715426683425903,
29
+ "learning_rate": 0.000748046875,
30
+ "loss": 1.0911226272583008,
31
+ "step": 384
32
+ },
33
+ {
34
+ "epoch": 0.2528395061728395,
35
+ "grad_norm": 1.0402259826660156,
36
+ "learning_rate": 0.000998046875,
37
+ "loss": 1.0667308568954468,
38
+ "step": 512
39
+ },
40
+ {
41
+ "epoch": 0.3160493827160494,
42
+ "grad_norm": 0.7335332036018372,
43
+ "learning_rate": 0.0009993089770195807,
44
+ "loss": 1.0423768758773804,
45
+ "step": 640
46
+ },
47
+ {
48
+ "epoch": 0.37925925925925924,
49
+ "grad_norm": 1.2647337913513184,
50
+ "learning_rate": 0.0009972160460972733,
51
+ "loss": 1.0243909358978271,
52
+ "step": 768
53
+ },
54
+ {
55
+ "epoch": 0.44246913580246916,
56
+ "grad_norm": 1.6875989437103271,
57
+ "learning_rate": 0.0009937270408736224,
58
+ "loss": 1.015304446220398,
59
+ "step": 896
60
+ },
61
+ {
62
+ "epoch": 0.505679012345679,
63
+ "grad_norm": 0.8772087097167969,
64
+ "learning_rate": 0.0009888517577149526,
65
+ "loss": 1.0097696781158447,
66
+ "step": 1024
67
+ },
68
+ {
69
+ "epoch": 0.505679012345679,
70
+ "eval_bleu": 0.0,
71
+ "eval_cos_loss": 0.6681160955894284,
72
+ "eval_loss": 1.0068177740748336,
73
+ "eval_mse_loss": 1.0068177740748336,
74
+ "step": 1024
75
+ },
76
+ {
77
+ "epoch": 0.505679012345679,
78
+ "eval_bleu": 0.0,
79
+ "eval_cos_loss": 0.6681160955894284,
80
+ "eval_loss": 1.0068177740748336,
81
+ "eval_mse_loss": 1.0068177740748336,
82
+ "eval_runtime": 17.5866,
83
+ "eval_samples_per_second": 148.863,
84
+ "eval_steps_per_second": 2.331,
85
+ "step": 1024
86
+ },
87
+ {
88
+ "epoch": 0.5688888888888889,
89
+ "grad_norm": 0.9257954359054565,
90
+ "learning_rate": 0.0009826038853539248,
91
+ "loss": 1.0054709911346436,
92
+ "step": 1152
93
+ },
94
+ {
95
+ "epoch": 0.6320987654320988,
96
+ "grad_norm": 1.2401124238967896,
97
+ "learning_rate": 0.0009750009664545572,
98
+ "loss": 1.0001628398895264,
99
+ "step": 1280
100
+ },
101
+ {
102
+ "epoch": 0.6953086419753086,
103
+ "grad_norm": 0.9945711493492126,
104
+ "learning_rate": 0.0009660643483562486,
105
+ "loss": 0.9950642585754395,
106
+ "step": 1408
107
+ },
108
+ {
109
+ "epoch": 0.7585185185185185,
110
+ "grad_norm": 0.4016058146953583,
111
+ "learning_rate": 0.0009558191231351013,
112
+ "loss": 0.9919660091400146,
113
+ "step": 1536
114
+ },
115
+ {
116
+ "epoch": 0.8217283950617283,
117
+ "grad_norm": 0.4495377540588379,
118
+ "learning_rate": 0.0009442940571508399,
119
+ "loss": 0.9906907081604004,
120
+ "step": 1664
121
+ },
122
+ {
123
+ "epoch": 0.8849382716049383,
124
+ "grad_norm": 1.0224924087524414,
125
+ "learning_rate": 0.0009315215102771411,
126
+ "loss": 0.990159809589386,
127
+ "step": 1792
128
+ },
129
+ {
130
+ "epoch": 0.9481481481481482,
131
+ "grad_norm": 0.7064864039421082,
132
+ "learning_rate": 0.0009175373450421618,
133
+ "loss": 0.9885232448577881,
134
+ "step": 1920
135
+ },
136
+ {
137
+ "epoch": 1.011358024691358,
138
+ "grad_norm": 0.6735867857933044,
139
+ "learning_rate": 0.0009023808259343743,
140
+ "loss": 0.9828425645828247,
141
+ "step": 2048
142
+ },
143
+ {
144
+ "epoch": 1.011358024691358,
145
+ "eval_bleu": 0.0020327926577703463,
146
+ "eval_cos_loss": 0.6557952049301892,
147
+ "eval_loss": 0.9808391550692116,
148
+ "eval_mse_loss": 0.9808391550692116,
149
+ "step": 2048
150
+ },
151
+ {
152
+ "epoch": 1.011358024691358,
153
+ "eval_bleu": 0.0020327926577703463,
154
+ "eval_cos_loss": 0.6557952049301892,
155
+ "eval_loss": 0.9808391550692116,
156
+ "eval_mse_loss": 0.9808391550692116,
157
+ "eval_runtime": 17.3438,
158
+ "eval_samples_per_second": 150.948,
159
+ "eval_steps_per_second": 2.364,
160
+ "step": 2048
161
+ },
162
+ {
163
+ "epoch": 1.074567901234568,
164
+ "grad_norm": 0.8397714495658875,
165
+ "learning_rate": 0.0008860945091564399,
166
+ "loss": 0.9799662232398987,
167
+ "step": 2176
168
+ },
169
+ {
170
+ "epoch": 1.1377777777777778,
171
+ "grad_norm": 0.48910048604011536,
172
+ "learning_rate": 0.0008687241231366662,
173
+ "loss": 0.9816181063652039,
174
+ "step": 2304
175
+ },
176
+ {
177
+ "epoch": 1.2009876543209876,
178
+ "grad_norm": 0.5660194158554077,
179
+ "learning_rate": 0.0008503184401335448,
180
+ "loss": 0.9848842620849609,
181
+ "step": 2432
182
+ },
183
+ {
184
+ "epoch": 1.2641975308641975,
185
+ "grad_norm": 0.4576466977596283,
186
+ "learning_rate": 0.0008309291392938795,
187
+ "loss": 0.982272744178772,
188
+ "step": 2560
189
+ },
190
+ {
191
+ "epoch": 1.3274074074074074,
192
+ "grad_norm": 1.350696086883545,
193
+ "learning_rate": 0.0008106106615490032,
194
+ "loss": 0.9800283908843994,
195
+ "step": 2688
196
+ },
197
+ {
198
+ "epoch": 1.3906172839506172,
199
+ "grad_norm": 0.606335461139679,
200
+ "learning_rate": 0.0007894200567565075,
201
+ "loss": 0.9786688685417175,
202
+ "step": 2816
203
+ },
204
+ {
205
+ "epoch": 1.453827160493827,
206
+ "grad_norm": 0.8053555488586426,
207
+ "learning_rate": 0.0007674168235166747,
208
+ "loss": 0.9759098887443542,
209
+ "step": 2944
210
+ },
211
+ {
212
+ "epoch": 1.5170370370370372,
213
+ "grad_norm": 0.4296761751174927,
214
+ "learning_rate": 0.000744662742113375,
215
+ "loss": 0.9750839471817017,
216
+ "step": 3072
217
+ },
218
+ {
219
+ "epoch": 1.5170370370370372,
220
+ "eval_bleu": 0.001550567530729039,
221
+ "eval_cos_loss": 0.653304009902768,
222
+ "eval_loss": 0.9743910940682016,
223
+ "eval_mse_loss": 0.9743910940682016,
224
+ "step": 3072
225
+ },
226
+ {
227
+ "epoch": 1.5170370370370372,
228
+ "eval_bleu": 0.001550567530729039,
229
+ "eval_cos_loss": 0.653304009902768,
230
+ "eval_loss": 0.9743910940682016,
231
+ "eval_mse_loss": 0.9743910940682016,
232
+ "eval_runtime": 17.0839,
233
+ "eval_samples_per_second": 153.244,
234
+ "eval_steps_per_second": 2.4,
235
+ "step": 3072
236
+ },
237
+ {
238
+ "epoch": 1.5802469135802468,
239
+ "grad_norm": 0.5009872317314148,
240
+ "learning_rate": 0.0007212217010484917,
241
+ "loss": 0.9776103496551514,
242
+ "step": 3200
243
+ },
244
+ {
245
+ "epoch": 1.643456790123457,
246
+ "grad_norm": 0.4625374674797058,
247
+ "learning_rate": 0.0006971595176569332,
248
+ "loss": 0.9762816429138184,
249
+ "step": 3328
250
+ },
251
+ {
252
+ "epoch": 1.7066666666666666,
253
+ "grad_norm": 0.5958267450332642,
254
+ "learning_rate": 0.0006725437533058971,
255
+ "loss": 0.9745242595672607,
256
+ "step": 3456
257
+ },
258
+ {
259
+ "epoch": 1.7698765432098766,
260
+ "grad_norm": 0.9703472852706909,
261
+ "learning_rate": 0.0006474435236972767,
262
+ "loss": 0.9753237366676331,
263
+ "step": 3584
264
+ },
265
+ {
266
+ "epoch": 1.8330864197530863,
267
+ "grad_norm": 0.346620112657547,
268
+ "learning_rate": 0.0006219293048058301,
269
+ "loss": 0.9756260514259338,
270
+ "step": 3712
271
+ },
272
+ {
273
+ "epoch": 1.8962962962962964,
274
+ "grad_norm": 0.5703808665275574,
275
+ "learning_rate": 0.0005960727349980042,
276
+ "loss": 0.970862865447998,
277
+ "step": 3840
278
+ },
279
+ {
280
+ "epoch": 1.959506172839506,
281
+ "grad_norm": 0.42382702231407166,
282
+ "learning_rate": 0.0005699464138870106,
283
+ "loss": 0.9738116264343262,
284
+ "step": 3968
285
+ },
286
+ {
287
+ "epoch": 2.022716049382716,
288
+ "grad_norm": 0.42642295360565186,
289
+ "learning_rate": 0.0005436236984889356,
290
+ "loss": 0.9707122445106506,
291
+ "step": 4096
292
+ },
293
+ {
294
+ "epoch": 2.022716049382716,
295
+ "eval_bleu": 0.003457283045849753,
296
+ "eval_cos_loss": 0.6549375914945835,
297
+ "eval_loss": 0.9716728445960254,
298
+ "eval_mse_loss": 0.9716728445960254,
299
+ "step": 4096
300
+ },
301
+ {
302
+ "epoch": 2.022716049382716,
303
+ "eval_bleu": 0.003457283045849753,
304
+ "eval_cos_loss": 0.6549375914945835,
305
+ "eval_loss": 0.9716728445960254,
306
+ "eval_mse_loss": 0.9716728445960254,
307
+ "eval_runtime": 17.5486,
308
+ "eval_samples_per_second": 149.186,
309
+ "eval_steps_per_second": 2.336,
310
+ "step": 4096
311
+ },
312
+ {
313
+ "epoch": 2.0859259259259257,
314
+ "grad_norm": 0.36422199010849,
315
+ "learning_rate": 0.0005171784972522235,
316
+ "loss": 0.9708704352378845,
317
+ "step": 4224
318
+ },
319
+ {
320
+ "epoch": 2.149135802469136,
321
+ "grad_norm": 0.6977311372756958,
322
+ "learning_rate": 0.0004906850625388614,
323
+ "loss": 0.9703112244606018,
324
+ "step": 4352
325
+ },
326
+ {
327
+ "epoch": 2.212345679012346,
328
+ "grad_norm": 0.37942081689834595,
329
+ "learning_rate": 0.0004642177821399269,
330
+ "loss": 0.9709182977676392,
331
+ "step": 4480
332
+ },
333
+ {
334
+ "epoch": 2.2755555555555556,
335
+ "grad_norm": 0.3576072156429291,
336
+ "learning_rate": 0.00043785097041088293,
337
+ "loss": 0.9707843065261841,
338
+ "step": 4608
339
+ },
340
+ {
341
+ "epoch": 2.338765432098765,
342
+ "grad_norm": 0.3725082278251648,
343
+ "learning_rate": 0.00041165865961306135,
344
+ "loss": 0.966793954372406,
345
+ "step": 4736
346
+ },
347
+ {
348
+ "epoch": 2.4019753086419753,
349
+ "grad_norm": 0.2623865008354187,
350
+ "learning_rate": 0.000385714392047208,
351
+ "loss": 0.9677779078483582,
352
+ "step": 4864
353
+ },
354
+ {
355
+ "epoch": 2.4651851851851854,
356
+ "grad_norm": 0.4568561911582947,
357
+ "learning_rate": 0.00036009101356272814,
358
+ "loss": 0.9753569960594177,
359
+ "step": 4992
360
+ },
361
+ {
362
+ "epoch": 2.528395061728395,
363
+ "grad_norm": 0.49066415429115295,
364
+ "learning_rate": 0.00033486046902241664,
365
+ "loss": 0.9661346673965454,
366
+ "step": 5120
367
+ },
368
+ {
369
+ "epoch": 2.528395061728395,
370
+ "eval_bleu": 0.003345180086375018,
371
+ "eval_cos_loss": 0.6469777037457722,
372
+ "eval_loss": 0.9654292958538707,
373
+ "eval_mse_loss": 0.9654292958538707,
374
+ "step": 5120
375
+ },
376
+ {
377
+ "epoch": 2.528395061728395,
378
+ "eval_bleu": 0.003345180086375018,
379
+ "eval_cos_loss": 0.6469777037457722,
380
+ "eval_loss": 0.9654292958538707,
381
+ "eval_mse_loss": 0.9654292958538707,
382
+ "eval_runtime": 16.8316,
383
+ "eval_samples_per_second": 155.541,
384
+ "eval_steps_per_second": 2.436,
385
+ "step": 5120
386
+ },
387
+ {
388
+ "epoch": 2.591604938271605,
389
+ "grad_norm": 0.6127116680145264,
390
+ "learning_rate": 0.00031009360029696107,
391
+ "loss": 0.9650511145591736,
392
+ "step": 5248
393
+ },
394
+ {
395
+ "epoch": 2.6548148148148147,
396
+ "grad_norm": 0.37896695733070374,
397
+ "learning_rate": 0.00028585994735640823,
398
+ "loss": 0.9659166932106018,
399
+ "step": 5376
400
+ },
401
+ {
402
+ "epoch": 2.718024691358025,
403
+ "grad_norm": 0.4591119885444641,
404
+ "learning_rate": 0.0002622275530170825,
405
+ "loss": 0.9637454152107239,
406
+ "step": 5504
407
+ },
408
+ {
409
+ "epoch": 2.7812345679012345,
410
+ "grad_norm": 0.5013575553894043,
411
+ "learning_rate": 0.0002392627718921852,
412
+ "loss": 0.9639162421226501,
413
+ "step": 5632
414
+ },
415
+ {
416
+ "epoch": 2.8444444444444446,
417
+ "grad_norm": 0.3516771197319031,
418
+ "learning_rate": 0.00021703008408250187,
419
+ "loss": 0.9626727104187012,
420
+ "step": 5760
421
+ },
422
+ {
423
+ "epoch": 2.907654320987654,
424
+ "grad_norm": 0.3407333791255951,
425
+ "learning_rate": 0.00019559191413033017,
426
+ "loss": 0.9621551036834717,
427
+ "step": 5888
428
+ },
429
+ {
430
+ "epoch": 2.9708641975308643,
431
+ "grad_norm": 0.26509255170822144,
432
+ "learning_rate": 0.00017500845574497032,
433
+ "loss": 0.9614520072937012,
434
+ "step": 6016
435
+ },
436
+ {
437
+ "epoch": 3.034074074074074,
438
+ "grad_norm": 0.33290010690689087,
439
+ "learning_rate": 0.00015533750279190617,
440
+ "loss": 0.9614716172218323,
441
+ "step": 6144
442
+ },
443
+ {
444
+ "epoch": 3.034074074074074,
445
+ "eval_bleu": 0.004510767224176567,
446
+ "eval_cos_loss": 0.6460333800897365,
447
+ "eval_loss": 0.9615877198009957,
448
+ "eval_mse_loss": 0.9615877198009957,
449
+ "step": 6144
450
+ },
451
+ {
452
+ "epoch": 3.034074074074074,
453
+ "eval_bleu": 0.004510767224176567,
454
+ "eval_cos_loss": 0.6460333800897365,
455
+ "eval_loss": 0.9615877198009957,
456
+ "eval_mse_loss": 0.9615877198009957,
457
+ "eval_runtime": 16.8514,
458
+ "eval_samples_per_second": 155.358,
459
+ "eval_steps_per_second": 2.433,
460
+ "step": 6144
461
+ },
462
+ {
463
+ "epoch": 3.097283950617284,
464
+ "grad_norm": 0.33247849345207214,
465
+ "learning_rate": 0.000136634287020226,
466
+ "loss": 0.9614198207855225,
467
+ "step": 6272
468
+ },
469
+ {
470
+ "epoch": 3.1604938271604937,
471
+ "grad_norm": 0.2738645374774933,
472
+ "learning_rate": 0.00011895132298390743,
473
+ "loss": 0.9605588316917419,
474
+ "step": 6400
475
+ },
476
+ {
477
+ "epoch": 3.2237037037037037,
478
+ "grad_norm": 0.2882324755191803,
479
+ "learning_rate": 0.00010233826059239426,
480
+ "loss": 0.9604360461235046,
481
+ "step": 6528
482
+ },
483
+ {
484
+ "epoch": 3.286913580246914,
485
+ "grad_norm": 0.302536278963089,
486
+ "learning_rate": 8.68417457044704e-05,
487
+ "loss": 0.9597956538200378,
488
+ "step": 6656
489
+ },
490
+ {
491
+ "epoch": 3.3501234567901235,
492
+ "grad_norm": 0.5232012867927551,
493
+ "learning_rate": 7.250528915685422e-05,
494
+ "loss": 0.9590287804603577,
495
+ "step": 6784
496
+ },
497
+ {
498
+ "epoch": 3.413333333333333,
499
+ "grad_norm": 0.21842055022716522,
500
+ "learning_rate": 5.93691445952505e-05,
501
+ "loss": 0.9591075778007507,
502
+ "step": 6912
503
+ },
504
+ {
505
+ "epoch": 3.476543209876543,
506
+ "grad_norm": 0.6795418858528137,
507
+ "learning_rate": 4.7470195450886376e-05,
508
+ "loss": 0.9593430161476135,
509
+ "step": 7040
510
+ },
511
+ {
512
+ "epoch": 3.5397530864197533,
513
+ "grad_norm": 0.18848949670791626,
514
+ "learning_rate": 3.684185137987378e-05,
515
+ "loss": 0.958720862865448,
516
+ "step": 7168
517
+ },
518
+ {
519
+ "epoch": 3.5397530864197533,
520
+ "eval_bleu": 0.004545481313600588,
521
+ "eval_cos_loss": 0.6461245112302827,
522
+ "eval_loss": 0.9590820045005984,
523
+ "eval_mse_loss": 0.9590820045005984,
524
+ "step": 7168
525
+ },
526
+ {
527
+ "epoch": 3.5397530864197533,
528
+ "eval_bleu": 0.004545481313600588,
529
+ "eval_cos_loss": 0.6461245112302827,
530
+ "eval_loss": 0.9590820045005984,
531
+ "eval_mse_loss": 0.9590820045005984,
532
+ "eval_runtime": 17.8655,
533
+ "eval_samples_per_second": 146.539,
534
+ "eval_steps_per_second": 2.295,
535
+ "step": 7168
536
+ },
537
+ {
538
+ "epoch": 3.602962962962963,
539
+ "grad_norm": 0.18368160724639893,
540
+ "learning_rate": 2.751395445617594e-05,
541
+ "loss": 0.9586146473884583,
542
+ "step": 7296
543
+ },
544
+ {
545
+ "epoch": 3.6661728395061726,
546
+ "grad_norm": 0.1991145759820938,
547
+ "learning_rate": 1.9512695381567302e-05,
548
+ "loss": 0.9582223892211914,
549
+ "step": 7424
550
+ },
551
+ {
552
+ "epoch": 3.7293827160493827,
553
+ "grad_norm": 0.3184373080730438,
554
+ "learning_rate": 1.2860539947850303e-05,
555
+ "loss": 0.9586706757545471,
556
+ "step": 7552
557
+ },
558
+ {
559
+ "epoch": 3.7925925925925927,
560
+ "grad_norm": 0.21128305792808533,
561
+ "learning_rate": 7.5761659578078565e-06,
562
+ "loss": 0.9583309888839722,
563
+ "step": 7680
564
+ },
565
+ {
566
+ "epoch": 3.8558024691358024,
567
+ "grad_norm": 0.17950057983398438,
568
+ "learning_rate": 3.674410782003812e-06,
569
+ "loss": 0.9576842784881592,
570
+ "step": 7808
571
+ },
572
+ {
573
+ "epoch": 3.9190123456790125,
574
+ "grad_norm": 0.1886749416589737,
575
+ "learning_rate": 1.1662296986795217e-06,
576
+ "loss": 0.9585922360420227,
577
+ "step": 7936
578
+ },
579
+ {
580
+ "epoch": 3.982222222222222,
581
+ "grad_norm": 0.18049107491970062,
582
+ "learning_rate": 5.866513372004834e-08,
583
+ "loss": 0.9581732749938965,
584
+ "step": 8064
585
+ }
586
+ ],
587
+ "logging_steps": 128,
588
+ "max_steps": 8100,
589
+ "num_input_tokens_seen": 0,
590
+ "num_train_epochs": 4,
591
+ "save_steps": 1024,
592
+ "stateful_callbacks": {
593
+ "TrainerControl": {
594
+ "args": {
595
+ "should_epoch_stop": false,
596
+ "should_evaluate": false,
597
+ "should_log": false,
598
+ "should_save": true,
599
+ "should_training_stop": true
600
+ },
601
+ "attributes": {}
602
+ }
603
+ },
604
+ "total_flos": 0.0,
605
+ "train_batch_size": 64,
606
+ "trial_name": null,
607
+ "trial_params": null
608
+ }
checkpoints-v3.2.1-discrete-conditional/checkpoint-8100/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c53f3e10d2a055b85b1814f3e029cdcd51dd4f5e4025edaf0233172e301d1199
3
+ size 5137