ChiefTheLord commited on
Commit
c4dfd95
·
verified ·
1 Parent(s): eaf41df

Upload folder using huggingface_hub

Browse files
checkpoints-v1/checkpoint-14336/ema.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:42037039f322c66e4db015fad3ecc3ae3e427264853ff9d0066aa6a7ee231f01
3
+ size 571816
checkpoints-v1/checkpoint-14336/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4151eb5921dad621a3441c3b6a39b6d8702f93fae0d4f8d5ba00a31d216219d1
3
+ size 19265392
checkpoints-v1/checkpoint-14336/optimizer.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88946300e31ba750dde4ed985f5abfb8cb17c978e5106d43da1339fdf32d9a2f
3
+ size 1212939
checkpoints-v1/checkpoint-14336/rng_state.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0fb1ced3c1a73919bac75ddd04d7fab6d5228194743b711df8de676564c549fd
3
+ size 14645
checkpoints-v1/checkpoint-14336/scaler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bd345a16eac44d3b6973d1b101c7e88ac305f5d055942cd287e3b7f78937bd29
3
+ size 1383
checkpoints-v1/checkpoint-14336/scheduler.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6d083abe2a0e28e3d2a41416c38d70182db4da2363a1af9e95380e20ec2d4d1e
3
+ size 1465
checkpoints-v1/checkpoint-14336/trainer_state.json ADDED
@@ -0,0 +1,748 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.6621403168444876,
6
+ "eval_steps": 1024,
7
+ "global_step": 14336,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.011823934229365849,
14
+ "grad_norm": 0.17313924431800842,
15
+ "learning_rate": 0.000498046875,
16
+ "loss": 1.7064213752746582,
17
+ "step": 256
18
+ },
19
+ {
20
+ "epoch": 0.023647868458731697,
21
+ "grad_norm": 0.15720121562480927,
22
+ "learning_rate": 0.000998046875,
23
+ "loss": 1.1101791858673096,
24
+ "step": 512
25
+ },
26
+ {
27
+ "epoch": 0.03547180268809755,
28
+ "grad_norm": 0.145066499710083,
29
+ "learning_rate": 0.000999640996023194,
30
+ "loss": 1.0319945812225342,
31
+ "step": 768
32
+ },
33
+ {
34
+ "epoch": 0.047295736917463395,
35
+ "grad_norm": 0.1564043015241623,
36
+ "learning_rate": 0.0009985588674043958,
37
+ "loss": 0.9714428186416626,
38
+ "step": 1024
39
+ },
40
+ {
41
+ "epoch": 0.047295736917463395,
42
+ "eval_loss": 0.9119398776254698,
43
+ "eval_mse_loss": 0.9119398776254698,
44
+ "flow/cos_sim": 0.705863389794685,
45
+ "flow/improvement_ratio": 0.4745521226974383,
46
+ "flow/mag_ratio_mean": 0.6946878124317623,
47
+ "flow/mag_ratio_std": 0.17089739207127322,
48
+ "step": 1024
49
+ },
50
+ {
51
+ "epoch": 0.047295736917463395,
52
+ "eval_loss": 0.9119398776254698,
53
+ "eval_mse_loss": 0.9119398776254698,
54
+ "eval_runtime": 100.3557,
55
+ "eval_samples_per_second": 278.938,
56
+ "eval_steps_per_second": 4.364,
57
+ "flow/cos_sim": 0.705863389794685,
58
+ "flow/improvement_ratio": 0.4745521226974383,
59
+ "flow/mag_ratio_mean": 0.6946878124317623,
60
+ "flow/mag_ratio_std": 0.17089739207127322,
61
+ "step": 1024
62
+ },
63
+ {
64
+ "epoch": 0.05911967114682925,
65
+ "grad_norm": 0.16647565364837646,
66
+ "learning_rate": 0.0009967551747861387,
67
+ "loss": 0.929740846157074,
68
+ "step": 1280
69
+ },
70
+ {
71
+ "epoch": 0.0709436053761951,
72
+ "grad_norm": 0.18282100558280945,
73
+ "learning_rate": 0.000994232528651847,
74
+ "loss": 0.895330548286438,
75
+ "step": 1536
76
+ },
77
+ {
78
+ "epoch": 0.08276753960556095,
79
+ "grad_norm": 0.17141221463680267,
80
+ "learning_rate": 0.0009909945800260092,
81
+ "loss": 0.8683519959449768,
82
+ "step": 1792
83
+ },
84
+ {
85
+ "epoch": 0.09459147383492679,
86
+ "grad_norm": 0.19230426847934723,
87
+ "learning_rate": 0.0009870460151900522,
88
+ "loss": 0.8509060144424438,
89
+ "step": 2048
90
+ },
91
+ {
92
+ "epoch": 0.09459147383492679,
93
+ "eval_loss": 0.8054492728078746,
94
+ "eval_mse_loss": 0.8054492728078746,
95
+ "flow/cos_sim": 0.7493123941497716,
96
+ "flow/improvement_ratio": 0.476775190669652,
97
+ "flow/mag_ratio_mean": 0.7334817982699773,
98
+ "flow/mag_ratio_std": 0.1754700492846367,
99
+ "step": 2048
100
+ },
101
+ {
102
+ "epoch": 0.09459147383492679,
103
+ "eval_loss": 0.8054492728078746,
104
+ "eval_mse_loss": 0.8054492728078746,
105
+ "eval_runtime": 101.0321,
106
+ "eval_samples_per_second": 277.07,
107
+ "eval_steps_per_second": 4.335,
108
+ "flow/cos_sim": 0.7493123941497716,
109
+ "flow/improvement_ratio": 0.476775190669652,
110
+ "flow/mag_ratio_mean": 0.7334817982699773,
111
+ "flow/mag_ratio_std": 0.1754700492846367,
112
+ "step": 2048
113
+ },
114
+ {
115
+ "epoch": 0.10641540806429264,
116
+ "grad_norm": 0.1906282752752304,
117
+ "learning_rate": 0.0009823925488998885,
118
+ "loss": 0.8359999060630798,
119
+ "step": 2304
120
+ },
121
+ {
122
+ "epoch": 0.1182393422936585,
123
+ "grad_norm": 0.24338403344154358,
124
+ "learning_rate": 0.0009770409161149525,
125
+ "loss": 0.8268019556999207,
126
+ "step": 2560
127
+ },
128
+ {
129
+ "epoch": 0.13006327652302435,
130
+ "grad_norm": 0.2402328997850418,
131
+ "learning_rate": 0.0009709988622506973,
132
+ "loss": 0.8164141178131104,
133
+ "step": 2816
134
+ },
135
+ {
136
+ "epoch": 0.1418872107523902,
137
+ "grad_norm": 0.19165532290935516,
138
+ "learning_rate": 0.000964275131968659,
139
+ "loss": 0.8093737363815308,
140
+ "step": 3072
141
+ },
142
+ {
143
+ "epoch": 0.1418872107523902,
144
+ "eval_loss": 0.7729098539646357,
145
+ "eval_mse_loss": 0.7729098539646357,
146
+ "flow/cos_sim": 0.7615711595097633,
147
+ "flow/improvement_ratio": 0.48455443985113816,
148
+ "flow/mag_ratio_mean": 0.7450809510059009,
149
+ "flow/mag_ratio_std": 0.17553651608424645,
150
+ "step": 3072
151
+ },
152
+ {
153
+ "epoch": 0.1418872107523902,
154
+ "eval_loss": 0.7729098539646357,
155
+ "eval_mse_loss": 0.7729098539646357,
156
+ "eval_runtime": 100.8602,
157
+ "eval_samples_per_second": 277.543,
158
+ "eval_steps_per_second": 4.343,
159
+ "flow/cos_sim": 0.7615711595097633,
160
+ "flow/improvement_ratio": 0.48455443985113816,
161
+ "flow/mag_ratio_mean": 0.7450809510059009,
162
+ "flow/mag_ratio_std": 0.17553651608424645,
163
+ "step": 3072
164
+ },
165
+ {
166
+ "epoch": 0.15371114498175603,
167
+ "grad_norm": 0.23683102428913116,
168
+ "learning_rate": 0.0009568794565203123,
169
+ "loss": 0.8039254546165466,
170
+ "step": 3328
171
+ },
172
+ {
173
+ "epoch": 0.1655350792111219,
174
+ "grad_norm": 0.24094223976135254,
175
+ "learning_rate": 0.0009488225396630347,
176
+ "loss": 0.8009832501411438,
177
+ "step": 3584
178
+ },
179
+ {
180
+ "epoch": 0.17735901344048774,
181
+ "grad_norm": 0.22388198971748352,
182
+ "learning_rate": 0.0009401160421685646,
183
+ "loss": 0.7924661636352539,
184
+ "step": 3840
185
+ },
186
+ {
187
+ "epoch": 0.18918294766985358,
188
+ "grad_norm": 0.22257934510707855,
189
+ "learning_rate": 0.0009307725649463714,
190
+ "loss": 0.7945993542671204,
191
+ "step": 4096
192
+ },
193
+ {
194
+ "epoch": 0.18918294766985358,
195
+ "eval_loss": 0.7515348514465436,
196
+ "eval_mse_loss": 0.7515348514465436,
197
+ "flow/cos_sim": 0.7693686923479925,
198
+ "flow/improvement_ratio": 0.4754656074934354,
199
+ "flow/mag_ratio_mean": 0.7561575877884207,
200
+ "flow/mag_ratio_std": 0.1765889780570383,
201
+ "step": 4096
202
+ },
203
+ {
204
+ "epoch": 0.18918294766985358,
205
+ "eval_loss": 0.7515348514465436,
206
+ "eval_mse_loss": 0.7515348514465436,
207
+ "eval_runtime": 100.3871,
208
+ "eval_samples_per_second": 278.851,
209
+ "eval_steps_per_second": 4.363,
210
+ "flow/cos_sim": 0.7693686923479925,
211
+ "flow/improvement_ratio": 0.4754656074934354,
212
+ "flow/mag_ratio_mean": 0.7561575877884207,
213
+ "flow/mag_ratio_std": 0.1765889780570383,
214
+ "step": 4096
215
+ },
216
+ {
217
+ "epoch": 0.20100688189921945,
218
+ "grad_norm": 0.22517219185829163,
219
+ "learning_rate": 0.0009208056308063659,
220
+ "loss": 0.7901711463928223,
221
+ "step": 4352
222
+ },
223
+ {
224
+ "epoch": 0.2128308161285853,
225
+ "grad_norm": 0.2154047191143036,
226
+ "learning_rate": 0.0009102296648873445,
227
+ "loss": 0.7860417366027832,
228
+ "step": 4608
229
+ },
230
+ {
231
+ "epoch": 0.22465475035795113,
232
+ "grad_norm": 0.2263442873954773,
233
+ "learning_rate": 0.0008990599737794927,
234
+ "loss": 0.785137414932251,
235
+ "step": 4864
236
+ },
237
+ {
238
+ "epoch": 0.236478684587317,
239
+ "grad_norm": 0.2566796839237213,
240
+ "learning_rate": 0.0008873127233711644,
241
+ "loss": 0.7781056761741638,
242
+ "step": 5120
243
+ },
244
+ {
245
+ "epoch": 0.236478684587317,
246
+ "eval_loss": 0.7397306067486332,
247
+ "eval_mse_loss": 0.7397306067486332,
248
+ "flow/cos_sim": 0.7743443068304018,
249
+ "flow/improvement_ratio": 0.47565921052405824,
250
+ "flow/mag_ratio_mean": 0.7624385751545701,
251
+ "flow/mag_ratio_std": 0.17572799042615717,
252
+ "step": 5120
253
+ },
254
+ {
255
+ "epoch": 0.236478684587317,
256
+ "eval_loss": 0.7397306067486332,
257
+ "eval_mse_loss": 0.7397306067486332,
258
+ "eval_runtime": 100.3693,
259
+ "eval_samples_per_second": 278.9,
260
+ "eval_steps_per_second": 4.364,
261
+ "flow/cos_sim": 0.7743443068304018,
262
+ "flow/improvement_ratio": 0.47565921052405824,
263
+ "flow/mag_ratio_mean": 0.7624385751545701,
264
+ "flow/mag_ratio_std": 0.17572799042615717,
265
+ "step": 5120
266
+ },
267
+ {
268
+ "epoch": 0.24830261881668284,
269
+ "grad_norm": 0.2051643282175064,
270
+ "learning_rate": 0.0008750049154520011,
271
+ "loss": 0.7773663997650146,
272
+ "step": 5376
273
+ },
274
+ {
275
+ "epoch": 0.2601265530460487,
276
+ "grad_norm": 0.23271359503269196,
277
+ "learning_rate": 0.0008621543631062487,
278
+ "loss": 0.7789486050605774,
279
+ "step": 5632
280
+ },
281
+ {
282
+ "epoch": 0.27195048727541454,
283
+ "grad_norm": 0.22768262028694153,
284
+ "learning_rate": 0.0008487796649318904,
285
+ "loss": 0.7753859758377075,
286
+ "step": 5888
287
+ },
288
+ {
289
+ "epoch": 0.2837744215047804,
290
+ "grad_norm": 0.23808346688747406,
291
+ "learning_rate": 0.0008349001781229053,
292
+ "loss": 0.7760079503059387,
293
+ "step": 6144
294
+ },
295
+ {
296
+ "epoch": 0.2837744215047804,
297
+ "eval_loss": 0.7382390670580407,
298
+ "eval_mse_loss": 0.7382390670580407,
299
+ "flow/cos_sim": 0.7747776206225565,
300
+ "flow/improvement_ratio": 0.48073648889315185,
301
+ "flow/mag_ratio_mean": 0.7609562668354,
302
+ "flow/mag_ratio_std": 0.17754725645802336,
303
+ "step": 6144
304
+ },
305
+ {
306
+ "epoch": 0.2837744215047804,
307
+ "eval_loss": 0.7382390670580407,
308
+ "eval_mse_loss": 0.7382390670580407,
309
+ "eval_runtime": 101.1596,
310
+ "eval_samples_per_second": 276.721,
311
+ "eval_steps_per_second": 4.33,
312
+ "flow/cos_sim": 0.7747776206225565,
313
+ "flow/improvement_ratio": 0.48073648889315185,
314
+ "flow/mag_ratio_mean": 0.7609562668354,
315
+ "flow/mag_ratio_std": 0.17754725645802336,
316
+ "step": 6144
317
+ },
318
+ {
319
+ "epoch": 0.2955983557341462,
320
+ "grad_norm": 0.22078828513622284,
321
+ "learning_rate": 0.0008205359904536107,
322
+ "loss": 0.7707664966583252,
323
+ "step": 6400
324
+ },
325
+ {
326
+ "epoch": 0.30742228996351206,
327
+ "grad_norm": 0.21839560568332672,
328
+ "learning_rate": 0.0008057078912056363,
329
+ "loss": 0.7725305557250977,
330
+ "step": 6656
331
+ },
332
+ {
333
+ "epoch": 0.3192462241928779,
334
+ "grad_norm": 0.23544862866401672,
335
+ "learning_rate": 0.0007904373410796086,
336
+ "loss": 0.7722088694572449,
337
+ "step": 6912
338
+ },
339
+ {
340
+ "epoch": 0.3310701584222438,
341
+ "grad_norm": 0.23949392139911652,
342
+ "learning_rate": 0.0007747464411350876,
343
+ "loss": 0.7707281708717346,
344
+ "step": 7168
345
+ },
346
+ {
347
+ "epoch": 0.3310701584222438,
348
+ "eval_loss": 0.7320967807889529,
349
+ "eval_mse_loss": 0.7320967807889529,
350
+ "flow/cos_sim": 0.7774923427464211,
351
+ "flow/improvement_ratio": 0.48102973714538905,
352
+ "flow/mag_ratio_mean": 0.7606384374507486,
353
+ "flow/mag_ratio_std": 0.1749520302839475,
354
+ "step": 7168
355
+ },
356
+ {
357
+ "epoch": 0.3310701584222438,
358
+ "eval_loss": 0.7320967807889529,
359
+ "eval_mse_loss": 0.7320967807889529,
360
+ "eval_runtime": 101.0362,
361
+ "eval_samples_per_second": 277.059,
362
+ "eval_steps_per_second": 4.335,
363
+ "flow/cos_sim": 0.7774923427464211,
364
+ "flow/improvement_ratio": 0.48102973714538905,
365
+ "flow/mag_ratio_mean": 0.7606384374507486,
366
+ "flow/mag_ratio_std": 0.1749520302839475,
367
+ "step": 7168
368
+ },
369
+ {
370
+ "epoch": 0.34289409265160964,
371
+ "grad_norm": 0.21281319856643677,
372
+ "learning_rate": 0.000758657900803716,
373
+ "loss": 0.7705408930778503,
374
+ "step": 7424
375
+ },
376
+ {
377
+ "epoch": 0.3547180268809755,
378
+ "grad_norm": 0.2182183712720871,
379
+ "learning_rate": 0.000742195005021869,
380
+ "loss": 0.7691267728805542,
381
+ "step": 7680
382
+ },
383
+ {
384
+ "epoch": 0.3665419611103413,
385
+ "grad_norm": 0.20540344715118408,
386
+ "learning_rate": 0.0007253815805303786,
387
+ "loss": 0.7687782049179077,
388
+ "step": 7936
389
+ },
390
+ {
391
+ "epoch": 0.37836589533970716,
392
+ "grad_norm": 0.25242024660110474,
393
+ "learning_rate": 0.000708309515673374,
394
+ "loss": 0.7675922513008118,
395
+ "step": 8192
396
+ },
397
+ {
398
+ "epoch": 0.37836589533970716,
399
+ "eval_loss": 0.72715279745729,
400
+ "eval_mse_loss": 0.72715279745729,
401
+ "flow/cos_sim": 0.7793520910826992,
402
+ "flow/improvement_ratio": 0.47947503408612724,
403
+ "flow/mag_ratio_mean": 0.7625162698090349,
404
+ "flow/mag_ratio_std": 0.17530882001331408,
405
+ "step": 8192
406
+ },
407
+ {
408
+ "epoch": 0.37836589533970716,
409
+ "eval_loss": 0.72715279745729,
410
+ "eval_mse_loss": 0.72715279745729,
411
+ "eval_runtime": 100.656,
412
+ "eval_samples_per_second": 278.106,
413
+ "eval_steps_per_second": 4.351,
414
+ "flow/cos_sim": 0.7793520910826992,
415
+ "flow/improvement_ratio": 0.47947503408612724,
416
+ "flow/mag_ratio_mean": 0.7625162698090349,
417
+ "flow/mag_ratio_std": 0.17530882001331408,
418
+ "step": 8192
419
+ },
420
+ {
421
+ "epoch": 0.390189829569073,
422
+ "grad_norm": 0.18503619730472565,
423
+ "learning_rate": 0.0006908696365085842,
424
+ "loss": 0.7652720808982849,
425
+ "step": 8448
426
+ },
427
+ {
428
+ "epoch": 0.4020137637984389,
429
+ "grad_norm": 0.20311063528060913,
430
+ "learning_rate": 0.0006731535118143318,
431
+ "loss": 0.7671474814414978,
432
+ "step": 8704
433
+ },
434
+ {
435
+ "epoch": 0.41383769802780473,
436
+ "grad_norm": 0.17226819694042206,
437
+ "learning_rate": 0.0006551867821290267,
438
+ "loss": 0.7648245096206665,
439
+ "step": 8960
440
+ },
441
+ {
442
+ "epoch": 0.4256616322571706,
443
+ "grad_norm": 0.20406724512577057,
444
+ "learning_rate": 0.0006369954506915572,
445
+ "loss": 0.7627850770950317,
446
+ "step": 9216
447
+ },
448
+ {
449
+ "epoch": 0.4256616322571706,
450
+ "eval_loss": 0.7245442253541728,
451
+ "eval_mse_loss": 0.7245442253541728,
452
+ "flow/cos_sim": 0.7804642662610093,
453
+ "flow/improvement_ratio": 0.478297841671395,
454
+ "flow/mag_ratio_mean": 0.7637837749637969,
455
+ "flow/mag_ratio_std": 0.17519798711554646,
456
+ "step": 9216
457
+ },
458
+ {
459
+ "epoch": 0.4256616322571706,
460
+ "eval_loss": 0.7245442253541728,
461
+ "eval_mse_loss": 0.7245442253541728,
462
+ "eval_runtime": 100.5176,
463
+ "eval_samples_per_second": 278.488,
464
+ "eval_steps_per_second": 4.357,
465
+ "flow/cos_sim": 0.7804642662610093,
466
+ "flow/improvement_ratio": 0.478297841671395,
467
+ "flow/mag_ratio_mean": 0.7637837749637969,
468
+ "flow/mag_ratio_std": 0.17519798711554646,
469
+ "step": 9216
470
+ },
471
+ {
472
+ "epoch": 0.4374855664865364,
473
+ "grad_norm": 0.19549715518951416,
474
+ "learning_rate": 0.0006186058458068149,
475
+ "loss": 0.7646156549453735,
476
+ "step": 9472
477
+ },
478
+ {
479
+ "epoch": 0.44930950071590225,
480
+ "grad_norm": 0.22935132682323456,
481
+ "learning_rate": 0.0006000445827407526,
482
+ "loss": 0.7637522220611572,
483
+ "step": 9728
484
+ },
485
+ {
486
+ "epoch": 0.4611334349452681,
487
+ "grad_norm": 0.1892293095588684,
488
+ "learning_rate": 0.0005813385252001232,
489
+ "loss": 0.7614347338676453,
490
+ "step": 9984
491
+ },
492
+ {
493
+ "epoch": 0.472957369174634,
494
+ "grad_norm": 0.19596756994724274,
495
+ "learning_rate": 0.00056251474645265,
496
+ "loss": 0.7602492570877075,
497
+ "step": 10240
498
+ },
499
+ {
500
+ "epoch": 0.472957369174634,
501
+ "eval_loss": 0.7225635307564583,
502
+ "eval_mse_loss": 0.7225635307564583,
503
+ "flow/cos_sim": 0.781412697137763,
504
+ "flow/improvement_ratio": 0.48148406279958,
505
+ "flow/mag_ratio_mean": 0.7642885947336345,
506
+ "flow/mag_ratio_std": 0.17388889067657462,
507
+ "step": 10240
508
+ },
509
+ {
510
+ "epoch": 0.472957369174634,
511
+ "eval_loss": 0.7225635307564583,
512
+ "eval_mse_loss": 0.7225635307564583,
513
+ "eval_runtime": 100.4132,
514
+ "eval_samples_per_second": 278.778,
515
+ "eval_steps_per_second": 4.362,
516
+ "flow/cos_sim": 0.781412697137763,
517
+ "flow/improvement_ratio": 0.48148406279958,
518
+ "flow/mag_ratio_mean": 0.7642885947336345,
519
+ "flow/mag_ratio_std": 0.17388889067657462,
520
+ "step": 10240
521
+ },
522
+ {
523
+ "epoch": 0.48478130340399983,
524
+ "grad_norm": 0.2039279341697693,
525
+ "learning_rate": 0.0005436004901439003,
526
+ "loss": 0.7591882944107056,
527
+ "step": 10496
528
+ },
529
+ {
530
+ "epoch": 0.49660523763336567,
531
+ "grad_norm": 0.18449348211288452,
532
+ "learning_rate": 0.0005246973484120841,
533
+ "loss": 0.7581324577331543,
534
+ "step": 10752
535
+ },
536
+ {
537
+ "epoch": 0.5084291718627315,
538
+ "grad_norm": 0.18734613060951233,
539
+ "learning_rate": 0.0005056844377834413,
540
+ "loss": 0.7585782408714294,
541
+ "step": 11008
542
+ },
543
+ {
544
+ "epoch": 0.5202531060920974,
545
+ "grad_norm": 0.19812439382076263,
546
+ "learning_rate": 0.0004866633000708374,
547
+ "loss": 0.758400559425354,
548
+ "step": 11264
549
+ },
550
+ {
551
+ "epoch": 0.5202531060920974,
552
+ "eval_loss": 0.7193547954025878,
553
+ "eval_mse_loss": 0.7193547954025878,
554
+ "flow/cos_sim": 0.7829214582160183,
555
+ "flow/improvement_ratio": 0.4840309496747849,
556
+ "flow/mag_ratio_mean": 0.7646983984398515,
557
+ "flow/mag_ratio_std": 0.17315611838614015,
558
+ "step": 11264
559
+ },
560
+ {
561
+ "epoch": 0.5202531060920974,
562
+ "eval_loss": 0.7193547954025878,
563
+ "eval_mse_loss": 0.7193547954025878,
564
+ "eval_runtime": 100.2674,
565
+ "eval_samples_per_second": 279.183,
566
+ "eval_steps_per_second": 4.368,
567
+ "flow/cos_sim": 0.7829214582160183,
568
+ "flow/improvement_ratio": 0.4840309496747849,
569
+ "flow/mag_ratio_mean": 0.7646983984398515,
570
+ "flow/mag_ratio_std": 0.17315611838614015,
571
+ "step": 11264
572
+ },
573
+ {
574
+ "epoch": 0.5320770403214632,
575
+ "grad_norm": 0.20949678122997284,
576
+ "learning_rate": 0.00046766146455737116,
577
+ "loss": 0.756182074546814,
578
+ "step": 11520
579
+ },
580
+ {
581
+ "epoch": 0.5439009745508291,
582
+ "grad_norm": 0.20575636625289917,
583
+ "learning_rate": 0.00044870643259007823,
584
+ "loss": 0.7595986723899841,
585
+ "step": 11776
586
+ },
587
+ {
588
+ "epoch": 0.5557249087801949,
589
+ "grad_norm": 0.18551619350910187,
590
+ "learning_rate": 0.000429825637777245,
591
+ "loss": 0.7588385939598083,
592
+ "step": 12032
593
+ },
594
+ {
595
+ "epoch": 0.5675488430095608,
596
+ "grad_norm": 0.21281953155994415,
597
+ "learning_rate": 0.00041104640628376166,
598
+ "loss": 0.755641758441925,
599
+ "step": 12288
600
+ },
601
+ {
602
+ "epoch": 0.5675488430095608,
603
+ "eval_loss": 0.7185944062934074,
604
+ "eval_mse_loss": 0.7185944062934074,
605
+ "flow/cos_sim": 0.7829709806943048,
606
+ "flow/improvement_ratio": 0.4833299268598426,
607
+ "flow/mag_ratio_mean": 0.7666111965429837,
608
+ "flow/mag_ratio_std": 0.17369252120114897,
609
+ "step": 12288
610
+ },
611
+ {
612
+ "epoch": 0.5675488430095608,
613
+ "eval_loss": 0.7185944062934074,
614
+ "eval_mse_loss": 0.7185944062934074,
615
+ "eval_runtime": 100.2742,
616
+ "eval_samples_per_second": 279.165,
617
+ "eval_steps_per_second": 4.368,
618
+ "flow/cos_sim": 0.7829709806943048,
619
+ "flow/improvement_ratio": 0.4833299268598426,
620
+ "flow/mag_ratio_mean": 0.7666111965429837,
621
+ "flow/mag_ratio_std": 0.17369252120114897,
622
+ "step": 12288
623
+ },
624
+ {
625
+ "epoch": 0.5793727772389267,
626
+ "grad_norm": 0.1611846536397934,
627
+ "learning_rate": 0.00039239591728197724,
628
+ "loss": 0.7571735382080078,
629
+ "step": 12544
630
+ },
631
+ {
632
+ "epoch": 0.5911967114682924,
633
+ "grad_norm": 0.17743456363677979,
634
+ "learning_rate": 0.0003739011636152962,
635
+ "loss": 0.7581431269645691,
636
+ "step": 12800
637
+ },
638
+ {
639
+ "epoch": 0.6030206456976583,
640
+ "grad_norm": 0.17940299212932587,
641
+ "learning_rate": 0.0003556600555022182,
642
+ "loss": 0.7566176652908325,
643
+ "step": 13056
644
+ },
645
+ {
646
+ "epoch": 0.6148445799270241,
647
+ "grad_norm": 0.1930851936340332,
648
+ "learning_rate": 0.00033755594309981104,
649
+ "loss": 0.7547513246536255,
650
+ "step": 13312
651
+ },
652
+ {
653
+ "epoch": 0.6148445799270241,
654
+ "eval_loss": 0.7150395362888842,
655
+ "eval_mse_loss": 0.7150395362888842,
656
+ "flow/cos_sim": 0.7844073471685523,
657
+ "flow/improvement_ratio": 0.47775867540542394,
658
+ "flow/mag_ratio_mean": 0.7674100999146292,
659
+ "flow/mag_ratio_std": 0.1724472795039007,
660
+ "step": 13312
661
+ },
662
+ {
663
+ "epoch": 0.6148445799270241,
664
+ "eval_loss": 0.7150395362888842,
665
+ "eval_mse_loss": 0.7150395362888842,
666
+ "eval_runtime": 100.7945,
667
+ "eval_samples_per_second": 277.724,
668
+ "eval_steps_per_second": 4.345,
669
+ "flow/cos_sim": 0.7844073471685523,
670
+ "flow/improvement_ratio": 0.47775867540542394,
671
+ "flow/mag_ratio_mean": 0.7674100999146292,
672
+ "flow/mag_ratio_std": 0.1724472795039007,
673
+ "step": 13312
674
+ },
675
+ {
676
+ "epoch": 0.62666851415639,
677
+ "grad_norm": 0.17439058423042297,
678
+ "learning_rate": 0.000319686935899715,
679
+ "loss": 0.7525946497917175,
680
+ "step": 13568
681
+ },
682
+ {
683
+ "epoch": 0.6384924483857558,
684
+ "grad_norm": 0.16240930557250977,
685
+ "learning_rate": 0.0003020788957071146,
686
+ "loss": 0.7547707557678223,
687
+ "step": 13824
688
+ },
689
+ {
690
+ "epoch": 0.6503163826151217,
691
+ "grad_norm": 0.16282112896442413,
692
+ "learning_rate": 0.0002847573066297746,
693
+ "loss": 0.7539600729942322,
694
+ "step": 14080
695
+ },
696
+ {
697
+ "epoch": 0.6621403168444876,
698
+ "grad_norm": 0.19493776559829712,
699
+ "learning_rate": 0.0002677472381949038,
700
+ "loss": 0.753170371055603,
701
+ "step": 14336
702
+ },
703
+ {
704
+ "epoch": 0.6621403168444876,
705
+ "eval_loss": 0.7136020788319035,
706
+ "eval_mse_loss": 0.7136020788319035,
707
+ "flow/cos_sim": 0.7848518109757062,
708
+ "flow/improvement_ratio": 0.47987128467592477,
709
+ "flow/mag_ratio_mean": 0.7672100954404161,
710
+ "flow/mag_ratio_std": 0.17320666945280005,
711
+ "step": 14336
712
+ },
713
+ {
714
+ "epoch": 0.6621403168444876,
715
+ "eval_loss": 0.7136020788319035,
716
+ "eval_mse_loss": 0.7136020788319035,
717
+ "eval_runtime": 100.4201,
718
+ "eval_samples_per_second": 278.759,
719
+ "eval_steps_per_second": 4.362,
720
+ "flow/cos_sim": 0.7848518109757062,
721
+ "flow/improvement_ratio": 0.47987128467592477,
722
+ "flow/mag_ratio_mean": 0.7672100954404161,
723
+ "flow/mag_ratio_std": 0.17320666945280005,
724
+ "step": 14336
725
+ }
726
+ ],
727
+ "logging_steps": 256,
728
+ "max_steps": 21651,
729
+ "num_input_tokens_seen": 0,
730
+ "num_train_epochs": 1,
731
+ "save_steps": 1024,
732
+ "stateful_callbacks": {
733
+ "TrainerControl": {
734
+ "args": {
735
+ "should_epoch_stop": false,
736
+ "should_evaluate": false,
737
+ "should_log": false,
738
+ "should_save": true,
739
+ "should_training_stop": false
740
+ },
741
+ "attributes": {}
742
+ }
743
+ },
744
+ "total_flos": 0.0,
745
+ "train_batch_size": 64,
746
+ "trial_name": null,
747
+ "trial_params": null
748
+ }
checkpoints-v1/checkpoint-14336/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:03a8c5803c6e5663005c9dc7a4c98c213c8267649ccec42581475db390cc8017
3
+ size 5137