Plofski commited on
Commit
f1b9a74
·
verified ·
1 Parent(s): 50052b6

Training in progress, step 4000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:7f96aaa5e97f3f83387afc0775efd5e922752a17138c7276a9efe7c9ff0bbeee
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:17ab6fbe9c97d82ef7dac860e0afd63f233555e8f23a9fd5286c2c92aa0de809
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:451881f3cab07a4e85e5f970801619f2d6aa94fada708d3b827ca3fafa636054
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a2fdccc0924c16c14bbca889730272d2d9adcc2fdeb5cc2188b22634e6a65ba6
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:cae1361ad95b650252f8194ff20a5669981349cd4f0f59f3528fb4497ea319b8
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b9d0e16227a53d102f718b321b6ebc380604ad5e862513fc6df0711cea1a67f
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 0.7052186177715092,
6
  "eval_steps": 500,
7
- "global_step": 3500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -3158,6 +3158,456 @@
3158
  "mean_token_accuracy": 0.7891253709793091,
3159
  "num_tokens": 3879065.0,
3160
  "step": 3500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3161
  }
3162
  ],
3163
  "logging_steps": 10,
@@ -3177,7 +3627,7 @@
3177
  "attributes": {}
3178
  }
3179
  },
3180
- "total_flos": 4699418269335552.0,
3181
  "train_batch_size": 8,
3182
  "trial_name": null,
3183
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 0.8059641345960105,
6
  "eval_steps": 500,
7
+ "global_step": 4000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
3158
  "mean_token_accuracy": 0.7891253709793091,
3159
  "num_tokens": 3879065.0,
3160
  "step": 3500
3161
+ },
3162
+ {
3163
+ "epoch": 0.7072335281079992,
3164
+ "grad_norm": 12.25,
3165
+ "learning_rate": 1.5286453086171e-05,
3166
+ "loss": 1.021,
3167
+ "mean_token_accuracy": 0.7534075140953064,
3168
+ "num_tokens": 3890409.0,
3169
+ "step": 3510
3170
+ },
3171
+ {
3172
+ "epoch": 0.7092484384444893,
3173
+ "grad_norm": 11.0625,
3174
+ "learning_rate": 1.52730203505944e-05,
3175
+ "loss": 0.9314,
3176
+ "mean_token_accuracy": 0.7765843331813812,
3177
+ "num_tokens": 3902208.0,
3178
+ "step": 3520
3179
+ },
3180
+ {
3181
+ "epoch": 0.7112633487809793,
3182
+ "grad_norm": 9.875,
3183
+ "learning_rate": 1.52595876150178e-05,
3184
+ "loss": 0.7786,
3185
+ "mean_token_accuracy": 0.8059293925762177,
3186
+ "num_tokens": 3913221.0,
3187
+ "step": 3530
3188
+ },
3189
+ {
3190
+ "epoch": 0.7132782591174692,
3191
+ "grad_norm": 13.1875,
3192
+ "learning_rate": 1.52461548794412e-05,
3193
+ "loss": 0.8675,
3194
+ "mean_token_accuracy": 0.7885148406028748,
3195
+ "num_tokens": 3923202.0,
3196
+ "step": 3540
3197
+ },
3198
+ {
3199
+ "epoch": 0.7152931694539593,
3200
+ "grad_norm": 11.625,
3201
+ "learning_rate": 1.52327221438646e-05,
3202
+ "loss": 0.945,
3203
+ "mean_token_accuracy": 0.7707227051258088,
3204
+ "num_tokens": 3933469.0,
3205
+ "step": 3550
3206
+ },
3207
+ {
3208
+ "epoch": 0.7173080797904493,
3209
+ "grad_norm": 11.75,
3210
+ "learning_rate": 1.5219289408287999e-05,
3211
+ "loss": 0.8829,
3212
+ "mean_token_accuracy": 0.7817340910434722,
3213
+ "num_tokens": 3944523.0,
3214
+ "step": 3560
3215
+ },
3216
+ {
3217
+ "epoch": 0.7193229901269393,
3218
+ "grad_norm": 13.5625,
3219
+ "learning_rate": 1.52058566727114e-05,
3220
+ "loss": 0.9075,
3221
+ "mean_token_accuracy": 0.7843019485473632,
3222
+ "num_tokens": 3955650.0,
3223
+ "step": 3570
3224
+ },
3225
+ {
3226
+ "epoch": 0.7213379004634294,
3227
+ "grad_norm": 11.25,
3228
+ "learning_rate": 1.51924239371348e-05,
3229
+ "loss": 0.9087,
3230
+ "mean_token_accuracy": 0.7732051312923431,
3231
+ "num_tokens": 3967014.0,
3232
+ "step": 3580
3233
+ },
3234
+ {
3235
+ "epoch": 0.7233528107999194,
3236
+ "grad_norm": 11.0625,
3237
+ "learning_rate": 1.5178991201558197e-05,
3238
+ "loss": 0.8595,
3239
+ "mean_token_accuracy": 0.7891602039337158,
3240
+ "num_tokens": 3977669.0,
3241
+ "step": 3590
3242
+ },
3243
+ {
3244
+ "epoch": 0.7253677211364095,
3245
+ "grad_norm": 9.4375,
3246
+ "learning_rate": 1.5165558465981597e-05,
3247
+ "loss": 0.8617,
3248
+ "mean_token_accuracy": 0.7840434491634369,
3249
+ "num_tokens": 3989279.0,
3250
+ "step": 3600
3251
+ },
3252
+ {
3253
+ "epoch": 0.7273826314728995,
3254
+ "grad_norm": 11.3125,
3255
+ "learning_rate": 1.5152125730404998e-05,
3256
+ "loss": 0.8872,
3257
+ "mean_token_accuracy": 0.7797149956226349,
3258
+ "num_tokens": 4000448.0,
3259
+ "step": 3610
3260
+ },
3261
+ {
3262
+ "epoch": 0.7293975418093894,
3263
+ "grad_norm": 9.375,
3264
+ "learning_rate": 1.5138692994828398e-05,
3265
+ "loss": 0.8267,
3266
+ "mean_token_accuracy": 0.7977706253528595,
3267
+ "num_tokens": 4010787.0,
3268
+ "step": 3620
3269
+ },
3270
+ {
3271
+ "epoch": 0.7314124521458795,
3272
+ "grad_norm": 11.5625,
3273
+ "learning_rate": 1.5125260259251797e-05,
3274
+ "loss": 0.9227,
3275
+ "mean_token_accuracy": 0.7788041710853577,
3276
+ "num_tokens": 4021823.0,
3277
+ "step": 3630
3278
+ },
3279
+ {
3280
+ "epoch": 0.7334273624823695,
3281
+ "grad_norm": 9.1875,
3282
+ "learning_rate": 1.5111827523675198e-05,
3283
+ "loss": 0.988,
3284
+ "mean_token_accuracy": 0.7647354364395141,
3285
+ "num_tokens": 4034278.0,
3286
+ "step": 3640
3287
+ },
3288
+ {
3289
+ "epoch": 0.7354422728188595,
3290
+ "grad_norm": 10.8125,
3291
+ "learning_rate": 1.5098394788098598e-05,
3292
+ "loss": 1.0355,
3293
+ "mean_token_accuracy": 0.7544133722782135,
3294
+ "num_tokens": 4045371.0,
3295
+ "step": 3650
3296
+ },
3297
+ {
3298
+ "epoch": 0.7374571831553496,
3299
+ "grad_norm": 11.875,
3300
+ "learning_rate": 1.5084962052521997e-05,
3301
+ "loss": 0.8856,
3302
+ "mean_token_accuracy": 0.7889864265918731,
3303
+ "num_tokens": 4056216.0,
3304
+ "step": 3660
3305
+ },
3306
+ {
3307
+ "epoch": 0.7394720934918396,
3308
+ "grad_norm": 11.0625,
3309
+ "learning_rate": 1.5071529316945398e-05,
3310
+ "loss": 0.942,
3311
+ "mean_token_accuracy": 0.7709968864917756,
3312
+ "num_tokens": 4066100.0,
3313
+ "step": 3670
3314
+ },
3315
+ {
3316
+ "epoch": 0.7414870038283297,
3317
+ "grad_norm": 10.5625,
3318
+ "learning_rate": 1.5058096581368798e-05,
3319
+ "loss": 0.8474,
3320
+ "mean_token_accuracy": 0.7917793452739715,
3321
+ "num_tokens": 4076638.0,
3322
+ "step": 3680
3323
+ },
3324
+ {
3325
+ "epoch": 0.7435019141648197,
3326
+ "grad_norm": 12.625,
3327
+ "learning_rate": 1.5044663845792197e-05,
3328
+ "loss": 0.8937,
3329
+ "mean_token_accuracy": 0.779036569595337,
3330
+ "num_tokens": 4088398.0,
3331
+ "step": 3690
3332
+ },
3333
+ {
3334
+ "epoch": 0.7455168245013097,
3335
+ "grad_norm": 13.0625,
3336
+ "learning_rate": 1.5031231110215596e-05,
3337
+ "loss": 0.9305,
3338
+ "mean_token_accuracy": 0.7710303366184235,
3339
+ "num_tokens": 4100854.0,
3340
+ "step": 3700
3341
+ },
3342
+ {
3343
+ "epoch": 0.7475317348377997,
3344
+ "grad_norm": 10.375,
3345
+ "learning_rate": 1.5017798374638996e-05,
3346
+ "loss": 0.9195,
3347
+ "mean_token_accuracy": 0.7792443215847016,
3348
+ "num_tokens": 4113144.0,
3349
+ "step": 3710
3350
+ },
3351
+ {
3352
+ "epoch": 0.7495466451742897,
3353
+ "grad_norm": 10.25,
3354
+ "learning_rate": 1.5004365639062397e-05,
3355
+ "loss": 0.8205,
3356
+ "mean_token_accuracy": 0.8025223255157471,
3357
+ "num_tokens": 4124241.0,
3358
+ "step": 3720
3359
+ },
3360
+ {
3361
+ "epoch": 0.7515615555107797,
3362
+ "grad_norm": 13.375,
3363
+ "learning_rate": 1.4990932903485796e-05,
3364
+ "loss": 0.7566,
3365
+ "mean_token_accuracy": 0.8099412024021149,
3366
+ "num_tokens": 4134131.0,
3367
+ "step": 3730
3368
+ },
3369
+ {
3370
+ "epoch": 0.7535764658472698,
3371
+ "grad_norm": 9.3125,
3372
+ "learning_rate": 1.4977500167909196e-05,
3373
+ "loss": 0.8882,
3374
+ "mean_token_accuracy": 0.7791573405265808,
3375
+ "num_tokens": 4144499.0,
3376
+ "step": 3740
3377
+ },
3378
+ {
3379
+ "epoch": 0.7555913761837598,
3380
+ "grad_norm": 13.4375,
3381
+ "learning_rate": 1.4964067432332597e-05,
3382
+ "loss": 0.8661,
3383
+ "mean_token_accuracy": 0.7914558589458466,
3384
+ "num_tokens": 4155442.0,
3385
+ "step": 3750
3386
+ },
3387
+ {
3388
+ "epoch": 0.7576062865202499,
3389
+ "grad_norm": 13.5625,
3390
+ "learning_rate": 1.4950634696755994e-05,
3391
+ "loss": 0.8986,
3392
+ "mean_token_accuracy": 0.7791661143302917,
3393
+ "num_tokens": 4165905.0,
3394
+ "step": 3760
3395
+ },
3396
+ {
3397
+ "epoch": 0.7596211968567399,
3398
+ "grad_norm": 10.875,
3399
+ "learning_rate": 1.4937201961179395e-05,
3400
+ "loss": 0.9857,
3401
+ "mean_token_accuracy": 0.7646209299564362,
3402
+ "num_tokens": 4177252.0,
3403
+ "step": 3770
3404
+ },
3405
+ {
3406
+ "epoch": 0.7616361071932299,
3407
+ "grad_norm": 13.1875,
3408
+ "learning_rate": 1.4923769225602795e-05,
3409
+ "loss": 0.8163,
3410
+ "mean_token_accuracy": 0.8001804709434509,
3411
+ "num_tokens": 4187603.0,
3412
+ "step": 3780
3413
+ },
3414
+ {
3415
+ "epoch": 0.76365101752972,
3416
+ "grad_norm": 12.5625,
3417
+ "learning_rate": 1.4910336490026196e-05,
3418
+ "loss": 0.8719,
3419
+ "mean_token_accuracy": 0.793983542919159,
3420
+ "num_tokens": 4198158.0,
3421
+ "step": 3790
3422
+ },
3423
+ {
3424
+ "epoch": 0.7656659278662099,
3425
+ "grad_norm": 11.625,
3426
+ "learning_rate": 1.4896903754449594e-05,
3427
+ "loss": 0.8003,
3428
+ "mean_token_accuracy": 0.8059770345687867,
3429
+ "num_tokens": 4209371.0,
3430
+ "step": 3800
3431
+ },
3432
+ {
3433
+ "epoch": 0.7676808382026999,
3434
+ "grad_norm": 11.375,
3435
+ "learning_rate": 1.4883471018872995e-05,
3436
+ "loss": 0.8484,
3437
+ "mean_token_accuracy": 0.791538542509079,
3438
+ "num_tokens": 4220051.0,
3439
+ "step": 3810
3440
+ },
3441
+ {
3442
+ "epoch": 0.76969574853919,
3443
+ "grad_norm": 11.4375,
3444
+ "learning_rate": 1.4870038283296395e-05,
3445
+ "loss": 0.8216,
3446
+ "mean_token_accuracy": 0.7945187032222748,
3447
+ "num_tokens": 4230922.0,
3448
+ "step": 3820
3449
+ },
3450
+ {
3451
+ "epoch": 0.77171065887568,
3452
+ "grad_norm": 10.1875,
3453
+ "learning_rate": 1.4856605547719794e-05,
3454
+ "loss": 0.8319,
3455
+ "mean_token_accuracy": 0.7939063310623169,
3456
+ "num_tokens": 4242793.0,
3457
+ "step": 3830
3458
+ },
3459
+ {
3460
+ "epoch": 0.7737255692121701,
3461
+ "grad_norm": 14.125,
3462
+ "learning_rate": 1.4843172812143193e-05,
3463
+ "loss": 0.8577,
3464
+ "mean_token_accuracy": 0.7900285601615906,
3465
+ "num_tokens": 4253881.0,
3466
+ "step": 3840
3467
+ },
3468
+ {
3469
+ "epoch": 0.7757404795486601,
3470
+ "grad_norm": 10.875,
3471
+ "learning_rate": 1.4829740076566594e-05,
3472
+ "loss": 0.836,
3473
+ "mean_token_accuracy": 0.7931070744991302,
3474
+ "num_tokens": 4266304.0,
3475
+ "step": 3850
3476
+ },
3477
+ {
3478
+ "epoch": 0.7777553898851501,
3479
+ "grad_norm": 11.125,
3480
+ "learning_rate": 1.4816307340989994e-05,
3481
+ "loss": 1.0042,
3482
+ "mean_token_accuracy": 0.7616709470748901,
3483
+ "num_tokens": 4276817.0,
3484
+ "step": 3860
3485
+ },
3486
+ {
3487
+ "epoch": 0.7797703002216402,
3488
+ "grad_norm": 12.0625,
3489
+ "learning_rate": 1.4802874605413393e-05,
3490
+ "loss": 0.7827,
3491
+ "mean_token_accuracy": 0.8023504674434662,
3492
+ "num_tokens": 4286833.0,
3493
+ "step": 3870
3494
+ },
3495
+ {
3496
+ "epoch": 0.7817852105581302,
3497
+ "grad_norm": 12.125,
3498
+ "learning_rate": 1.4789441869836794e-05,
3499
+ "loss": 0.8489,
3500
+ "mean_token_accuracy": 0.7849018990993499,
3501
+ "num_tokens": 4297516.0,
3502
+ "step": 3880
3503
+ },
3504
+ {
3505
+ "epoch": 0.7838001208946201,
3506
+ "grad_norm": 11.625,
3507
+ "learning_rate": 1.4776009134260194e-05,
3508
+ "loss": 0.8809,
3509
+ "mean_token_accuracy": 0.7819288611412049,
3510
+ "num_tokens": 4309049.0,
3511
+ "step": 3890
3512
+ },
3513
+ {
3514
+ "epoch": 0.7858150312311102,
3515
+ "grad_norm": 10.625,
3516
+ "learning_rate": 1.4762576398683593e-05,
3517
+ "loss": 0.9198,
3518
+ "mean_token_accuracy": 0.7767218172550201,
3519
+ "num_tokens": 4320154.0,
3520
+ "step": 3900
3521
+ },
3522
+ {
3523
+ "epoch": 0.7878299415676002,
3524
+ "grad_norm": 12.0625,
3525
+ "learning_rate": 1.4749143663106993e-05,
3526
+ "loss": 0.9142,
3527
+ "mean_token_accuracy": 0.7742327690124512,
3528
+ "num_tokens": 4334166.0,
3529
+ "step": 3910
3530
+ },
3531
+ {
3532
+ "epoch": 0.7898448519040903,
3533
+ "grad_norm": 16.0,
3534
+ "learning_rate": 1.4735710927530394e-05,
3535
+ "loss": 0.8259,
3536
+ "mean_token_accuracy": 0.798123425245285,
3537
+ "num_tokens": 4344500.0,
3538
+ "step": 3920
3539
+ },
3540
+ {
3541
+ "epoch": 0.7918597622405803,
3542
+ "grad_norm": 12.1875,
3543
+ "learning_rate": 1.4722278191953791e-05,
3544
+ "loss": 0.8897,
3545
+ "mean_token_accuracy": 0.7863348364830017,
3546
+ "num_tokens": 4355554.0,
3547
+ "step": 3930
3548
+ },
3549
+ {
3550
+ "epoch": 0.7938746725770703,
3551
+ "grad_norm": 10.3125,
3552
+ "learning_rate": 1.4708845456377192e-05,
3553
+ "loss": 0.8904,
3554
+ "mean_token_accuracy": 0.7880643427371978,
3555
+ "num_tokens": 4365823.0,
3556
+ "step": 3940
3557
+ },
3558
+ {
3559
+ "epoch": 0.7958895829135604,
3560
+ "grad_norm": 15.375,
3561
+ "learning_rate": 1.4695412720800592e-05,
3562
+ "loss": 0.8622,
3563
+ "mean_token_accuracy": 0.7930482983589172,
3564
+ "num_tokens": 4377033.0,
3565
+ "step": 3950
3566
+ },
3567
+ {
3568
+ "epoch": 0.7979044932500504,
3569
+ "grad_norm": 11.75,
3570
+ "learning_rate": 1.4681979985223993e-05,
3571
+ "loss": 0.9426,
3572
+ "mean_token_accuracy": 0.7710152387619018,
3573
+ "num_tokens": 4387397.0,
3574
+ "step": 3960
3575
+ },
3576
+ {
3577
+ "epoch": 0.7999194035865403,
3578
+ "grad_norm": 10.5625,
3579
+ "learning_rate": 1.4668547249647392e-05,
3580
+ "loss": 0.8042,
3581
+ "mean_token_accuracy": 0.8007622241973877,
3582
+ "num_tokens": 4397683.0,
3583
+ "step": 3970
3584
+ },
3585
+ {
3586
+ "epoch": 0.8019343139230304,
3587
+ "grad_norm": 9.75,
3588
+ "learning_rate": 1.4655114514070792e-05,
3589
+ "loss": 0.8862,
3590
+ "mean_token_accuracy": 0.7830459952354432,
3591
+ "num_tokens": 4408500.0,
3592
+ "step": 3980
3593
+ },
3594
+ {
3595
+ "epoch": 0.8039492242595204,
3596
+ "grad_norm": 13.375,
3597
+ "learning_rate": 1.4641681778494193e-05,
3598
+ "loss": 0.9356,
3599
+ "mean_token_accuracy": 0.7747329294681549,
3600
+ "num_tokens": 4419148.0,
3601
+ "step": 3990
3602
+ },
3603
+ {
3604
+ "epoch": 0.8059641345960105,
3605
+ "grad_norm": 13.0625,
3606
+ "learning_rate": 1.462824904291759e-05,
3607
+ "loss": 0.9006,
3608
+ "mean_token_accuracy": 0.7772108554840088,
3609
+ "num_tokens": 4430041.0,
3610
+ "step": 4000
3611
  }
3612
  ],
3613
  "logging_steps": 10,
 
3627
  "attributes": {}
3628
  }
3629
  },
3630
+ "total_flos": 5359648531077120.0,
3631
  "train_batch_size": 8,
3632
  "trial_name": null,
3633
  "trial_params": null