Plofski commited on
Commit
58a616c
·
verified ·
1 Parent(s): 897c7d8

Training in progress, step 8500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:872e13706948c7a141e635bc023a52fbe531ae28f59acde5c4f237db2a94c6b1
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16ca55f673b3ad7e95262a9d0296f5d8f2b7edb92a87d108841a282630107b61
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c368469d799ee657aa6f345b72b1b063d1207badee5ef2708584fc5b29dd1fa0
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0d62e7293944847d060dba9b65b5eb64216e79d1a688e81c3a95ed8977d7ce35
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6f8c5daae46e22d0555f52515cb826d70a09c178d27140188b1fd68ded8645a9
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e3aeebf16be5d93156c95c5c47fce9ca30893837ac7097fcc26a2ec8d4dc9f51
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.611928269192021,
6
  "eval_steps": 500,
7
- "global_step": 8000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -7208,6 +7208,456 @@
7208
  "mean_token_accuracy": 0.7778131783008575,
7209
  "num_tokens": 8860114.0,
7210
  "step": 8000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7211
  }
7212
  ],
7213
  "logging_steps": 10,
@@ -7227,7 +7677,7 @@
7227
  "attributes": {}
7228
  }
7229
  },
7230
- "total_flos": 1.0725865607073792e+16,
7231
  "train_batch_size": 8,
7232
  "trial_name": null,
7233
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.7126737860165222,
6
  "eval_steps": 500,
7
+ "global_step": 8500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
7208
  "mean_token_accuracy": 0.7778131783008575,
7209
  "num_tokens": 8860114.0,
7210
  "step": 8000
7211
+ },
7212
+ {
7213
+ "epoch": 1.613943179528511,
7214
+ "grad_norm": 12.875,
7215
+ "learning_rate": 9.24172207670092e-06,
7216
+ "loss": 0.9062,
7217
+ "mean_token_accuracy": 0.7869289875030517,
7218
+ "num_tokens": 8870296.0,
7219
+ "step": 8010
7220
+ },
7221
+ {
7222
+ "epoch": 1.615958089865001,
7223
+ "grad_norm": 11.0625,
7224
+ "learning_rate": 9.22828934112432e-06,
7225
+ "loss": 0.7193,
7226
+ "mean_token_accuracy": 0.8147738158702851,
7227
+ "num_tokens": 8880957.0,
7228
+ "step": 8020
7229
+ },
7230
+ {
7231
+ "epoch": 1.617973000201491,
7232
+ "grad_norm": 11.625,
7233
+ "learning_rate": 9.21485660554772e-06,
7234
+ "loss": 0.9484,
7235
+ "mean_token_accuracy": 0.7725982308387757,
7236
+ "num_tokens": 8892907.0,
7237
+ "step": 8030
7238
+ },
7239
+ {
7240
+ "epoch": 1.619987910537981,
7241
+ "grad_norm": 11.0625,
7242
+ "learning_rate": 9.20142386997112e-06,
7243
+ "loss": 0.7605,
7244
+ "mean_token_accuracy": 0.8034783184528351,
7245
+ "num_tokens": 8904461.0,
7246
+ "step": 8040
7247
+ },
7248
+ {
7249
+ "epoch": 1.6220028208744712,
7250
+ "grad_norm": 11.8125,
7251
+ "learning_rate": 9.187991134394521e-06,
7252
+ "loss": 0.8351,
7253
+ "mean_token_accuracy": 0.7964996695518494,
7254
+ "num_tokens": 8915489.0,
7255
+ "step": 8050
7256
+ },
7257
+ {
7258
+ "epoch": 1.6240177312109612,
7259
+ "grad_norm": 10.625,
7260
+ "learning_rate": 9.17455839881792e-06,
7261
+ "loss": 0.8234,
7262
+ "mean_token_accuracy": 0.7960014402866363,
7263
+ "num_tokens": 8927916.0,
7264
+ "step": 8060
7265
+ },
7266
+ {
7267
+ "epoch": 1.626032641547451,
7268
+ "grad_norm": 11.0,
7269
+ "learning_rate": 9.16112566324132e-06,
7270
+ "loss": 0.8144,
7271
+ "mean_token_accuracy": 0.7940633356571197,
7272
+ "num_tokens": 8938931.0,
7273
+ "step": 8070
7274
+ },
7275
+ {
7276
+ "epoch": 1.6280475518839412,
7277
+ "grad_norm": 10.4375,
7278
+ "learning_rate": 9.14769292766472e-06,
7279
+ "loss": 0.7865,
7280
+ "mean_token_accuracy": 0.806914460659027,
7281
+ "num_tokens": 8949201.0,
7282
+ "step": 8080
7283
+ },
7284
+ {
7285
+ "epoch": 1.6300624622204312,
7286
+ "grad_norm": 14.4375,
7287
+ "learning_rate": 9.13426019208812e-06,
7288
+ "loss": 0.8536,
7289
+ "mean_token_accuracy": 0.787992262840271,
7290
+ "num_tokens": 8960524.0,
7291
+ "step": 8090
7292
+ },
7293
+ {
7294
+ "epoch": 1.632077372556921,
7295
+ "grad_norm": 9.75,
7296
+ "learning_rate": 9.12082745651152e-06,
7297
+ "loss": 0.8154,
7298
+ "mean_token_accuracy": 0.7935736238956451,
7299
+ "num_tokens": 8972192.0,
7300
+ "step": 8100
7301
+ },
7302
+ {
7303
+ "epoch": 1.6340922828934112,
7304
+ "grad_norm": 13.375,
7305
+ "learning_rate": 9.107394720934919e-06,
7306
+ "loss": 0.782,
7307
+ "mean_token_accuracy": 0.802595990896225,
7308
+ "num_tokens": 8983325.0,
7309
+ "step": 8110
7310
+ },
7311
+ {
7312
+ "epoch": 1.6361071932299014,
7313
+ "grad_norm": 14.875,
7314
+ "learning_rate": 9.09396198535832e-06,
7315
+ "loss": 0.8885,
7316
+ "mean_token_accuracy": 0.779900997877121,
7317
+ "num_tokens": 8994056.0,
7318
+ "step": 8120
7319
+ },
7320
+ {
7321
+ "epoch": 1.6381221035663913,
7322
+ "grad_norm": 12.375,
7323
+ "learning_rate": 9.080529249781718e-06,
7324
+ "loss": 0.7469,
7325
+ "mean_token_accuracy": 0.8087693631649018,
7326
+ "num_tokens": 9004649.0,
7327
+ "step": 8130
7328
+ },
7329
+ {
7330
+ "epoch": 1.6401370139028812,
7331
+ "grad_norm": 12.375,
7332
+ "learning_rate": 9.067096514205117e-06,
7333
+ "loss": 0.9413,
7334
+ "mean_token_accuracy": 0.7759244620800019,
7335
+ "num_tokens": 9017546.0,
7336
+ "step": 8140
7337
+ },
7338
+ {
7339
+ "epoch": 1.6421519242393714,
7340
+ "grad_norm": 11.4375,
7341
+ "learning_rate": 9.053663778628518e-06,
7342
+ "loss": 0.8086,
7343
+ "mean_token_accuracy": 0.7987602353096008,
7344
+ "num_tokens": 9028816.0,
7345
+ "step": 8150
7346
+ },
7347
+ {
7348
+ "epoch": 1.6441668345758613,
7349
+ "grad_norm": 13.0,
7350
+ "learning_rate": 9.040231043051918e-06,
7351
+ "loss": 0.8092,
7352
+ "mean_token_accuracy": 0.8002450168132782,
7353
+ "num_tokens": 9040241.0,
7354
+ "step": 8160
7355
+ },
7356
+ {
7357
+ "epoch": 1.6461817449123513,
7358
+ "grad_norm": 11.875,
7359
+ "learning_rate": 9.026798307475319e-06,
7360
+ "loss": 0.8056,
7361
+ "mean_token_accuracy": 0.7970556557178498,
7362
+ "num_tokens": 9050944.0,
7363
+ "step": 8170
7364
+ },
7365
+ {
7366
+ "epoch": 1.6481966552488414,
7367
+ "grad_norm": 11.25,
7368
+ "learning_rate": 9.013365571898718e-06,
7369
+ "loss": 0.885,
7370
+ "mean_token_accuracy": 0.7855951130390167,
7371
+ "num_tokens": 9062338.0,
7372
+ "step": 8180
7373
+ },
7374
+ {
7375
+ "epoch": 1.6502115655853316,
7376
+ "grad_norm": 12.0625,
7377
+ "learning_rate": 8.999932836322117e-06,
7378
+ "loss": 0.8499,
7379
+ "mean_token_accuracy": 0.7869492530822754,
7380
+ "num_tokens": 9073217.0,
7381
+ "step": 8190
7382
+ },
7383
+ {
7384
+ "epoch": 1.6522264759218215,
7385
+ "grad_norm": 10.625,
7386
+ "learning_rate": 8.986500100745517e-06,
7387
+ "loss": 0.725,
7388
+ "mean_token_accuracy": 0.816201251745224,
7389
+ "num_tokens": 9084540.0,
7390
+ "step": 8200
7391
+ },
7392
+ {
7393
+ "epoch": 1.6542413862583114,
7394
+ "grad_norm": 14.625,
7395
+ "learning_rate": 8.973067365168918e-06,
7396
+ "loss": 0.8659,
7397
+ "mean_token_accuracy": 0.7861015915870666,
7398
+ "num_tokens": 9095828.0,
7399
+ "step": 8210
7400
+ },
7401
+ {
7402
+ "epoch": 1.6562562965948016,
7403
+ "grad_norm": 11.375,
7404
+ "learning_rate": 8.959634629592318e-06,
7405
+ "loss": 0.8543,
7406
+ "mean_token_accuracy": 0.7877636075019836,
7407
+ "num_tokens": 9105269.0,
7408
+ "step": 8220
7409
+ },
7410
+ {
7411
+ "epoch": 1.6582712069312917,
7412
+ "grad_norm": 11.8125,
7413
+ "learning_rate": 8.946201894015717e-06,
7414
+ "loss": 0.7655,
7415
+ "mean_token_accuracy": 0.8078620612621308,
7416
+ "num_tokens": 9115722.0,
7417
+ "step": 8230
7418
+ },
7419
+ {
7420
+ "epoch": 1.6602861172677816,
7421
+ "grad_norm": 12.1875,
7422
+ "learning_rate": 8.932769158439118e-06,
7423
+ "loss": 0.8754,
7424
+ "mean_token_accuracy": 0.7780845940113068,
7425
+ "num_tokens": 9126931.0,
7426
+ "step": 8240
7427
+ },
7428
+ {
7429
+ "epoch": 1.6623010276042716,
7430
+ "grad_norm": 14.0625,
7431
+ "learning_rate": 8.919336422862516e-06,
7432
+ "loss": 0.8186,
7433
+ "mean_token_accuracy": 0.7980137884616851,
7434
+ "num_tokens": 9137118.0,
7435
+ "step": 8250
7436
+ },
7437
+ {
7438
+ "epoch": 1.6643159379407617,
7439
+ "grad_norm": 11.3125,
7440
+ "learning_rate": 8.905903687285917e-06,
7441
+ "loss": 0.8233,
7442
+ "mean_token_accuracy": 0.7963060855865478,
7443
+ "num_tokens": 9148350.0,
7444
+ "step": 8260
7445
+ },
7446
+ {
7447
+ "epoch": 1.6663308482772516,
7448
+ "grad_norm": 12.25,
7449
+ "learning_rate": 8.892470951709317e-06,
7450
+ "loss": 0.8238,
7451
+ "mean_token_accuracy": 0.8000846326351165,
7452
+ "num_tokens": 9159385.0,
7453
+ "step": 8270
7454
+ },
7455
+ {
7456
+ "epoch": 1.6683457586137416,
7457
+ "grad_norm": 11.0,
7458
+ "learning_rate": 8.879038216132716e-06,
7459
+ "loss": 0.8972,
7460
+ "mean_token_accuracy": 0.7827898025512695,
7461
+ "num_tokens": 9170903.0,
7462
+ "step": 8280
7463
+ },
7464
+ {
7465
+ "epoch": 1.6703606689502317,
7466
+ "grad_norm": 11.625,
7467
+ "learning_rate": 8.865605480556117e-06,
7468
+ "loss": 0.7794,
7469
+ "mean_token_accuracy": 0.8025726079940796,
7470
+ "num_tokens": 9181737.0,
7471
+ "step": 8290
7472
+ },
7473
+ {
7474
+ "epoch": 1.6723755792867219,
7475
+ "grad_norm": 10.75,
7476
+ "learning_rate": 8.852172744979516e-06,
7477
+ "loss": 0.7777,
7478
+ "mean_token_accuracy": 0.8053012132644654,
7479
+ "num_tokens": 9193448.0,
7480
+ "step": 8300
7481
+ },
7482
+ {
7483
+ "epoch": 1.6743904896232118,
7484
+ "grad_norm": 11.4375,
7485
+ "learning_rate": 8.838740009402914e-06,
7486
+ "loss": 0.7605,
7487
+ "mean_token_accuracy": 0.8079525053501129,
7488
+ "num_tokens": 9204071.0,
7489
+ "step": 8310
7490
+ },
7491
+ {
7492
+ "epoch": 1.6764053999597017,
7493
+ "grad_norm": 11.5,
7494
+ "learning_rate": 8.825307273826315e-06,
7495
+ "loss": 0.9171,
7496
+ "mean_token_accuracy": 0.7748861670494079,
7497
+ "num_tokens": 9214504.0,
7498
+ "step": 8320
7499
+ },
7500
+ {
7501
+ "epoch": 1.6784203102961919,
7502
+ "grad_norm": 9.8125,
7503
+ "learning_rate": 8.811874538249716e-06,
7504
+ "loss": 0.8916,
7505
+ "mean_token_accuracy": 0.7793697714805603,
7506
+ "num_tokens": 9226284.0,
7507
+ "step": 8330
7508
+ },
7509
+ {
7510
+ "epoch": 1.6804352206326818,
7511
+ "grad_norm": 11.1875,
7512
+ "learning_rate": 8.798441802673116e-06,
7513
+ "loss": 0.7674,
7514
+ "mean_token_accuracy": 0.8027099728584289,
7515
+ "num_tokens": 9236745.0,
7516
+ "step": 8340
7517
+ },
7518
+ {
7519
+ "epoch": 1.6824501309691717,
7520
+ "grad_norm": 12.6875,
7521
+ "learning_rate": 8.785009067096515e-06,
7522
+ "loss": 0.7154,
7523
+ "mean_token_accuracy": 0.811217075586319,
7524
+ "num_tokens": 9246893.0,
7525
+ "step": 8350
7526
+ },
7527
+ {
7528
+ "epoch": 1.6844650413056619,
7529
+ "grad_norm": 10.8125,
7530
+ "learning_rate": 8.771576331519914e-06,
7531
+ "loss": 0.7992,
7532
+ "mean_token_accuracy": 0.8010998785495758,
7533
+ "num_tokens": 9257127.0,
7534
+ "step": 8360
7535
+ },
7536
+ {
7537
+ "epoch": 1.686479951642152,
7538
+ "grad_norm": 11.75,
7539
+ "learning_rate": 8.758143595943314e-06,
7540
+ "loss": 0.7553,
7541
+ "mean_token_accuracy": 0.8025872766971588,
7542
+ "num_tokens": 9267345.0,
7543
+ "step": 8370
7544
+ },
7545
+ {
7546
+ "epoch": 1.688494861978642,
7547
+ "grad_norm": 10.5625,
7548
+ "learning_rate": 8.744710860366715e-06,
7549
+ "loss": 0.7177,
7550
+ "mean_token_accuracy": 0.807683116197586,
7551
+ "num_tokens": 9278348.0,
7552
+ "step": 8380
7553
+ },
7554
+ {
7555
+ "epoch": 1.6905097723151319,
7556
+ "grad_norm": 10.6875,
7557
+ "learning_rate": 8.731278124790115e-06,
7558
+ "loss": 0.824,
7559
+ "mean_token_accuracy": 0.7992592275142669,
7560
+ "num_tokens": 9289759.0,
7561
+ "step": 8390
7562
+ },
7563
+ {
7564
+ "epoch": 1.692524682651622,
7565
+ "grad_norm": 10.0,
7566
+ "learning_rate": 8.717845389213514e-06,
7567
+ "loss": 0.7137,
7568
+ "mean_token_accuracy": 0.8179818749427795,
7569
+ "num_tokens": 9301077.0,
7570
+ "step": 8400
7571
+ },
7572
+ {
7573
+ "epoch": 1.6945395929881122,
7574
+ "grad_norm": 9.625,
7575
+ "learning_rate": 8.704412653636913e-06,
7576
+ "loss": 0.7854,
7577
+ "mean_token_accuracy": 0.8026704370975495,
7578
+ "num_tokens": 9311874.0,
7579
+ "step": 8410
7580
+ },
7581
+ {
7582
+ "epoch": 1.696554503324602,
7583
+ "grad_norm": 10.5,
7584
+ "learning_rate": 8.690979918060313e-06,
7585
+ "loss": 0.7699,
7586
+ "mean_token_accuracy": 0.8098136365413666,
7587
+ "num_tokens": 9322554.0,
7588
+ "step": 8420
7589
+ },
7590
+ {
7591
+ "epoch": 1.698569413661092,
7592
+ "grad_norm": 13.5,
7593
+ "learning_rate": 8.677547182483714e-06,
7594
+ "loss": 0.9204,
7595
+ "mean_token_accuracy": 0.7753970444202423,
7596
+ "num_tokens": 9334306.0,
7597
+ "step": 8430
7598
+ },
7599
+ {
7600
+ "epoch": 1.7005843239975822,
7601
+ "grad_norm": 10.375,
7602
+ "learning_rate": 8.664114446907113e-06,
7603
+ "loss": 0.7777,
7604
+ "mean_token_accuracy": 0.8091802179813385,
7605
+ "num_tokens": 9347003.0,
7606
+ "step": 8440
7607
+ },
7608
+ {
7609
+ "epoch": 1.7025992343340721,
7610
+ "grad_norm": 15.5625,
7611
+ "learning_rate": 8.650681711330513e-06,
7612
+ "loss": 0.7196,
7613
+ "mean_token_accuracy": 0.8092711210250855,
7614
+ "num_tokens": 9357300.0,
7615
+ "step": 8450
7616
+ },
7617
+ {
7618
+ "epoch": 1.704614144670562,
7619
+ "grad_norm": 11.3125,
7620
+ "learning_rate": 8.637248975753914e-06,
7621
+ "loss": 0.826,
7622
+ "mean_token_accuracy": 0.7981557488441468,
7623
+ "num_tokens": 9369935.0,
7624
+ "step": 8460
7625
+ },
7626
+ {
7627
+ "epoch": 1.7066290550070522,
7628
+ "grad_norm": 10.125,
7629
+ "learning_rate": 8.623816240177313e-06,
7630
+ "loss": 0.8045,
7631
+ "mean_token_accuracy": 0.8022239625453949,
7632
+ "num_tokens": 9381204.0,
7633
+ "step": 8470
7634
+ },
7635
+ {
7636
+ "epoch": 1.7086439653435423,
7637
+ "grad_norm": 11.9375,
7638
+ "learning_rate": 8.610383504600712e-06,
7639
+ "loss": 0.7834,
7640
+ "mean_token_accuracy": 0.7945830345153808,
7641
+ "num_tokens": 9391506.0,
7642
+ "step": 8480
7643
+ },
7644
+ {
7645
+ "epoch": 1.7106588756800323,
7646
+ "grad_norm": 11.75,
7647
+ "learning_rate": 8.596950769024112e-06,
7648
+ "loss": 0.8079,
7649
+ "mean_token_accuracy": 0.8012938261032104,
7650
+ "num_tokens": 9402691.0,
7651
+ "step": 8490
7652
+ },
7653
+ {
7654
+ "epoch": 1.7126737860165222,
7655
+ "grad_norm": 10.8125,
7656
+ "learning_rate": 8.583518033447513e-06,
7657
+ "loss": 0.8476,
7658
+ "mean_token_accuracy": 0.7890658736228943,
7659
+ "num_tokens": 9414095.0,
7660
+ "step": 8500
7661
  }
7662
  ],
7663
  "logging_steps": 10,
 
7677
  "attributes": {}
7678
  }
7679
  },
7680
+ "total_flos": 1.1396175021686784e+16,
7681
  "train_batch_size": 8,
7682
  "trial_name": null,
7683
  "trial_params": null