Plofski commited on
Commit
2e67f99
·
verified ·
1 Parent(s): 9f463cd

Training in progress, step 7500, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e99f98def5707de3be1588197fe5096482fac3f483b22d6d05ac701448ef1f6
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:445a128b149954e68d1af5a00630de0dc09e06cb78963d856ab9efe3a52157d9
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c36c51441c6a4e72e59a3d4e0e9b5b84bfb5e8d67b647194ab6c6bfde7983c0e
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66488112339a052865703e73eb9d72b3f5f142ea84ea68d0b968dcf9eb080bb8
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:326c38bef4f14b97646caa84204f32859351159ff635853df88679a10264e29a
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88ec7f0fcbb8e83ac60a847dffeda029d1a65c084556d4707d85ad106bc04ba0
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.4104372355430184,
6
  "eval_steps": 500,
7
- "global_step": 7000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -6308,6 +6308,456 @@
6308
  "mean_token_accuracy": 0.7988959193229676,
6309
  "num_tokens": 7754571.0,
6310
  "step": 7000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6311
  }
6312
  ],
6313
  "logging_steps": 10,
@@ -6327,7 +6777,7 @@
6327
  "attributes": {}
6328
  }
6329
  },
6330
- "total_flos": 9382261075611648.0,
6331
  "train_batch_size": 8,
6332
  "trial_name": null,
6333
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.5111827523675196,
6
  "eval_steps": 500,
7
+ "global_step": 7500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
6308
  "mean_token_accuracy": 0.7988959193229676,
6309
  "num_tokens": 7754571.0,
6310
  "step": 7000
6311
+ },
6312
+ {
6313
+ "epoch": 1.4124521458795083,
6314
+ "grad_norm": 12.5625,
6315
+ "learning_rate": 1.0584995634360939e-05,
6316
+ "loss": 0.8291,
6317
+ "mean_token_accuracy": 0.7979937255382538,
6318
+ "num_tokens": 7765407.0,
6319
+ "step": 7010
6320
+ },
6321
+ {
6322
+ "epoch": 1.4144670562159984,
6323
+ "grad_norm": 11.0,
6324
+ "learning_rate": 1.0571562898784338e-05,
6325
+ "loss": 0.7867,
6326
+ "mean_token_accuracy": 0.804653775691986,
6327
+ "num_tokens": 7777508.0,
6328
+ "step": 7020
6329
+ },
6330
+ {
6331
+ "epoch": 1.4164819665524884,
6332
+ "grad_norm": 12.5,
6333
+ "learning_rate": 1.0558130163207738e-05,
6334
+ "loss": 0.9272,
6335
+ "mean_token_accuracy": 0.7761716663837432,
6336
+ "num_tokens": 7789199.0,
6337
+ "step": 7030
6338
+ },
6339
+ {
6340
+ "epoch": 1.4184968768889785,
6341
+ "grad_norm": 12.4375,
6342
+ "learning_rate": 1.0544697427631139e-05,
6343
+ "loss": 0.8449,
6344
+ "mean_token_accuracy": 0.7917571127414703,
6345
+ "num_tokens": 7800709.0,
6346
+ "step": 7040
6347
+ },
6348
+ {
6349
+ "epoch": 1.4205117872254684,
6350
+ "grad_norm": 13.0,
6351
+ "learning_rate": 1.0531264692054538e-05,
6352
+ "loss": 0.9143,
6353
+ "mean_token_accuracy": 0.7804217040538788,
6354
+ "num_tokens": 7813083.0,
6355
+ "step": 7050
6356
+ },
6357
+ {
6358
+ "epoch": 1.4225266975619584,
6359
+ "grad_norm": 11.875,
6360
+ "learning_rate": 1.0517831956477938e-05,
6361
+ "loss": 0.8094,
6362
+ "mean_token_accuracy": 0.8005238711833954,
6363
+ "num_tokens": 7823896.0,
6364
+ "step": 7060
6365
+ },
6366
+ {
6367
+ "epoch": 1.4245416078984485,
6368
+ "grad_norm": 11.9375,
6369
+ "learning_rate": 1.0504399220901339e-05,
6370
+ "loss": 0.773,
6371
+ "mean_token_accuracy": 0.8118914902210236,
6372
+ "num_tokens": 7834242.0,
6373
+ "step": 7070
6374
+ },
6375
+ {
6376
+ "epoch": 1.4265565182349387,
6377
+ "grad_norm": 10.0625,
6378
+ "learning_rate": 1.0490966485324736e-05,
6379
+ "loss": 0.8299,
6380
+ "mean_token_accuracy": 0.7923681199550628,
6381
+ "num_tokens": 7845550.0,
6382
+ "step": 7080
6383
+ },
6384
+ {
6385
+ "epoch": 1.4285714285714286,
6386
+ "grad_norm": 12.6875,
6387
+ "learning_rate": 1.0477533749748136e-05,
6388
+ "loss": 0.8102,
6389
+ "mean_token_accuracy": 0.7967373371124268,
6390
+ "num_tokens": 7856319.0,
6391
+ "step": 7090
6392
+ },
6393
+ {
6394
+ "epoch": 1.4305863389079185,
6395
+ "grad_norm": 11.9375,
6396
+ "learning_rate": 1.0464101014171537e-05,
6397
+ "loss": 0.9132,
6398
+ "mean_token_accuracy": 0.7812554478645325,
6399
+ "num_tokens": 7867021.0,
6400
+ "step": 7100
6401
+ },
6402
+ {
6403
+ "epoch": 1.4326012492444087,
6404
+ "grad_norm": 10.0625,
6405
+ "learning_rate": 1.0450668278594937e-05,
6406
+ "loss": 0.7123,
6407
+ "mean_token_accuracy": 0.8167718529701233,
6408
+ "num_tokens": 7877634.0,
6409
+ "step": 7110
6410
+ },
6411
+ {
6412
+ "epoch": 1.4346161595808986,
6413
+ "grad_norm": 11.0625,
6414
+ "learning_rate": 1.0437235543018336e-05,
6415
+ "loss": 0.8817,
6416
+ "mean_token_accuracy": 0.7886090099811554,
6417
+ "num_tokens": 7889170.0,
6418
+ "step": 7120
6419
+ },
6420
+ {
6421
+ "epoch": 1.4366310699173888,
6422
+ "grad_norm": 12.1875,
6423
+ "learning_rate": 1.0423802807441737e-05,
6424
+ "loss": 0.8589,
6425
+ "mean_token_accuracy": 0.7861130595207214,
6426
+ "num_tokens": 7899457.0,
6427
+ "step": 7130
6428
+ },
6429
+ {
6430
+ "epoch": 1.4386459802538787,
6431
+ "grad_norm": 13.0625,
6432
+ "learning_rate": 1.0410370071865137e-05,
6433
+ "loss": 0.9932,
6434
+ "mean_token_accuracy": 0.7648563742637634,
6435
+ "num_tokens": 7910678.0,
6436
+ "step": 7140
6437
+ },
6438
+ {
6439
+ "epoch": 1.4406608905903688,
6440
+ "grad_norm": 10.0,
6441
+ "learning_rate": 1.0396937336288534e-05,
6442
+ "loss": 0.9069,
6443
+ "mean_token_accuracy": 0.7735403776168823,
6444
+ "num_tokens": 7922908.0,
6445
+ "step": 7150
6446
+ },
6447
+ {
6448
+ "epoch": 1.4426758009268588,
6449
+ "grad_norm": 12.875,
6450
+ "learning_rate": 1.0383504600711935e-05,
6451
+ "loss": 0.8557,
6452
+ "mean_token_accuracy": 0.7988546848297119,
6453
+ "num_tokens": 7933362.0,
6454
+ "step": 7160
6455
+ },
6456
+ {
6457
+ "epoch": 1.4446907112633487,
6458
+ "grad_norm": 12.625,
6459
+ "learning_rate": 1.0370071865135335e-05,
6460
+ "loss": 0.8692,
6461
+ "mean_token_accuracy": 0.786381047964096,
6462
+ "num_tokens": 7944561.0,
6463
+ "step": 7170
6464
+ },
6465
+ {
6466
+ "epoch": 1.4467056215998388,
6467
+ "grad_norm": 12.6875,
6468
+ "learning_rate": 1.0356639129558736e-05,
6469
+ "loss": 0.9052,
6470
+ "mean_token_accuracy": 0.7769907891750336,
6471
+ "num_tokens": 7955497.0,
6472
+ "step": 7180
6473
+ },
6474
+ {
6475
+ "epoch": 1.4487205319363288,
6476
+ "grad_norm": 11.8125,
6477
+ "learning_rate": 1.0343206393982135e-05,
6478
+ "loss": 0.8062,
6479
+ "mean_token_accuracy": 0.7997641444206238,
6480
+ "num_tokens": 7966168.0,
6481
+ "step": 7190
6482
+ },
6483
+ {
6484
+ "epoch": 1.450735442272819,
6485
+ "grad_norm": 12.5,
6486
+ "learning_rate": 1.0329773658405535e-05,
6487
+ "loss": 0.8286,
6488
+ "mean_token_accuracy": 0.8001461684703827,
6489
+ "num_tokens": 7977058.0,
6490
+ "step": 7200
6491
+ },
6492
+ {
6493
+ "epoch": 1.4527503526093088,
6494
+ "grad_norm": 11.25,
6495
+ "learning_rate": 1.0316340922828936e-05,
6496
+ "loss": 0.8228,
6497
+ "mean_token_accuracy": 0.7994490921497345,
6498
+ "num_tokens": 7987859.0,
6499
+ "step": 7210
6500
+ },
6501
+ {
6502
+ "epoch": 1.454765262945799,
6503
+ "grad_norm": 10.8125,
6504
+ "learning_rate": 1.0302908187252335e-05,
6505
+ "loss": 0.8172,
6506
+ "mean_token_accuracy": 0.7964129328727723,
6507
+ "num_tokens": 7999424.0,
6508
+ "step": 7220
6509
+ },
6510
+ {
6511
+ "epoch": 1.456780173282289,
6512
+ "grad_norm": 12.3125,
6513
+ "learning_rate": 1.0289475451675735e-05,
6514
+ "loss": 0.8538,
6515
+ "mean_token_accuracy": 0.7890827238559723,
6516
+ "num_tokens": 8011181.0,
6517
+ "step": 7230
6518
+ },
6519
+ {
6520
+ "epoch": 1.4587950836187789,
6521
+ "grad_norm": 12.9375,
6522
+ "learning_rate": 1.0276042716099136e-05,
6523
+ "loss": 0.9005,
6524
+ "mean_token_accuracy": 0.7845316469669342,
6525
+ "num_tokens": 8021786.0,
6526
+ "step": 7240
6527
+ },
6528
+ {
6529
+ "epoch": 1.460809993955269,
6530
+ "grad_norm": 12.0,
6531
+ "learning_rate": 1.0262609980522533e-05,
6532
+ "loss": 0.8514,
6533
+ "mean_token_accuracy": 0.7937814593315125,
6534
+ "num_tokens": 8033599.0,
6535
+ "step": 7250
6536
+ },
6537
+ {
6538
+ "epoch": 1.4628249042917592,
6539
+ "grad_norm": 13.8125,
6540
+ "learning_rate": 1.0249177244945933e-05,
6541
+ "loss": 0.9692,
6542
+ "mean_token_accuracy": 0.768017840385437,
6543
+ "num_tokens": 8044948.0,
6544
+ "step": 7260
6545
+ },
6546
+ {
6547
+ "epoch": 1.464839814628249,
6548
+ "grad_norm": 10.0625,
6549
+ "learning_rate": 1.0235744509369334e-05,
6550
+ "loss": 0.8586,
6551
+ "mean_token_accuracy": 0.7864105820655822,
6552
+ "num_tokens": 8056601.0,
6553
+ "step": 7270
6554
+ },
6555
+ {
6556
+ "epoch": 1.466854724964739,
6557
+ "grad_norm": 10.875,
6558
+ "learning_rate": 1.0222311773792735e-05,
6559
+ "loss": 0.7389,
6560
+ "mean_token_accuracy": 0.8095929026603699,
6561
+ "num_tokens": 8067564.0,
6562
+ "step": 7280
6563
+ },
6564
+ {
6565
+ "epoch": 1.4688696353012292,
6566
+ "grad_norm": 10.75,
6567
+ "learning_rate": 1.0208879038216133e-05,
6568
+ "loss": 0.829,
6569
+ "mean_token_accuracy": 0.7980533838272095,
6570
+ "num_tokens": 8077900.0,
6571
+ "step": 7290
6572
+ },
6573
+ {
6574
+ "epoch": 1.470884545637719,
6575
+ "grad_norm": 11.5625,
6576
+ "learning_rate": 1.0195446302639534e-05,
6577
+ "loss": 0.783,
6578
+ "mean_token_accuracy": 0.8065039277076721,
6579
+ "num_tokens": 8088501.0,
6580
+ "step": 7300
6581
+ },
6582
+ {
6583
+ "epoch": 1.472899455974209,
6584
+ "grad_norm": 10.375,
6585
+ "learning_rate": 1.0182013567062934e-05,
6586
+ "loss": 0.7907,
6587
+ "mean_token_accuracy": 0.7920153796672821,
6588
+ "num_tokens": 8099942.0,
6589
+ "step": 7310
6590
+ },
6591
+ {
6592
+ "epoch": 1.4749143663106992,
6593
+ "grad_norm": 11.125,
6594
+ "learning_rate": 1.0168580831486332e-05,
6595
+ "loss": 0.8727,
6596
+ "mean_token_accuracy": 0.7863239705562591,
6597
+ "num_tokens": 8111532.0,
6598
+ "step": 7320
6599
+ },
6600
+ {
6601
+ "epoch": 1.4769292766471893,
6602
+ "grad_norm": 12.6875,
6603
+ "learning_rate": 1.0155148095909732e-05,
6604
+ "loss": 0.798,
6605
+ "mean_token_accuracy": 0.8027134239673615,
6606
+ "num_tokens": 8122335.0,
6607
+ "step": 7330
6608
+ },
6609
+ {
6610
+ "epoch": 1.4789441869836792,
6611
+ "grad_norm": 13.9375,
6612
+ "learning_rate": 1.0141715360333133e-05,
6613
+ "loss": 0.7377,
6614
+ "mean_token_accuracy": 0.8082942187786102,
6615
+ "num_tokens": 8132604.0,
6616
+ "step": 7340
6617
+ },
6618
+ {
6619
+ "epoch": 1.4809590973201692,
6620
+ "grad_norm": 11.5,
6621
+ "learning_rate": 1.0128282624756533e-05,
6622
+ "loss": 0.8337,
6623
+ "mean_token_accuracy": 0.7979135930538177,
6624
+ "num_tokens": 8143891.0,
6625
+ "step": 7350
6626
+ },
6627
+ {
6628
+ "epoch": 1.4829740076566593,
6629
+ "grad_norm": 13.375,
6630
+ "learning_rate": 1.0114849889179932e-05,
6631
+ "loss": 0.9091,
6632
+ "mean_token_accuracy": 0.7805217266082763,
6633
+ "num_tokens": 8154184.0,
6634
+ "step": 7360
6635
+ },
6636
+ {
6637
+ "epoch": 1.4849889179931492,
6638
+ "grad_norm": 9.875,
6639
+ "learning_rate": 1.0101417153603332e-05,
6640
+ "loss": 0.8451,
6641
+ "mean_token_accuracy": 0.7925164818763732,
6642
+ "num_tokens": 8165049.0,
6643
+ "step": 7370
6644
+ },
6645
+ {
6646
+ "epoch": 1.4870038283296394,
6647
+ "grad_norm": 12.0625,
6648
+ "learning_rate": 1.0087984418026733e-05,
6649
+ "loss": 0.8572,
6650
+ "mean_token_accuracy": 0.7849507808685303,
6651
+ "num_tokens": 8177037.0,
6652
+ "step": 7380
6653
+ },
6654
+ {
6655
+ "epoch": 1.4890187386661293,
6656
+ "grad_norm": 10.875,
6657
+ "learning_rate": 1.0074551682450132e-05,
6658
+ "loss": 0.8239,
6659
+ "mean_token_accuracy": 0.795056939125061,
6660
+ "num_tokens": 8187440.0,
6661
+ "step": 7390
6662
+ },
6663
+ {
6664
+ "epoch": 1.4910336490026195,
6665
+ "grad_norm": 10.0,
6666
+ "learning_rate": 1.006111894687353e-05,
6667
+ "loss": 0.8283,
6668
+ "mean_token_accuracy": 0.7943599224090576,
6669
+ "num_tokens": 8199890.0,
6670
+ "step": 7400
6671
+ },
6672
+ {
6673
+ "epoch": 1.4930485593391094,
6674
+ "grad_norm": 10.5625,
6675
+ "learning_rate": 1.0047686211296931e-05,
6676
+ "loss": 0.8196,
6677
+ "mean_token_accuracy": 0.7991042912006379,
6678
+ "num_tokens": 8211416.0,
6679
+ "step": 7410
6680
+ },
6681
+ {
6682
+ "epoch": 1.4950634696755993,
6683
+ "grad_norm": 15.125,
6684
+ "learning_rate": 1.003425347572033e-05,
6685
+ "loss": 0.7576,
6686
+ "mean_token_accuracy": 0.8098958432674408,
6687
+ "num_tokens": 8221936.0,
6688
+ "step": 7420
6689
+ },
6690
+ {
6691
+ "epoch": 1.4970783800120895,
6692
+ "grad_norm": 10.6875,
6693
+ "learning_rate": 1.002082074014373e-05,
6694
+ "loss": 0.7949,
6695
+ "mean_token_accuracy": 0.803859144449234,
6696
+ "num_tokens": 8232684.0,
6697
+ "step": 7430
6698
+ },
6699
+ {
6700
+ "epoch": 1.4990932903485794,
6701
+ "grad_norm": 12.5,
6702
+ "learning_rate": 1.0007388004567131e-05,
6703
+ "loss": 0.8918,
6704
+ "mean_token_accuracy": 0.7786654233932495,
6705
+ "num_tokens": 8244164.0,
6706
+ "step": 7440
6707
+ },
6708
+ {
6709
+ "epoch": 1.5011082006850696,
6710
+ "grad_norm": 11.4375,
6711
+ "learning_rate": 9.99395526899053e-06,
6712
+ "loss": 0.9013,
6713
+ "mean_token_accuracy": 0.7826810419559479,
6714
+ "num_tokens": 8255935.0,
6715
+ "step": 7450
6716
+ },
6717
+ {
6718
+ "epoch": 1.5031231110215595,
6719
+ "grad_norm": 11.4375,
6720
+ "learning_rate": 9.98052253341393e-06,
6721
+ "loss": 0.7827,
6722
+ "mean_token_accuracy": 0.8083594501018524,
6723
+ "num_tokens": 8267114.0,
6724
+ "step": 7460
6725
+ },
6726
+ {
6727
+ "epoch": 1.5051380213580496,
6728
+ "grad_norm": 12.0,
6729
+ "learning_rate": 9.967089797837331e-06,
6730
+ "loss": 0.7499,
6731
+ "mean_token_accuracy": 0.8060566544532776,
6732
+ "num_tokens": 8277828.0,
6733
+ "step": 7470
6734
+ },
6735
+ {
6736
+ "epoch": 1.5071529316945396,
6737
+ "grad_norm": 12.375,
6738
+ "learning_rate": 9.95365706226073e-06,
6739
+ "loss": 0.8611,
6740
+ "mean_token_accuracy": 0.7861000895500183,
6741
+ "num_tokens": 8289857.0,
6742
+ "step": 7480
6743
+ },
6744
+ {
6745
+ "epoch": 1.5091678420310295,
6746
+ "grad_norm": 12.125,
6747
+ "learning_rate": 9.94022432668413e-06,
6748
+ "loss": 0.8669,
6749
+ "mean_token_accuracy": 0.7892852067947388,
6750
+ "num_tokens": 8300286.0,
6751
+ "step": 7490
6752
+ },
6753
+ {
6754
+ "epoch": 1.5111827523675196,
6755
+ "grad_norm": 10.8125,
6756
+ "learning_rate": 9.92679159110753e-06,
6757
+ "loss": 0.7735,
6758
+ "mean_token_accuracy": 0.8061196208000183,
6759
+ "num_tokens": 8312344.0,
6760
+ "step": 7500
6761
  }
6762
  ],
6763
  "logging_steps": 10,
 
6777
  "attributes": {}
6778
  }
6779
  },
6780
+ "total_flos": 1.006244257019904e+16,
6781
  "train_batch_size": 8,
6782
  "trial_name": null,
6783
  "trial_params": null