Plofski commited on
Commit
223946d
·
verified ·
1 Parent(s): 5309a54

Training in progress, step 6000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:fcb80f83cde4a31bb60c1fd7260ffe3f7e16f618b67202dd29fd631a03093894
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9eb50dbcfebd5f63fc3cc77929d31805c3be0d18c479c86d9d2674102149d998
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:deac3ee60db6adb45d1da1976f4f679efdf8206065175afc58ada5c695ccf6a5
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69bc7042f1e1e7b74e152e40dbcd26c60ace8254419664002f89f720c072bde5
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f4f36f1c6d7eb84c738a082911123d4e08f6356fc8093bb45612eb211d0cfe74
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6125d2b668a070022ee702876ba7ef10eb371529c27241694b5b376ca68bdc81
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.1082006850695145,
6
  "eval_steps": 500,
7
- "global_step": 5500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -4958,6 +4958,456 @@
4958
  "mean_token_accuracy": 0.7852272689342499,
4959
  "num_tokens": 6091516.0,
4960
  "step": 5500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4961
  }
4962
  ],
4963
  "logging_steps": 10,
@@ -4977,7 +5427,7 @@
4977
  "attributes": {}
4978
  }
4979
  },
4980
- "total_flos": 7364465716629504.0,
4981
  "train_batch_size": 8,
4982
  "trial_name": null,
4983
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.2089462018940158,
6
  "eval_steps": 500,
7
+ "global_step": 6000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
4958
  "mean_token_accuracy": 0.7852272689342499,
4959
  "num_tokens": 6091516.0,
4960
  "step": 5500
4961
+ },
4962
+ {
4963
+ "epoch": 1.1102155954060045,
4964
+ "grad_norm": 13.9375,
4965
+ "learning_rate": 1.2599905970850965e-05,
4966
+ "loss": 0.8984,
4967
+ "mean_token_accuracy": 0.7786856353282928,
4968
+ "num_tokens": 6101758.0,
4969
+ "step": 5510
4970
+ },
4971
+ {
4972
+ "epoch": 1.1122305057424944,
4973
+ "grad_norm": 10.5,
4974
+ "learning_rate": 1.2586473235274365e-05,
4975
+ "loss": 0.8218,
4976
+ "mean_token_accuracy": 0.8003330588340759,
4977
+ "num_tokens": 6112860.0,
4978
+ "step": 5520
4979
+ },
4980
+ {
4981
+ "epoch": 1.1142454160789845,
4982
+ "grad_norm": 10.25,
4983
+ "learning_rate": 1.2573040499697764e-05,
4984
+ "loss": 0.8069,
4985
+ "mean_token_accuracy": 0.7983652293682099,
4986
+ "num_tokens": 6124481.0,
4987
+ "step": 5530
4988
+ },
4989
+ {
4990
+ "epoch": 1.1162603264154745,
4991
+ "grad_norm": 10.5,
4992
+ "learning_rate": 1.2559607764121164e-05,
4993
+ "loss": 0.9377,
4994
+ "mean_token_accuracy": 0.7729220628738404,
4995
+ "num_tokens": 6137558.0,
4996
+ "step": 5540
4997
+ },
4998
+ {
4999
+ "epoch": 1.1182752367519646,
5000
+ "grad_norm": 10.0,
5001
+ "learning_rate": 1.2546175028544565e-05,
5002
+ "loss": 0.8107,
5003
+ "mean_token_accuracy": 0.8016826927661895,
5004
+ "num_tokens": 6149919.0,
5005
+ "step": 5550
5006
+ },
5007
+ {
5008
+ "epoch": 1.1202901470884545,
5009
+ "grad_norm": 11.875,
5010
+ "learning_rate": 1.2532742292967964e-05,
5011
+ "loss": 0.8375,
5012
+ "mean_token_accuracy": 0.7957591891288758,
5013
+ "num_tokens": 6160493.0,
5014
+ "step": 5560
5015
+ },
5016
+ {
5017
+ "epoch": 1.1223050574249447,
5018
+ "grad_norm": 9.25,
5019
+ "learning_rate": 1.2519309557391364e-05,
5020
+ "loss": 0.8292,
5021
+ "mean_token_accuracy": 0.7971819519996644,
5022
+ "num_tokens": 6172744.0,
5023
+ "step": 5570
5024
+ },
5025
+ {
5026
+ "epoch": 1.1243199677614346,
5027
+ "grad_norm": 11.0625,
5028
+ "learning_rate": 1.2505876821814765e-05,
5029
+ "loss": 0.8092,
5030
+ "mean_token_accuracy": 0.8008480191230773,
5031
+ "num_tokens": 6183620.0,
5032
+ "step": 5580
5033
+ },
5034
+ {
5035
+ "epoch": 1.1263348780979245,
5036
+ "grad_norm": 9.8125,
5037
+ "learning_rate": 1.2492444086238162e-05,
5038
+ "loss": 0.7622,
5039
+ "mean_token_accuracy": 0.8064143776893615,
5040
+ "num_tokens": 6194516.0,
5041
+ "step": 5590
5042
+ },
5043
+ {
5044
+ "epoch": 1.1283497884344147,
5045
+ "grad_norm": 12.875,
5046
+ "learning_rate": 1.2479011350661563e-05,
5047
+ "loss": 0.8479,
5048
+ "mean_token_accuracy": 0.7917460918426513,
5049
+ "num_tokens": 6205641.0,
5050
+ "step": 5600
5051
+ },
5052
+ {
5053
+ "epoch": 1.1303646987709046,
5054
+ "grad_norm": 13.125,
5055
+ "learning_rate": 1.2465578615084963e-05,
5056
+ "loss": 0.9111,
5057
+ "mean_token_accuracy": 0.77914879322052,
5058
+ "num_tokens": 6218283.0,
5059
+ "step": 5610
5060
+ },
5061
+ {
5062
+ "epoch": 1.1323796091073948,
5063
+ "grad_norm": 11.4375,
5064
+ "learning_rate": 1.2452145879508364e-05,
5065
+ "loss": 0.8832,
5066
+ "mean_token_accuracy": 0.784546959400177,
5067
+ "num_tokens": 6230024.0,
5068
+ "step": 5620
5069
+ },
5070
+ {
5071
+ "epoch": 1.1343945194438847,
5072
+ "grad_norm": 13.4375,
5073
+ "learning_rate": 1.2438713143931762e-05,
5074
+ "loss": 0.8859,
5075
+ "mean_token_accuracy": 0.7834093928337097,
5076
+ "num_tokens": 6242378.0,
5077
+ "step": 5630
5078
+ },
5079
+ {
5080
+ "epoch": 1.1364094297803748,
5081
+ "grad_norm": 9.875,
5082
+ "learning_rate": 1.2425280408355163e-05,
5083
+ "loss": 0.9509,
5084
+ "mean_token_accuracy": 0.7756809532642365,
5085
+ "num_tokens": 6253645.0,
5086
+ "step": 5640
5087
+ },
5088
+ {
5089
+ "epoch": 1.1384243401168648,
5090
+ "grad_norm": 10.625,
5091
+ "learning_rate": 1.2411847672778563e-05,
5092
+ "loss": 0.8785,
5093
+ "mean_token_accuracy": 0.7828778207302094,
5094
+ "num_tokens": 6266360.0,
5095
+ "step": 5650
5096
+ },
5097
+ {
5098
+ "epoch": 1.1404392504533547,
5099
+ "grad_norm": 12.1875,
5100
+ "learning_rate": 1.239841493720196e-05,
5101
+ "loss": 0.8892,
5102
+ "mean_token_accuracy": 0.7838792741298676,
5103
+ "num_tokens": 6277349.0,
5104
+ "step": 5660
5105
+ },
5106
+ {
5107
+ "epoch": 1.1424541607898449,
5108
+ "grad_norm": 12.875,
5109
+ "learning_rate": 1.2384982201625361e-05,
5110
+ "loss": 0.8632,
5111
+ "mean_token_accuracy": 0.790741640329361,
5112
+ "num_tokens": 6288664.0,
5113
+ "step": 5670
5114
+ },
5115
+ {
5116
+ "epoch": 1.144469071126335,
5117
+ "grad_norm": 9.1875,
5118
+ "learning_rate": 1.2371549466048762e-05,
5119
+ "loss": 0.8403,
5120
+ "mean_token_accuracy": 0.7891623616218567,
5121
+ "num_tokens": 6302179.0,
5122
+ "step": 5680
5123
+ },
5124
+ {
5125
+ "epoch": 1.146483981462825,
5126
+ "grad_norm": 11.75,
5127
+ "learning_rate": 1.2358116730472162e-05,
5128
+ "loss": 0.9125,
5129
+ "mean_token_accuracy": 0.7795878767967224,
5130
+ "num_tokens": 6313611.0,
5131
+ "step": 5690
5132
+ },
5133
+ {
5134
+ "epoch": 1.1484988917993149,
5135
+ "grad_norm": 11.9375,
5136
+ "learning_rate": 1.2344683994895561e-05,
5137
+ "loss": 0.8863,
5138
+ "mean_token_accuracy": 0.787626963853836,
5139
+ "num_tokens": 6324550.0,
5140
+ "step": 5700
5141
+ },
5142
+ {
5143
+ "epoch": 1.150513802135805,
5144
+ "grad_norm": 11.5625,
5145
+ "learning_rate": 1.2331251259318962e-05,
5146
+ "loss": 0.9229,
5147
+ "mean_token_accuracy": 0.7838542103767395,
5148
+ "num_tokens": 6336013.0,
5149
+ "step": 5710
5150
+ },
5151
+ {
5152
+ "epoch": 1.152528712472295,
5153
+ "grad_norm": 11.4375,
5154
+ "learning_rate": 1.2317818523742362e-05,
5155
+ "loss": 0.7724,
5156
+ "mean_token_accuracy": 0.8055199205875396,
5157
+ "num_tokens": 6346134.0,
5158
+ "step": 5720
5159
+ },
5160
+ {
5161
+ "epoch": 1.154543622808785,
5162
+ "grad_norm": 11.125,
5163
+ "learning_rate": 1.2304385788165761e-05,
5164
+ "loss": 0.7996,
5165
+ "mean_token_accuracy": 0.8010989010334015,
5166
+ "num_tokens": 6356419.0,
5167
+ "step": 5730
5168
+ },
5169
+ {
5170
+ "epoch": 1.156558533145275,
5171
+ "grad_norm": 12.6875,
5172
+ "learning_rate": 1.2290953052589161e-05,
5173
+ "loss": 0.7861,
5174
+ "mean_token_accuracy": 0.8053541004657745,
5175
+ "num_tokens": 6367086.0,
5176
+ "step": 5740
5177
+ },
5178
+ {
5179
+ "epoch": 1.1585734434817652,
5180
+ "grad_norm": 11.4375,
5181
+ "learning_rate": 1.2277520317012562e-05,
5182
+ "loss": 0.7886,
5183
+ "mean_token_accuracy": 0.8104895174503326,
5184
+ "num_tokens": 6378360.0,
5185
+ "step": 5750
5186
+ },
5187
+ {
5188
+ "epoch": 1.160588353818255,
5189
+ "grad_norm": 10.5625,
5190
+ "learning_rate": 1.226408758143596e-05,
5191
+ "loss": 0.7607,
5192
+ "mean_token_accuracy": 0.807652473449707,
5193
+ "num_tokens": 6390770.0,
5194
+ "step": 5760
5195
+ },
5196
+ {
5197
+ "epoch": 1.162603264154745,
5198
+ "grad_norm": 15.4375,
5199
+ "learning_rate": 1.225065484585936e-05,
5200
+ "loss": 0.8117,
5201
+ "mean_token_accuracy": 0.8016961336135864,
5202
+ "num_tokens": 6400647.0,
5203
+ "step": 5770
5204
+ },
5205
+ {
5206
+ "epoch": 1.1646181744912352,
5207
+ "grad_norm": 13.9375,
5208
+ "learning_rate": 1.223722211028276e-05,
5209
+ "loss": 0.8433,
5210
+ "mean_token_accuracy": 0.7853075683116912,
5211
+ "num_tokens": 6410061.0,
5212
+ "step": 5780
5213
+ },
5214
+ {
5215
+ "epoch": 1.166633084827725,
5216
+ "grad_norm": 13.6875,
5217
+ "learning_rate": 1.222378937470616e-05,
5218
+ "loss": 0.7607,
5219
+ "mean_token_accuracy": 0.8095987677574158,
5220
+ "num_tokens": 6420080.0,
5221
+ "step": 5790
5222
+ },
5223
+ {
5224
+ "epoch": 1.1686479951642152,
5225
+ "grad_norm": 12.25,
5226
+ "learning_rate": 1.221035663912956e-05,
5227
+ "loss": 0.9257,
5228
+ "mean_token_accuracy": 0.778600412607193,
5229
+ "num_tokens": 6432179.0,
5230
+ "step": 5800
5231
+ },
5232
+ {
5233
+ "epoch": 1.1706629055007052,
5234
+ "grad_norm": 11.0,
5235
+ "learning_rate": 1.219692390355296e-05,
5236
+ "loss": 0.8849,
5237
+ "mean_token_accuracy": 0.7801730871200562,
5238
+ "num_tokens": 6442848.0,
5239
+ "step": 5810
5240
+ },
5241
+ {
5242
+ "epoch": 1.1726778158371953,
5243
+ "grad_norm": 10.5625,
5244
+ "learning_rate": 1.218349116797636e-05,
5245
+ "loss": 0.765,
5246
+ "mean_token_accuracy": 0.8116752684116364,
5247
+ "num_tokens": 6453565.0,
5248
+ "step": 5820
5249
+ },
5250
+ {
5251
+ "epoch": 1.1746927261736853,
5252
+ "grad_norm": 14.0,
5253
+ "learning_rate": 1.2170058432399758e-05,
5254
+ "loss": 0.8767,
5255
+ "mean_token_accuracy": 0.7874524176120759,
5256
+ "num_tokens": 6464704.0,
5257
+ "step": 5830
5258
+ },
5259
+ {
5260
+ "epoch": 1.1767076365101752,
5261
+ "grad_norm": 14.625,
5262
+ "learning_rate": 1.2156625696823158e-05,
5263
+ "loss": 0.9735,
5264
+ "mean_token_accuracy": 0.7777929544448853,
5265
+ "num_tokens": 6476700.0,
5266
+ "step": 5840
5267
+ },
5268
+ {
5269
+ "epoch": 1.1787225468466653,
5270
+ "grad_norm": 10.9375,
5271
+ "learning_rate": 1.2143192961246559e-05,
5272
+ "loss": 0.9004,
5273
+ "mean_token_accuracy": 0.7828619062900544,
5274
+ "num_tokens": 6487976.0,
5275
+ "step": 5850
5276
+ },
5277
+ {
5278
+ "epoch": 1.1807374571831553,
5279
+ "grad_norm": 12.625,
5280
+ "learning_rate": 1.2129760225669958e-05,
5281
+ "loss": 0.8873,
5282
+ "mean_token_accuracy": 0.7900948286056518,
5283
+ "num_tokens": 6499362.0,
5284
+ "step": 5860
5285
+ },
5286
+ {
5287
+ "epoch": 1.1827523675196454,
5288
+ "grad_norm": 13.4375,
5289
+ "learning_rate": 1.2116327490093358e-05,
5290
+ "loss": 0.9967,
5291
+ "mean_token_accuracy": 0.7665694057941437,
5292
+ "num_tokens": 6510505.0,
5293
+ "step": 5870
5294
+ },
5295
+ {
5296
+ "epoch": 1.1847672778561353,
5297
+ "grad_norm": 11.625,
5298
+ "learning_rate": 1.2102894754516759e-05,
5299
+ "loss": 0.8207,
5300
+ "mean_token_accuracy": 0.7956820368766785,
5301
+ "num_tokens": 6522577.0,
5302
+ "step": 5880
5303
+ },
5304
+ {
5305
+ "epoch": 1.1867821881926255,
5306
+ "grad_norm": 13.25,
5307
+ "learning_rate": 1.208946201894016e-05,
5308
+ "loss": 0.7919,
5309
+ "mean_token_accuracy": 0.7999676465988159,
5310
+ "num_tokens": 6533179.0,
5311
+ "step": 5890
5312
+ },
5313
+ {
5314
+ "epoch": 1.1887970985291154,
5315
+ "grad_norm": 11.5,
5316
+ "learning_rate": 1.2076029283363558e-05,
5317
+ "loss": 0.8339,
5318
+ "mean_token_accuracy": 0.791290158033371,
5319
+ "num_tokens": 6544059.0,
5320
+ "step": 5900
5321
+ },
5322
+ {
5323
+ "epoch": 1.1908120088656056,
5324
+ "grad_norm": 11.1875,
5325
+ "learning_rate": 1.2062596547786957e-05,
5326
+ "loss": 0.876,
5327
+ "mean_token_accuracy": 0.7901681363582611,
5328
+ "num_tokens": 6554941.0,
5329
+ "step": 5910
5330
+ },
5331
+ {
5332
+ "epoch": 1.1928269192020955,
5333
+ "grad_norm": 9.6875,
5334
+ "learning_rate": 1.2049163812210357e-05,
5335
+ "loss": 0.969,
5336
+ "mean_token_accuracy": 0.7663461267948151,
5337
+ "num_tokens": 6567559.0,
5338
+ "step": 5920
5339
+ },
5340
+ {
5341
+ "epoch": 1.1948418295385856,
5342
+ "grad_norm": 11.25,
5343
+ "learning_rate": 1.2035731076633756e-05,
5344
+ "loss": 0.796,
5345
+ "mean_token_accuracy": 0.7978686451911926,
5346
+ "num_tokens": 6577821.0,
5347
+ "step": 5930
5348
+ },
5349
+ {
5350
+ "epoch": 1.1968567398750756,
5351
+ "grad_norm": 11.3125,
5352
+ "learning_rate": 1.2022298341057157e-05,
5353
+ "loss": 0.8198,
5354
+ "mean_token_accuracy": 0.7984302759170532,
5355
+ "num_tokens": 6590610.0,
5356
+ "step": 5940
5357
+ },
5358
+ {
5359
+ "epoch": 1.1988716502115655,
5360
+ "grad_norm": 10.25,
5361
+ "learning_rate": 1.2008865605480557e-05,
5362
+ "loss": 0.9362,
5363
+ "mean_token_accuracy": 0.7766359865665435,
5364
+ "num_tokens": 6601976.0,
5365
+ "step": 5950
5366
+ },
5367
+ {
5368
+ "epoch": 1.2008865605480556,
5369
+ "grad_norm": 10.0625,
5370
+ "learning_rate": 1.1995432869903958e-05,
5371
+ "loss": 0.7565,
5372
+ "mean_token_accuracy": 0.8038519501686097,
5373
+ "num_tokens": 6613911.0,
5374
+ "step": 5960
5375
+ },
5376
+ {
5377
+ "epoch": 1.2029014708845456,
5378
+ "grad_norm": 14.0625,
5379
+ "learning_rate": 1.1982000134327357e-05,
5380
+ "loss": 0.89,
5381
+ "mean_token_accuracy": 0.7806312680244446,
5382
+ "num_tokens": 6625392.0,
5383
+ "step": 5970
5384
+ },
5385
+ {
5386
+ "epoch": 1.2049163812210357,
5387
+ "grad_norm": 10.625,
5388
+ "learning_rate": 1.1968567398750757e-05,
5389
+ "loss": 0.7505,
5390
+ "mean_token_accuracy": 0.8176207900047302,
5391
+ "num_tokens": 6635806.0,
5392
+ "step": 5980
5393
+ },
5394
+ {
5395
+ "epoch": 1.2069312915575257,
5396
+ "grad_norm": 11.0625,
5397
+ "learning_rate": 1.1955134663174158e-05,
5398
+ "loss": 0.7663,
5399
+ "mean_token_accuracy": 0.8061110198497772,
5400
+ "num_tokens": 6646644.0,
5401
+ "step": 5990
5402
+ },
5403
+ {
5404
+ "epoch": 1.2089462018940158,
5405
+ "grad_norm": 11.0625,
5406
+ "learning_rate": 1.1941701927597555e-05,
5407
+ "loss": 0.7133,
5408
+ "mean_token_accuracy": 0.8181872367858887,
5409
+ "num_tokens": 6657605.0,
5410
+ "step": 6000
5411
  }
5412
  ],
5413
  "logging_steps": 10,
 
5427
  "attributes": {}
5428
  }
5429
  },
5430
+ "total_flos": 8046973169571840.0,
5431
  "train_batch_size": 8,
5432
  "trial_name": null,
5433
  "trial_params": null