Plofski commited on
Commit
9f463cd
·
verified ·
1 Parent(s): e0ce647

Training in progress, step 7000, checkpoint

Browse files
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8da83f3c30b9473fef2b931e6b47e4814c76e805b02501c93641aed6bc786ead
3
  size 536223056
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e99f98def5707de3be1588197fe5096482fac3f483b22d6d05ac701448ef1f6
3
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8c7391e5803dc14420bae3b5326bbd52abb5236b17e67147b31348d199ebeef
3
  size 1072594443
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c36c51441c6a4e72e59a3d4e0e9b5b84bfb5e8d67b647194ab6c6bfde7983c0e
3
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:0717c8780efa444a6d80d462b725b32f107f9a3c24550aaaa04a7d27cefba76b
3
  size 1465
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:326c38bef4f14b97646caa84204f32859351159ff635853df88679a10264e29a
3
  size 1465
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
- "epoch": 1.309691718718517,
6
  "eval_steps": 500,
7
- "global_step": 6500,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
@@ -5858,6 +5858,456 @@
5858
  "mean_token_accuracy": 0.7886571526527405,
5859
  "num_tokens": 7207013.0,
5860
  "step": 6500
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5861
  }
5862
  ],
5863
  "logging_steps": 10,
@@ -5877,7 +6327,7 @@
5877
  "attributes": {}
5878
  }
5879
  },
5880
- "total_flos": 8718380552103936.0,
5881
  "train_batch_size": 8,
5882
  "trial_name": null,
5883
  "trial_params": null
 
2
  "best_global_step": null,
3
  "best_metric": null,
4
  "best_model_checkpoint": null,
5
+ "epoch": 1.4104372355430184,
6
  "eval_steps": 500,
7
+ "global_step": 7000,
8
  "is_hyper_param_search": false,
9
  "is_local_process_zero": true,
10
  "is_world_process_zero": true,
 
5858
  "mean_token_accuracy": 0.7886571526527405,
5859
  "num_tokens": 7207013.0,
5860
  "step": 6500
5861
+ },
5862
+ {
5863
+ "epoch": 1.311706629055007,
5864
+ "grad_norm": 10.0625,
5865
+ "learning_rate": 1.1256632413190946e-05,
5866
+ "loss": 0.7398,
5867
+ "mean_token_accuracy": 0.8139720261096954,
5868
+ "num_tokens": 7217616.0,
5869
+ "step": 6510
5870
+ },
5871
+ {
5872
+ "epoch": 1.313721539391497,
5873
+ "grad_norm": 12.875,
5874
+ "learning_rate": 1.1243199677614347e-05,
5875
+ "loss": 0.8148,
5876
+ "mean_token_accuracy": 0.7918058097362518,
5877
+ "num_tokens": 7228902.0,
5878
+ "step": 6520
5879
+ },
5880
+ {
5881
+ "epoch": 1.315736449727987,
5882
+ "grad_norm": 12.5,
5883
+ "learning_rate": 1.1229766942037747e-05,
5884
+ "loss": 0.849,
5885
+ "mean_token_accuracy": 0.7959416568279266,
5886
+ "num_tokens": 7239105.0,
5887
+ "step": 6530
5888
+ },
5889
+ {
5890
+ "epoch": 1.3177513600644772,
5891
+ "grad_norm": 11.1875,
5892
+ "learning_rate": 1.1216334206461146e-05,
5893
+ "loss": 0.7653,
5894
+ "mean_token_accuracy": 0.8075124859809876,
5895
+ "num_tokens": 7248600.0,
5896
+ "step": 6540
5897
+ },
5898
+ {
5899
+ "epoch": 1.3197662704009672,
5900
+ "grad_norm": 11.4375,
5901
+ "learning_rate": 1.1202901470884547e-05,
5902
+ "loss": 0.8824,
5903
+ "mean_token_accuracy": 0.7894264698028565,
5904
+ "num_tokens": 7260572.0,
5905
+ "step": 6550
5906
+ },
5907
+ {
5908
+ "epoch": 1.321781180737457,
5909
+ "grad_norm": 12.1875,
5910
+ "learning_rate": 1.1189468735307947e-05,
5911
+ "loss": 0.8524,
5912
+ "mean_token_accuracy": 0.7949903309345245,
5913
+ "num_tokens": 7270963.0,
5914
+ "step": 6560
5915
+ },
5916
+ {
5917
+ "epoch": 1.3237960910739472,
5918
+ "grad_norm": 13.0,
5919
+ "learning_rate": 1.1176035999731348e-05,
5920
+ "loss": 0.7687,
5921
+ "mean_token_accuracy": 0.8052358329296112,
5922
+ "num_tokens": 7282254.0,
5923
+ "step": 6570
5924
+ },
5925
+ {
5926
+ "epoch": 1.3258110014104372,
5927
+ "grad_norm": 12.1875,
5928
+ "learning_rate": 1.1162603264154745e-05,
5929
+ "loss": 0.771,
5930
+ "mean_token_accuracy": 0.8071496605873107,
5931
+ "num_tokens": 7293472.0,
5932
+ "step": 6580
5933
+ },
5934
+ {
5935
+ "epoch": 1.3278259117469273,
5936
+ "grad_norm": 16.0,
5937
+ "learning_rate": 1.1149170528578145e-05,
5938
+ "loss": 0.7544,
5939
+ "mean_token_accuracy": 0.8064080238342285,
5940
+ "num_tokens": 7302549.0,
5941
+ "step": 6590
5942
+ },
5943
+ {
5944
+ "epoch": 1.3298408220834173,
5945
+ "grad_norm": 11.625,
5946
+ "learning_rate": 1.1135737793001546e-05,
5947
+ "loss": 0.7427,
5948
+ "mean_token_accuracy": 0.8121874392032623,
5949
+ "num_tokens": 7313898.0,
5950
+ "step": 6600
5951
+ },
5952
+ {
5953
+ "epoch": 1.3318557324199074,
5954
+ "grad_norm": 11.4375,
5955
+ "learning_rate": 1.1122305057424945e-05,
5956
+ "loss": 0.8996,
5957
+ "mean_token_accuracy": 0.7810469567775726,
5958
+ "num_tokens": 7324658.0,
5959
+ "step": 6610
5960
+ },
5961
+ {
5962
+ "epoch": 1.3338706427563973,
5963
+ "grad_norm": 11.625,
5964
+ "learning_rate": 1.1108872321848345e-05,
5965
+ "loss": 0.7865,
5966
+ "mean_token_accuracy": 0.806594967842102,
5967
+ "num_tokens": 7335673.0,
5968
+ "step": 6620
5969
+ },
5970
+ {
5971
+ "epoch": 1.3358855530928873,
5972
+ "grad_norm": 12.6875,
5973
+ "learning_rate": 1.1095439586271746e-05,
5974
+ "loss": 0.9176,
5975
+ "mean_token_accuracy": 0.7800018846988678,
5976
+ "num_tokens": 7347253.0,
5977
+ "step": 6630
5978
+ },
5979
+ {
5980
+ "epoch": 1.3379004634293774,
5981
+ "grad_norm": 12.6875,
5982
+ "learning_rate": 1.1082006850695145e-05,
5983
+ "loss": 0.8083,
5984
+ "mean_token_accuracy": 0.8040257275104523,
5985
+ "num_tokens": 7358476.0,
5986
+ "step": 6640
5987
+ },
5988
+ {
5989
+ "epoch": 1.3399153737658676,
5990
+ "grad_norm": 14.0625,
5991
+ "learning_rate": 1.1068574115118545e-05,
5992
+ "loss": 0.9627,
5993
+ "mean_token_accuracy": 0.7791651308536529,
5994
+ "num_tokens": 7369343.0,
5995
+ "step": 6650
5996
+ },
5997
+ {
5998
+ "epoch": 1.3419302841023575,
5999
+ "grad_norm": 14.375,
6000
+ "learning_rate": 1.1055141379541944e-05,
6001
+ "loss": 0.869,
6002
+ "mean_token_accuracy": 0.791654235124588,
6003
+ "num_tokens": 7380142.0,
6004
+ "step": 6660
6005
+ },
6006
+ {
6007
+ "epoch": 1.3439451944388474,
6008
+ "grad_norm": 16.75,
6009
+ "learning_rate": 1.1041708643965345e-05,
6010
+ "loss": 0.8959,
6011
+ "mean_token_accuracy": 0.7814781248569489,
6012
+ "num_tokens": 7391681.0,
6013
+ "step": 6670
6014
+ },
6015
+ {
6016
+ "epoch": 1.3459601047753376,
6017
+ "grad_norm": 25.75,
6018
+ "learning_rate": 1.1028275908388743e-05,
6019
+ "loss": 0.7937,
6020
+ "mean_token_accuracy": 0.8048185467720032,
6021
+ "num_tokens": 7402373.0,
6022
+ "step": 6680
6023
+ },
6024
+ {
6025
+ "epoch": 1.3479750151118275,
6026
+ "grad_norm": 11.25,
6027
+ "learning_rate": 1.1014843172812144e-05,
6028
+ "loss": 0.8797,
6029
+ "mean_token_accuracy": 0.7896045446395874,
6030
+ "num_tokens": 7412646.0,
6031
+ "step": 6690
6032
+ },
6033
+ {
6034
+ "epoch": 1.3499899254483174,
6035
+ "grad_norm": 11.5625,
6036
+ "learning_rate": 1.1001410437235544e-05,
6037
+ "loss": 0.7739,
6038
+ "mean_token_accuracy": 0.8085869729518891,
6039
+ "num_tokens": 7423087.0,
6040
+ "step": 6700
6041
+ },
6042
+ {
6043
+ "epoch": 1.3520048357848076,
6044
+ "grad_norm": 42.5,
6045
+ "learning_rate": 1.0987977701658943e-05,
6046
+ "loss": 0.8267,
6047
+ "mean_token_accuracy": 0.7931196630001068,
6048
+ "num_tokens": 7433496.0,
6049
+ "step": 6710
6050
+ },
6051
+ {
6052
+ "epoch": 1.3540197461212977,
6053
+ "grad_norm": 11.625,
6054
+ "learning_rate": 1.0974544966082344e-05,
6055
+ "loss": 0.7688,
6056
+ "mean_token_accuracy": 0.8101352214813232,
6057
+ "num_tokens": 7444890.0,
6058
+ "step": 6720
6059
+ },
6060
+ {
6061
+ "epoch": 1.3560346564577876,
6062
+ "grad_norm": 9.9375,
6063
+ "learning_rate": 1.0961112230505744e-05,
6064
+ "loss": 0.8294,
6065
+ "mean_token_accuracy": 0.7928038239479065,
6066
+ "num_tokens": 7456497.0,
6067
+ "step": 6730
6068
+ },
6069
+ {
6070
+ "epoch": 1.3580495667942776,
6071
+ "grad_norm": 13.0,
6072
+ "learning_rate": 1.0947679494929145e-05,
6073
+ "loss": 0.8978,
6074
+ "mean_token_accuracy": 0.7794729173183441,
6075
+ "num_tokens": 7466678.0,
6076
+ "step": 6740
6077
+ },
6078
+ {
6079
+ "epoch": 1.3600644771307677,
6080
+ "grad_norm": 12.0625,
6081
+ "learning_rate": 1.0934246759352542e-05,
6082
+ "loss": 0.8052,
6083
+ "mean_token_accuracy": 0.8050779700279236,
6084
+ "num_tokens": 7477058.0,
6085
+ "step": 6750
6086
+ },
6087
+ {
6088
+ "epoch": 1.3620793874672577,
6089
+ "grad_norm": 11.75,
6090
+ "learning_rate": 1.0920814023775943e-05,
6091
+ "loss": 0.8537,
6092
+ "mean_token_accuracy": 0.7877449512481689,
6093
+ "num_tokens": 7487077.0,
6094
+ "step": 6760
6095
+ },
6096
+ {
6097
+ "epoch": 1.3640942978037478,
6098
+ "grad_norm": 14.0,
6099
+ "learning_rate": 1.0907381288199343e-05,
6100
+ "loss": 0.8519,
6101
+ "mean_token_accuracy": 0.7857004582881928,
6102
+ "num_tokens": 7497355.0,
6103
+ "step": 6770
6104
+ },
6105
+ {
6106
+ "epoch": 1.3661092081402377,
6107
+ "grad_norm": 10.4375,
6108
+ "learning_rate": 1.0893948552622742e-05,
6109
+ "loss": 0.796,
6110
+ "mean_token_accuracy": 0.7996562838554382,
6111
+ "num_tokens": 7508344.0,
6112
+ "step": 6780
6113
+ },
6114
+ {
6115
+ "epoch": 1.3681241184767279,
6116
+ "grad_norm": 11.75,
6117
+ "learning_rate": 1.0880515817046142e-05,
6118
+ "loss": 0.8427,
6119
+ "mean_token_accuracy": 0.7912454545497895,
6120
+ "num_tokens": 7519520.0,
6121
+ "step": 6790
6122
+ },
6123
+ {
6124
+ "epoch": 1.3701390288132178,
6125
+ "grad_norm": 9.25,
6126
+ "learning_rate": 1.0867083081469543e-05,
6127
+ "loss": 0.8216,
6128
+ "mean_token_accuracy": 0.7908532798290253,
6129
+ "num_tokens": 7531198.0,
6130
+ "step": 6800
6131
+ },
6132
+ {
6133
+ "epoch": 1.3721539391497077,
6134
+ "grad_norm": 10.625,
6135
+ "learning_rate": 1.085365034589294e-05,
6136
+ "loss": 0.8054,
6137
+ "mean_token_accuracy": 0.8009598433971405,
6138
+ "num_tokens": 7542242.0,
6139
+ "step": 6810
6140
+ },
6141
+ {
6142
+ "epoch": 1.3741688494861979,
6143
+ "grad_norm": 11.8125,
6144
+ "learning_rate": 1.084021761031634e-05,
6145
+ "loss": 0.8353,
6146
+ "mean_token_accuracy": 0.7952351868152618,
6147
+ "num_tokens": 7553773.0,
6148
+ "step": 6820
6149
+ },
6150
+ {
6151
+ "epoch": 1.3761837598226878,
6152
+ "grad_norm": 14.4375,
6153
+ "learning_rate": 1.0826784874739741e-05,
6154
+ "loss": 0.7143,
6155
+ "mean_token_accuracy": 0.8181480646133423,
6156
+ "num_tokens": 7563382.0,
6157
+ "step": 6830
6158
+ },
6159
+ {
6160
+ "epoch": 1.378198670159178,
6161
+ "grad_norm": 10.3125,
6162
+ "learning_rate": 1.0813352139163142e-05,
6163
+ "loss": 0.8876,
6164
+ "mean_token_accuracy": 0.7816132783889771,
6165
+ "num_tokens": 7575581.0,
6166
+ "step": 6840
6167
+ },
6168
+ {
6169
+ "epoch": 1.380213580495668,
6170
+ "grad_norm": 10.4375,
6171
+ "learning_rate": 1.079991940358654e-05,
6172
+ "loss": 0.8494,
6173
+ "mean_token_accuracy": 0.7854238271713256,
6174
+ "num_tokens": 7587514.0,
6175
+ "step": 6850
6176
+ },
6177
+ {
6178
+ "epoch": 1.382228490832158,
6179
+ "grad_norm": 10.1875,
6180
+ "learning_rate": 1.0786486668009941e-05,
6181
+ "loss": 0.8978,
6182
+ "mean_token_accuracy": 0.7826396405696869,
6183
+ "num_tokens": 7599849.0,
6184
+ "step": 6860
6185
+ },
6186
+ {
6187
+ "epoch": 1.384243401168648,
6188
+ "grad_norm": 10.25,
6189
+ "learning_rate": 1.0773053932433342e-05,
6190
+ "loss": 0.7709,
6191
+ "mean_token_accuracy": 0.8018522441387177,
6192
+ "num_tokens": 7611329.0,
6193
+ "step": 6870
6194
+ },
6195
+ {
6196
+ "epoch": 1.386258311505138,
6197
+ "grad_norm": 12.625,
6198
+ "learning_rate": 1.075962119685674e-05,
6199
+ "loss": 0.8472,
6200
+ "mean_token_accuracy": 0.7918815612792969,
6201
+ "num_tokens": 7621873.0,
6202
+ "step": 6880
6203
+ },
6204
+ {
6205
+ "epoch": 1.388273221841628,
6206
+ "grad_norm": 14.75,
6207
+ "learning_rate": 1.0746188461280141e-05,
6208
+ "loss": 0.9434,
6209
+ "mean_token_accuracy": 0.7729089677333831,
6210
+ "num_tokens": 7633787.0,
6211
+ "step": 6890
6212
+ },
6213
+ {
6214
+ "epoch": 1.3902881321781182,
6215
+ "grad_norm": 11.9375,
6216
+ "learning_rate": 1.0732755725703542e-05,
6217
+ "loss": 0.8285,
6218
+ "mean_token_accuracy": 0.7966946125030517,
6219
+ "num_tokens": 7643889.0,
6220
+ "step": 6900
6221
+ },
6222
+ {
6223
+ "epoch": 1.3923030425146081,
6224
+ "grad_norm": 11.9375,
6225
+ "learning_rate": 1.0719322990126942e-05,
6226
+ "loss": 0.8414,
6227
+ "mean_token_accuracy": 0.7909166395664216,
6228
+ "num_tokens": 7655243.0,
6229
+ "step": 6910
6230
+ },
6231
+ {
6232
+ "epoch": 1.394317952851098,
6233
+ "grad_norm": 11.6875,
6234
+ "learning_rate": 1.070589025455034e-05,
6235
+ "loss": 0.7375,
6236
+ "mean_token_accuracy": 0.8181369364261627,
6237
+ "num_tokens": 7666042.0,
6238
+ "step": 6920
6239
+ },
6240
+ {
6241
+ "epoch": 1.3963328631875882,
6242
+ "grad_norm": 8.4375,
6243
+ "learning_rate": 1.069245751897374e-05,
6244
+ "loss": 0.8464,
6245
+ "mean_token_accuracy": 0.797934228181839,
6246
+ "num_tokens": 7676887.0,
6247
+ "step": 6930
6248
+ },
6249
+ {
6250
+ "epoch": 1.3983477735240781,
6251
+ "grad_norm": 12.625,
6252
+ "learning_rate": 1.067902478339714e-05,
6253
+ "loss": 0.8094,
6254
+ "mean_token_accuracy": 0.7948006153106689,
6255
+ "num_tokens": 7688051.0,
6256
+ "step": 6940
6257
+ },
6258
+ {
6259
+ "epoch": 1.4003626838605683,
6260
+ "grad_norm": 10.9375,
6261
+ "learning_rate": 1.0665592047820539e-05,
6262
+ "loss": 0.8955,
6263
+ "mean_token_accuracy": 0.782581114768982,
6264
+ "num_tokens": 7699480.0,
6265
+ "step": 6950
6266
+ },
6267
+ {
6268
+ "epoch": 1.4023775941970582,
6269
+ "grad_norm": 12.375,
6270
+ "learning_rate": 1.065215931224394e-05,
6271
+ "loss": 0.8763,
6272
+ "mean_token_accuracy": 0.7808327317237854,
6273
+ "num_tokens": 7710853.0,
6274
+ "step": 6960
6275
+ },
6276
+ {
6277
+ "epoch": 1.4043925045335484,
6278
+ "grad_norm": 12.875,
6279
+ "learning_rate": 1.063872657666734e-05,
6280
+ "loss": 0.8204,
6281
+ "mean_token_accuracy": 0.8006475508213043,
6282
+ "num_tokens": 7722290.0,
6283
+ "step": 6970
6284
+ },
6285
+ {
6286
+ "epoch": 1.4064074148700383,
6287
+ "grad_norm": 10.625,
6288
+ "learning_rate": 1.0625293841090737e-05,
6289
+ "loss": 0.839,
6290
+ "mean_token_accuracy": 0.7950416922569274,
6291
+ "num_tokens": 7733653.0,
6292
+ "step": 6980
6293
+ },
6294
+ {
6295
+ "epoch": 1.4084223252065282,
6296
+ "grad_norm": 12.75,
6297
+ "learning_rate": 1.0611861105514138e-05,
6298
+ "loss": 0.8964,
6299
+ "mean_token_accuracy": 0.7838905036449433,
6300
+ "num_tokens": 7744184.0,
6301
+ "step": 6990
6302
+ },
6303
+ {
6304
+ "epoch": 1.4104372355430184,
6305
+ "grad_norm": 12.0,
6306
+ "learning_rate": 1.0598428369937538e-05,
6307
+ "loss": 0.7874,
6308
+ "mean_token_accuracy": 0.7988959193229676,
6309
+ "num_tokens": 7754571.0,
6310
+ "step": 7000
6311
  }
6312
  ],
6313
  "logging_steps": 10,
 
6327
  "attributes": {}
6328
  }
6329
  },
6330
+ "total_flos": 9382261075611648.0,
6331
  "train_batch_size": 8,
6332
  "trial_name": null,
6333
  "trial_params": null