Plofski committed
Commit 897c7d8 · verified · 1 Parent(s): 2e67f99

Training in progress, step 8000, checkpoint

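This commit updates the files under last-checkpoint/ that the Hugging Face Trainer writes when it saves an intermediate checkpoint (here at step 8000). A minimal sketch for pulling just that folder from the Hub, assuming huggingface_hub is installed; the repo_id below is a placeholder, since the commit page does not show the full repository name:

from huggingface_hub import snapshot_download

# Placeholder repo_id; substitute the actual model repository this commit belongs to.
local_dir = snapshot_download(
    repo_id="Plofski/<model-repo>",
    allow_patterns=["last-checkpoint/*"],  # fetch only the checkpoint folder
)
print("checkpoint downloaded to", local_dir)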
last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:445a128b149954e68d1af5a00630de0dc09e06cb78963d856ab9efe3a52157d9
+ oid sha256:872e13706948c7a141e635bc023a52fbe531ae28f59acde5c4f237db2a94c6b1
  size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:66488112339a052865703e73eb9d72b3f5f142ea84ea68d0b968dcf9eb080bb8
+ oid sha256:c368469d799ee657aa6f345b72b1b063d1207badee5ef2708584fc5b29dd1fa0
  size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
  version https://git-lfs.github.com/spec/v1
- oid sha256:88ec7f0fcbb8e83ac60a847dffeda029d1a65c084556d4707d85ad106bc04ba0
+ oid sha256:6f8c5daae46e22d0555f52515cb826d70a09c178d27140188b1fd68ded8645a9
  size 1465
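The three diffs above touch Git LFS pointer files: each pointer stores only the sha256 oid and byte size of the real payload, and this commit swaps in new oids for the retrained weights, optimizer state and scheduler state. A small sketch for checking that a locally downloaded file matches its pointer, using only the standard library (the local path is illustrative; the oid and size are the new model.safetensors values from this commit):

import hashlib
import os

def sha256_of(path, chunk_size=1 << 20):
    """Stream the file so large checkpoints don't need to fit in memory."""
    h = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(chunk_size), b""):
            h.update(chunk)
    return h.hexdigest()

path = "last-checkpoint/model.safetensors"  # illustrative local path
print("size matches:", os.path.getsize(path) == 536223056)
print("oid matches:", sha256_of(path) == "872e13706948c7a141e635bc023a52fbe531ae28f59acde5c4f237db2a94c6b1")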
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
  "best_global_step": null,
  "best_metric": null,
  "best_model_checkpoint": null,
- "epoch": 1.5111827523675196,
+ "epoch": 1.611928269192021,
  "eval_steps": 500,
- "global_step": 7500,
+ "global_step": 8000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
@@ -6758,6 +6758,456 @@
  "mean_token_accuracy": 0.8061196208000183,
  "num_tokens": 8312344.0,
  "step": 7500
+ },
+ {
+ "epoch": 1.5131976627040098,
+ "grad_norm": 12.5625,
+ "learning_rate": 9.91335885553093e-06,
+ "loss": 0.7742,
+ "mean_token_accuracy": 0.8027086615562439,
+ "num_tokens": 8323025.0,
+ "step": 7510
+ },
+ {
+ "epoch": 1.5152125730404997,
+ "grad_norm": 12.5,
+ "learning_rate": 9.89992611995433e-06,
+ "loss": 0.7713,
+ "mean_token_accuracy": 0.8100184202194214,
+ "num_tokens": 8333331.0,
+ "step": 7520
+ },
+ {
+ "epoch": 1.5172274833769896,
+ "grad_norm": 12.8125,
+ "learning_rate": 9.886493384377729e-06,
+ "loss": 0.8154,
+ "mean_token_accuracy": 0.7895949363708497,
+ "num_tokens": 8344471.0,
+ "step": 7530
+ },
+ {
+ "epoch": 1.5192423937134798,
+ "grad_norm": 12.3125,
+ "learning_rate": 9.87306064880113e-06,
+ "loss": 0.8998,
+ "mean_token_accuracy": 0.7804327428340911,
+ "num_tokens": 8356442.0,
+ "step": 7540
+ },
+ {
+ "epoch": 1.5212573040499697,
+ "grad_norm": 11.9375,
+ "learning_rate": 9.859627913224528e-06,
+ "loss": 0.8346,
+ "mean_token_accuracy": 0.7952861070632935,
+ "num_tokens": 8367610.0,
+ "step": 7550
+ },
+ {
+ "epoch": 1.5232722143864597,
+ "grad_norm": 11.6875,
+ "learning_rate": 9.846195177647929e-06,
+ "loss": 0.8045,
+ "mean_token_accuracy": 0.8029259443283081,
+ "num_tokens": 8378828.0,
+ "step": 7560
+ },
+ {
+ "epoch": 1.5252871247229498,
+ "grad_norm": 12.5625,
+ "learning_rate": 9.832762442071328e-06,
+ "loss": 0.7931,
+ "mean_token_accuracy": 0.8047609508037568,
+ "num_tokens": 8389283.0,
+ "step": 7570
+ },
+ {
+ "epoch": 1.52730203505944,
+ "grad_norm": 11.375,
+ "learning_rate": 9.819329706494728e-06,
+ "loss": 0.8274,
+ "mean_token_accuracy": 0.7971135258674622,
+ "num_tokens": 8401265.0,
+ "step": 7580
+ },
+ {
+ "epoch": 1.5293169453959299,
+ "grad_norm": 13.3125,
+ "learning_rate": 9.805896970918129e-06,
+ "loss": 0.7739,
+ "mean_token_accuracy": 0.803637433052063,
+ "num_tokens": 8411843.0,
+ "step": 7590
+ },
+ {
+ "epoch": 1.5313318557324198,
+ "grad_norm": 10.375,
+ "learning_rate": 9.792464235341528e-06,
+ "loss": 0.882,
+ "mean_token_accuracy": 0.7834074079990387,
+ "num_tokens": 8421967.0,
+ "step": 7600
+ },
+ {
+ "epoch": 1.53334676606891,
+ "grad_norm": 11.9375,
+ "learning_rate": 9.779031499764928e-06,
+ "loss": 0.8852,
+ "mean_token_accuracy": 0.7851302027702332,
+ "num_tokens": 8435114.0,
+ "step": 7610
+ },
+ {
+ "epoch": 1.5353616764054,
+ "grad_norm": 9.5625,
+ "learning_rate": 9.765598764188327e-06,
+ "loss": 0.7346,
+ "mean_token_accuracy": 0.8161056697368622,
+ "num_tokens": 8446359.0,
+ "step": 7620
+ },
+ {
+ "epoch": 1.53737658674189,
+ "grad_norm": 12.25,
+ "learning_rate": 9.752166028611728e-06,
+ "loss": 0.828,
+ "mean_token_accuracy": 0.7961056709289551,
+ "num_tokens": 8456390.0,
+ "step": 7630
+ },
+ {
+ "epoch": 1.53939149707838,
+ "grad_norm": 9.625,
+ "learning_rate": 9.738733293035128e-06,
+ "loss": 0.7805,
+ "mean_token_accuracy": 0.8055654644966126,
+ "num_tokens": 8467737.0,
+ "step": 7640
+ },
+ {
+ "epoch": 1.5414064074148701,
+ "grad_norm": 11.5625,
+ "learning_rate": 9.725300557458527e-06,
+ "loss": 0.784,
+ "mean_token_accuracy": 0.8105962395668029,
+ "num_tokens": 8478450.0,
+ "step": 7650
+ },
+ {
+ "epoch": 1.54342131775136,
+ "grad_norm": 10.625,
+ "learning_rate": 9.711867821881928e-06,
+ "loss": 0.8922,
+ "mean_token_accuracy": 0.7858581006526947,
+ "num_tokens": 8489331.0,
+ "step": 7660
+ },
+ {
+ "epoch": 1.54543622808785,
+ "grad_norm": 11.625,
+ "learning_rate": 9.698435086305326e-06,
+ "loss": 0.7907,
+ "mean_token_accuracy": 0.8015705049037933,
+ "num_tokens": 8499569.0,
+ "step": 7670
+ },
+ {
+ "epoch": 1.5474511384243401,
+ "grad_norm": 13.9375,
+ "learning_rate": 9.685002350728727e-06,
+ "loss": 0.9439,
+ "mean_token_accuracy": 0.7698397815227509,
+ "num_tokens": 8510367.0,
+ "step": 7680
+ },
+ {
+ "epoch": 1.5494660487608303,
+ "grad_norm": 11.5,
+ "learning_rate": 9.671569615152127e-06,
+ "loss": 0.7814,
+ "mean_token_accuracy": 0.8014598250389099,
+ "num_tokens": 8521247.0,
+ "step": 7690
+ },
+ {
+ "epoch": 1.5514809590973202,
+ "grad_norm": 11.25,
+ "learning_rate": 9.658136879575526e-06,
+ "loss": 0.7568,
+ "mean_token_accuracy": 0.8163803517818451,
+ "num_tokens": 8532047.0,
+ "step": 7700
+ },
+ {
+ "epoch": 1.5534958694338101,
+ "grad_norm": 11.6875,
+ "learning_rate": 9.644704143998927e-06,
+ "loss": 0.7684,
+ "mean_token_accuracy": 0.8017966628074646,
+ "num_tokens": 8543022.0,
+ "step": 7710
+ },
+ {
+ "epoch": 1.5555107797703003,
+ "grad_norm": 11.1875,
+ "learning_rate": 9.631271408422326e-06,
+ "loss": 0.7742,
+ "mean_token_accuracy": 0.8069942653179168,
+ "num_tokens": 8554086.0,
+ "step": 7720
+ },
+ {
+ "epoch": 1.5575256901067902,
+ "grad_norm": 10.9375,
+ "learning_rate": 9.617838672845726e-06,
+ "loss": 0.8395,
+ "mean_token_accuracy": 0.7957546770572662,
+ "num_tokens": 8565626.0,
+ "step": 7730
+ },
+ {
+ "epoch": 1.5595406004432801,
+ "grad_norm": 13.0625,
+ "learning_rate": 9.604405937269125e-06,
+ "loss": 0.7229,
+ "mean_token_accuracy": 0.8145296096801757,
+ "num_tokens": 8576046.0,
+ "step": 7740
+ },
+ {
+ "epoch": 1.5615555107797703,
+ "grad_norm": 10.3125,
+ "learning_rate": 9.590973201692525e-06,
+ "loss": 0.8449,
+ "mean_token_accuracy": 0.793234920501709,
+ "num_tokens": 8586936.0,
+ "step": 7750
+ },
+ {
+ "epoch": 1.5635704211162604,
+ "grad_norm": 14.125,
+ "learning_rate": 9.577540466115926e-06,
+ "loss": 0.8077,
+ "mean_token_accuracy": 0.7942093849182129,
+ "num_tokens": 8599134.0,
+ "step": 7760
+ },
+ {
+ "epoch": 1.5655853314527504,
+ "grad_norm": 12.5,
+ "learning_rate": 9.564107730539325e-06,
+ "loss": 0.7583,
+ "mean_token_accuracy": 0.8089915156364441,
+ "num_tokens": 8609584.0,
+ "step": 7770
+ },
+ {
+ "epoch": 1.5676002417892403,
+ "grad_norm": 11.1875,
+ "learning_rate": 9.550674994962725e-06,
+ "loss": 0.7924,
+ "mean_token_accuracy": 0.804536098241806,
+ "num_tokens": 8621578.0,
+ "step": 7780
+ },
+ {
+ "epoch": 1.5696151521257304,
+ "grad_norm": 13.5625,
+ "learning_rate": 9.537242259386124e-06,
+ "loss": 0.7905,
+ "mean_token_accuracy": 0.798646092414856,
+ "num_tokens": 8632953.0,
+ "step": 7790
+ },
+ {
+ "epoch": 1.5716300624622206,
+ "grad_norm": 11.125,
+ "learning_rate": 9.523809523809525e-06,
+ "loss": 0.7543,
+ "mean_token_accuracy": 0.8105603516101837,
+ "num_tokens": 8643817.0,
+ "step": 7800
+ },
+ {
+ "epoch": 1.5736449727987103,
+ "grad_norm": 10.75,
+ "learning_rate": 9.510376788232925e-06,
+ "loss": 0.8613,
+ "mean_token_accuracy": 0.7865113198757172,
+ "num_tokens": 8654921.0,
+ "step": 7810
+ },
+ {
+ "epoch": 1.5756598831352004,
+ "grad_norm": 13.375,
+ "learning_rate": 9.496944052656324e-06,
+ "loss": 0.7682,
+ "mean_token_accuracy": 0.8063505351543426,
+ "num_tokens": 8664722.0,
+ "step": 7820
+ },
+ {
+ "epoch": 1.5776747934716906,
+ "grad_norm": 13.125,
+ "learning_rate": 9.483511317079725e-06,
+ "loss": 0.8011,
+ "mean_token_accuracy": 0.8007908463478088,
+ "num_tokens": 8675437.0,
+ "step": 7830
+ },
+ {
+ "epoch": 1.5796897038081805,
+ "grad_norm": 15.3125,
+ "learning_rate": 9.470078581503123e-06,
+ "loss": 0.769,
+ "mean_token_accuracy": 0.8038370370864868,
+ "num_tokens": 8685254.0,
+ "step": 7840
+ },
+ {
+ "epoch": 1.5817046141446705,
+ "grad_norm": 12.875,
+ "learning_rate": 9.456645845926524e-06,
+ "loss": 0.8023,
+ "mean_token_accuracy": 0.8047023892402649,
+ "num_tokens": 8695435.0,
+ "step": 7850
+ },
+ {
+ "epoch": 1.5837195244811606,
+ "grad_norm": 12.3125,
+ "learning_rate": 9.443213110349923e-06,
+ "loss": 0.7938,
+ "mean_token_accuracy": 0.7964716255664825,
+ "num_tokens": 8706838.0,
+ "step": 7860
+ },
+ {
+ "epoch": 1.5857344348176508,
+ "grad_norm": 10.875,
+ "learning_rate": 9.429780374773323e-06,
+ "loss": 0.8388,
+ "mean_token_accuracy": 0.7962932288646698,
+ "num_tokens": 8718011.0,
+ "step": 7870
+ },
+ {
+ "epoch": 1.5877493451541407,
+ "grad_norm": 9.75,
+ "learning_rate": 9.416347639196724e-06,
+ "loss": 0.8319,
+ "mean_token_accuracy": 0.788075852394104,
+ "num_tokens": 8729277.0,
+ "step": 7880
+ },
+ {
+ "epoch": 1.5897642554906306,
+ "grad_norm": 10.75,
+ "learning_rate": 9.402914903620123e-06,
+ "loss": 0.751,
+ "mean_token_accuracy": 0.8099392414093017,
+ "num_tokens": 8739674.0,
+ "step": 7890
+ },
+ {
+ "epoch": 1.5917791658271208,
+ "grad_norm": 9.9375,
+ "learning_rate": 9.389482168043523e-06,
+ "loss": 0.7676,
+ "mean_token_accuracy": 0.8102536201477051,
+ "num_tokens": 8750307.0,
+ "step": 7900
+ },
+ {
+ "epoch": 1.5937940761636107,
+ "grad_norm": 8.8125,
+ "learning_rate": 9.376049432466922e-06,
+ "loss": 0.7677,
+ "mean_token_accuracy": 0.8090816259384155,
+ "num_tokens": 8760932.0,
+ "step": 7910
+ },
+ {
+ "epoch": 1.5958089865001006,
+ "grad_norm": 11.8125,
+ "learning_rate": 9.362616696890323e-06,
+ "loss": 0.9654,
+ "mean_token_accuracy": 0.7688835144042969,
+ "num_tokens": 8772244.0,
+ "step": 7920
+ },
+ {
+ "epoch": 1.5978238968365908,
+ "grad_norm": 10.4375,
+ "learning_rate": 9.349183961313723e-06,
+ "loss": 0.7429,
+ "mean_token_accuracy": 0.8144657909870148,
+ "num_tokens": 8783351.0,
+ "step": 7930
+ },
+ {
+ "epoch": 1.599838807173081,
+ "grad_norm": 11.5625,
+ "learning_rate": 9.335751225737122e-06,
+ "loss": 0.82,
+ "mean_token_accuracy": 0.7950020253658294,
+ "num_tokens": 8793990.0,
+ "step": 7940
+ },
+ {
+ "epoch": 1.6018537175095708,
+ "grad_norm": 13.0625,
+ "learning_rate": 9.322318490160523e-06,
+ "loss": 0.7849,
+ "mean_token_accuracy": 0.8066163957118988,
+ "num_tokens": 8804970.0,
+ "step": 7950
+ },
+ {
+ "epoch": 1.6038686278460608,
+ "grad_norm": 11.75,
+ "learning_rate": 9.308885754583921e-06,
+ "loss": 0.8965,
+ "mean_token_accuracy": 0.7794711530208588,
+ "num_tokens": 8816123.0,
+ "step": 7960
+ },
+ {
+ "epoch": 1.605883538182551,
+ "grad_norm": 11.9375,
+ "learning_rate": 9.295453019007322e-06,
+ "loss": 0.7398,
+ "mean_token_accuracy": 0.8103044688701629,
+ "num_tokens": 8826861.0,
+ "step": 7970
+ },
+ {
+ "epoch": 1.607898448519041,
+ "grad_norm": 11.5625,
+ "learning_rate": 9.282020283430722e-06,
+ "loss": 0.7859,
+ "mean_token_accuracy": 0.8065201163291931,
+ "num_tokens": 8837870.0,
+ "step": 7980
+ },
+ {
+ "epoch": 1.6099133588555308,
+ "grad_norm": 10.875,
+ "learning_rate": 9.268587547854121e-06,
+ "loss": 0.7387,
+ "mean_token_accuracy": 0.8145683348178864,
+ "num_tokens": 8848365.0,
+ "step": 7990
+ },
+ {
+ "epoch": 1.611928269192021,
+ "grad_norm": 10.9375,
+ "learning_rate": 9.255154812277522e-06,
+ "loss": 0.9003,
+ "mean_token_accuracy": 0.7778131783008575,
+ "num_tokens": 8860114.0,
+ "step": 8000
  }
  ],
  "logging_steps": 10,
@@ -6777,7 +7227,7 @@
  "attributes": {}
  }
  },
- "total_flos": 1.006244257019904e+16,
+ "total_flos": 1.0725865607073792e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null