Plofski committed on
Commit 72c0f75 · verified · 1 Parent(s): 58a616c

Training in progress, step 9000, checkpoint

last-checkpoint/model.safetensors CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:16ca55f673b3ad7e95262a9d0296f5d8f2b7edb92a87d108841a282630107b61
+oid sha256:0ab362d2b3f9dedf1f0f43335f7b06eefee0b16e014fc83df80bc46c1b6044cf
 size 536223056
last-checkpoint/optimizer.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:0d62e7293944847d060dba9b65b5eb64216e79d1a688e81c3a95ed8977d7ce35
+oid sha256:b116e5cf316372406a0b75f20675173ce00a1448ad26470e8baba7a28543337c
 size 1072594443
last-checkpoint/scheduler.pt CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:e3aeebf16be5d93156c95c5c47fce9ca30893837ac7097fcc26a2ec8d4dc9f51
+oid sha256:553711fa7348e1460e8e11ff55c1e2ba08096c9266ea56894e269e1a647bd7f3
 size 1465
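
The three diffs above only change git-LFS pointers: the repository tracks the sha256 and byte size of each checkpoint file, while the binaries live in LFS storage. A minimal sketch for checking downloaded files against these pointers follows; the local directory path is an assumption for illustration and is not part of this commit.

```python
import hashlib
from pathlib import Path

# Expected sha256 / size pairs copied from the LFS pointers in this commit.
EXPECTED = {
    "model.safetensors": ("0ab362d2b3f9dedf1f0f43335f7b06eefee0b16e014fc83df80bc46c1b6044cf", 536223056),
    "optimizer.pt": ("b116e5cf316372406a0b75f20675173ce00a1448ad26470e8baba7a28543337c", 1072594443),
    "scheduler.pt": ("553711fa7348e1460e8e11ff55c1e2ba08096c9266ea56894e269e1a647bd7f3", 1465),
}

def verify(checkpoint_dir: str = "last-checkpoint") -> None:
    """Hash each downloaded file in chunks and compare it to the committed LFS pointer."""
    for name, (expected_oid, expected_size) in EXPECTED.items():
        path = Path(checkpoint_dir) / name
        digest = hashlib.sha256()
        with path.open("rb") as f:
            for chunk in iter(lambda: f.read(1 << 20), b""):
                digest.update(chunk)
        ok = digest.hexdigest() == expected_oid and path.stat().st_size == expected_size
        print(f"{name}: {'OK' if ok else 'MISMATCH'}")

if __name__ == "__main__":
    verify()
```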
last-checkpoint/trainer_state.json CHANGED
@@ -2,9 +2,9 @@
   "best_global_step": null,
   "best_metric": null,
   "best_model_checkpoint": null,
-  "epoch": 1.7126737860165222,
+  "epoch": 1.8134193028410235,
   "eval_steps": 500,
-  "global_step": 8500,
+  "global_step": 9000,
   "is_hyper_param_search": false,
   "is_local_process_zero": true,
   "is_world_process_zero": true,
@@ -7658,6 +7658,456 @@
       "mean_token_accuracy": 0.7890658736228943,
       "num_tokens": 9414095.0,
       "step": 8500
+    },
+    {
+      "epoch": 1.7146886963530124,
+      "grad_norm": 10.6875,
+      "learning_rate": 8.570085297870913e-06,
+      "loss": 0.7469,
+      "mean_token_accuracy": 0.8128379642963409,
+      "num_tokens": 9425228.0,
+      "step": 8510
+    },
+    {
+      "epoch": 1.7167036066895023,
+      "grad_norm": 11.5625,
+      "learning_rate": 8.556652562294312e-06,
+      "loss": 0.8635,
+      "mean_token_accuracy": 0.7921235024929046,
+      "num_tokens": 9436032.0,
+      "step": 8520
+    },
+    {
+      "epoch": 1.7187185170259922,
+      "grad_norm": 11.5,
+      "learning_rate": 8.54321982671771e-06,
+      "loss": 0.888,
+      "mean_token_accuracy": 0.7825378775596619,
+      "num_tokens": 9447102.0,
+      "step": 8530
+    },
+    {
+      "epoch": 1.7207334273624824,
+      "grad_norm": 13.25,
+      "learning_rate": 8.529787091141111e-06,
+      "loss": 0.774,
+      "mean_token_accuracy": 0.809429943561554,
+      "num_tokens": 9456885.0,
+      "step": 8540
+    },
+    {
+      "epoch": 1.7227483376989725,
+      "grad_norm": 10.9375,
+      "learning_rate": 8.516354355564512e-06,
+      "loss": 0.8615,
+      "mean_token_accuracy": 0.7885714650154114,
+      "num_tokens": 9469044.0,
+      "step": 8550
+    },
+    {
+      "epoch": 1.7247632480354624,
+      "grad_norm": 11.75,
+      "learning_rate": 8.502921619987912e-06,
+      "loss": 0.844,
+      "mean_token_accuracy": 0.7914490044116974,
+      "num_tokens": 9480822.0,
+      "step": 8560
+    },
+    {
+      "epoch": 1.7267781583719524,
+      "grad_norm": 12.0,
+      "learning_rate": 8.489488884411311e-06,
+      "loss": 0.8777,
+      "mean_token_accuracy": 0.7846565127372742,
+      "num_tokens": 9491344.0,
+      "step": 8570
+    },
+    {
+      "epoch": 1.7287930687084425,
+      "grad_norm": 12.0,
+      "learning_rate": 8.47605614883471e-06,
+      "loss": 0.739,
+      "mean_token_accuracy": 0.8153780162334442,
+      "num_tokens": 9501669.0,
+      "step": 8580
+    },
+    {
+      "epoch": 1.7308079790449327,
+      "grad_norm": 12.25,
+      "learning_rate": 8.46262341325811e-06,
+      "loss": 0.8505,
+      "mean_token_accuracy": 0.7858089745044708,
+      "num_tokens": 9512619.0,
+      "step": 8590
+    },
+    {
+      "epoch": 1.7328228893814224,
+      "grad_norm": 11.0,
+      "learning_rate": 8.44919067768151e-06,
+      "loss": 0.9526,
+      "mean_token_accuracy": 0.7702113807201385,
+      "num_tokens": 9525285.0,
+      "step": 8600
+    },
+    {
+      "epoch": 1.7348377997179125,
+      "grad_norm": 12.3125,
+      "learning_rate": 8.43575794210491e-06,
+      "loss": 0.8298,
+      "mean_token_accuracy": 0.7936967372894287,
+      "num_tokens": 9536607.0,
+      "step": 8610
+    },
+    {
+      "epoch": 1.7368527100544027,
+      "grad_norm": 11.875,
+      "learning_rate": 8.42232520652831e-06,
+      "loss": 0.8394,
+      "mean_token_accuracy": 0.7899468779563904,
+      "num_tokens": 9547548.0,
+      "step": 8620
+    },
+    {
+      "epoch": 1.7388676203908926,
+      "grad_norm": 10.875,
+      "learning_rate": 8.40889247095171e-06,
+      "loss": 0.7908,
+      "mean_token_accuracy": 0.7996328830718994,
+      "num_tokens": 9559379.0,
+      "step": 8630
+    },
+    {
+      "epoch": 1.7408825307273825,
+      "grad_norm": 11.5,
+      "learning_rate": 8.39545973537511e-06,
+      "loss": 0.7211,
+      "mean_token_accuracy": 0.815495389699936,
+      "num_tokens": 9569522.0,
+      "step": 8640
+    },
+    {
+      "epoch": 1.7428974410638727,
+      "grad_norm": 13.4375,
+      "learning_rate": 8.382026999798509e-06,
+      "loss": 0.8985,
+      "mean_token_accuracy": 0.7802120566368103,
+      "num_tokens": 9580936.0,
+      "step": 8650
+    },
+    {
+      "epoch": 1.7449123514003628,
+      "grad_norm": 11.625,
+      "learning_rate": 8.36859426422191e-06,
+      "loss": 0.8709,
+      "mean_token_accuracy": 0.7891122341156006,
+      "num_tokens": 9592276.0,
+      "step": 8660
+    },
+    {
+      "epoch": 1.7469272617368528,
+      "grad_norm": 11.4375,
+      "learning_rate": 8.35516152864531e-06,
+      "loss": 0.8361,
+      "mean_token_accuracy": 0.7939860701560975,
+      "num_tokens": 9603087.0,
+      "step": 8670
+    },
+    {
+      "epoch": 1.7489421720733427,
+      "grad_norm": 10.75,
+      "learning_rate": 8.34172879306871e-06,
+      "loss": 0.7678,
+      "mean_token_accuracy": 0.8116808116436005,
+      "num_tokens": 9613559.0,
+      "step": 8680
+    },
+    {
+      "epoch": 1.7509570824098328,
+      "grad_norm": 11.6875,
+      "learning_rate": 8.328296057492109e-06,
+      "loss": 0.7728,
+      "mean_token_accuracy": 0.8031215369701385,
+      "num_tokens": 9625395.0,
+      "step": 8690
+    },
+    {
+      "epoch": 1.7529719927463228,
+      "grad_norm": 13.6875,
+      "learning_rate": 8.314863321915508e-06,
+      "loss": 0.7771,
+      "mean_token_accuracy": 0.802968579530716,
+      "num_tokens": 9636437.0,
+      "step": 8700
+    },
+    {
+      "epoch": 1.7549869030828127,
+      "grad_norm": 12.5,
+      "learning_rate": 8.301430586338909e-06,
+      "loss": 0.8337,
+      "mean_token_accuracy": 0.79465811252594,
+      "num_tokens": 9647693.0,
+      "step": 8710
+    },
+    {
+      "epoch": 1.7570018134193028,
+      "grad_norm": 12.375,
+      "learning_rate": 8.287997850762309e-06,
+      "loss": 0.9093,
+      "mean_token_accuracy": 0.7816155433654786,
+      "num_tokens": 9659058.0,
+      "step": 8720
+    },
+    {
+      "epoch": 1.759016723755793,
+      "grad_norm": 10.125,
+      "learning_rate": 8.274565115185708e-06,
+      "loss": 0.8053,
+      "mean_token_accuracy": 0.7992358326911926,
+      "num_tokens": 9671229.0,
+      "step": 8730
+    },
+    {
+      "epoch": 1.761031634092283,
+      "grad_norm": 13.1875,
+      "learning_rate": 8.261132379609108e-06,
+      "loss": 0.9656,
+      "mean_token_accuracy": 0.7700483322143554,
+      "num_tokens": 9682074.0,
+      "step": 8740
+    },
+    {
+      "epoch": 1.7630465444287728,
+      "grad_norm": 12.625,
+      "learning_rate": 8.247699644032507e-06,
+      "loss": 0.775,
+      "mean_token_accuracy": 0.8076685547828675,
+      "num_tokens": 9692580.0,
+      "step": 8750
+    },
+    {
+      "epoch": 1.765061454765263,
+      "grad_norm": 13.3125,
+      "learning_rate": 8.234266908455908e-06,
+      "loss": 0.7936,
+      "mean_token_accuracy": 0.8011666655540466,
+      "num_tokens": 9703690.0,
+      "step": 8760
+    },
+    {
+      "epoch": 1.767076365101753,
+      "grad_norm": 12.625,
+      "learning_rate": 8.220834172879307e-06,
+      "loss": 0.7932,
+      "mean_token_accuracy": 0.8000261068344117,
+      "num_tokens": 9715172.0,
+      "step": 8770
+    },
+    {
+      "epoch": 1.7690912754382428,
+      "grad_norm": 13.375,
+      "learning_rate": 8.207401437302707e-06,
+      "loss": 0.8277,
+      "mean_token_accuracy": 0.7993273079395294,
+      "num_tokens": 9726400.0,
+      "step": 8780
+    },
+    {
+      "epoch": 1.771106185774733,
+      "grad_norm": 14.9375,
+      "learning_rate": 8.193968701726108e-06,
+      "loss": 0.8041,
+      "mean_token_accuracy": 0.801609891653061,
+      "num_tokens": 9738233.0,
+      "step": 8790
+    },
+    {
+      "epoch": 1.7731210961112231,
+      "grad_norm": 13.5625,
+      "learning_rate": 8.180535966149506e-06,
+      "loss": 0.9014,
+      "mean_token_accuracy": 0.7815512001514435,
+      "num_tokens": 9749323.0,
+      "step": 8800
+    },
+    {
+      "epoch": 1.775136006447713,
+      "grad_norm": 10.5,
+      "learning_rate": 8.167103230572907e-06,
+      "loss": 0.7797,
+      "mean_token_accuracy": 0.7976033747196197,
+      "num_tokens": 9760814.0,
+      "step": 8810
+    },
+    {
+      "epoch": 1.777150916784203,
+      "grad_norm": 13.125,
+      "learning_rate": 8.153670494996306e-06,
+      "loss": 0.7727,
+      "mean_token_accuracy": 0.8090618014335632,
+      "num_tokens": 9771342.0,
+      "step": 8820
+    },
+    {
+      "epoch": 1.7791658271206932,
+      "grad_norm": 11.625,
+      "learning_rate": 8.140237759419706e-06,
+      "loss": 0.7791,
+      "mean_token_accuracy": 0.805637001991272,
+      "num_tokens": 9781975.0,
+      "step": 8830
+    },
+    {
+      "epoch": 1.7811807374571833,
+      "grad_norm": 12.0,
+      "learning_rate": 8.126805023843107e-06,
+      "loss": 0.7841,
+      "mean_token_accuracy": 0.8054970562458038,
+      "num_tokens": 9793656.0,
+      "step": 8840
+    },
+    {
+      "epoch": 1.7831956477936732,
+      "grad_norm": 10.4375,
+      "learning_rate": 8.113372288266507e-06,
+      "loss": 0.8201,
+      "mean_token_accuracy": 0.7929071843624115,
+      "num_tokens": 9804717.0,
+      "step": 8850
+    },
+    {
+      "epoch": 1.7852105581301632,
+      "grad_norm": 13.1875,
+      "learning_rate": 8.099939552689906e-06,
+      "loss": 0.8184,
+      "mean_token_accuracy": 0.7929362654685974,
+      "num_tokens": 9815462.0,
+      "step": 8860
+    },
+    {
+      "epoch": 1.7872254684666533,
+      "grad_norm": 10.5,
+      "learning_rate": 8.086506817113305e-06,
+      "loss": 0.8737,
+      "mean_token_accuracy": 0.7850127279758453,
+      "num_tokens": 9827212.0,
+      "step": 8870
+    },
+    {
+      "epoch": 1.7892403788031432,
+      "grad_norm": 9.5625,
+      "learning_rate": 8.073074081536706e-06,
+      "loss": 0.8605,
+      "mean_token_accuracy": 0.7867051362991333,
+      "num_tokens": 9839656.0,
+      "step": 8880
+    },
+    {
+      "epoch": 1.7912552891396332,
+      "grad_norm": 10.375,
+      "learning_rate": 8.059641345960104e-06,
+      "loss": 0.8143,
+      "mean_token_accuracy": 0.8001775145530701,
+      "num_tokens": 9851032.0,
+      "step": 8890
+    },
+    {
+      "epoch": 1.7932701994761233,
+      "grad_norm": 11.5625,
+      "learning_rate": 8.046208610383505e-06,
+      "loss": 0.8651,
+      "mean_token_accuracy": 0.7894278347492218,
+      "num_tokens": 9863136.0,
+      "step": 8900
+    },
+    {
+      "epoch": 1.7952851098126135,
+      "grad_norm": 11.1875,
+      "learning_rate": 8.032775874806906e-06,
+      "loss": 0.7915,
+      "mean_token_accuracy": 0.8040505468845367,
+      "num_tokens": 9874065.0,
+      "step": 8910
+    },
+    {
+      "epoch": 1.7973000201491034,
+      "grad_norm": 11.5,
+      "learning_rate": 8.019343139230304e-06,
+      "loss": 0.7863,
+      "mean_token_accuracy": 0.804164183139801,
+      "num_tokens": 9884180.0,
+      "step": 8920
+    },
+    {
+      "epoch": 1.7993149304855933,
+      "grad_norm": 12.1875,
+      "learning_rate": 8.005910403653705e-06,
+      "loss": 0.7998,
+      "mean_token_accuracy": 0.8023830056190491,
+      "num_tokens": 9893817.0,
+      "step": 8930
+    },
+    {
+      "epoch": 1.8013298408220835,
+      "grad_norm": 12.5,
+      "learning_rate": 7.992477668077104e-06,
+      "loss": 0.8189,
+      "mean_token_accuracy": 0.8001566708087922,
+      "num_tokens": 9904272.0,
+      "step": 8940
+    },
+    {
+      "epoch": 1.8033447511585734,
+      "grad_norm": 10.3125,
+      "learning_rate": 7.979044932500504e-06,
+      "loss": 0.8199,
+      "mean_token_accuracy": 0.79620281457901,
+      "num_tokens": 9915875.0,
+      "step": 8950
+    },
+    {
+      "epoch": 1.8053596614950633,
+      "grad_norm": 9.3125,
+      "learning_rate": 7.965612196923905e-06,
+      "loss": 0.8058,
+      "mean_token_accuracy": 0.7988872945308685,
+      "num_tokens": 9927434.0,
+      "step": 8960
+    },
+    {
+      "epoch": 1.8073745718315535,
+      "grad_norm": 16.5,
+      "learning_rate": 7.952179461347304e-06,
+      "loss": 0.8912,
+      "mean_token_accuracy": 0.785044139623642,
+      "num_tokens": 9936917.0,
+      "step": 8970
+    },
+    {
+      "epoch": 1.8093894821680436,
+      "grad_norm": 9.5,
+      "learning_rate": 7.938746725770704e-06,
+      "loss": 0.73,
+      "mean_token_accuracy": 0.8187473714351654,
+      "num_tokens": 9947903.0,
+      "step": 8980
+    },
+    {
+      "epoch": 1.8114043925045336,
+      "grad_norm": 10.5625,
+      "learning_rate": 7.925313990194103e-06,
+      "loss": 0.7863,
+      "mean_token_accuracy": 0.8035355567932129,
+      "num_tokens": 9957362.0,
+      "step": 8990
+    },
+    {
+      "epoch": 1.8134193028410235,
+      "grad_norm": 11.8125,
+      "learning_rate": 7.911881254617504e-06,
+      "loss": 0.9173,
+      "mean_token_accuracy": 0.7758583545684814,
+      "num_tokens": 9969639.0,
+      "step": 9000
     }
   ],
   "logging_steps": 10,
 
@@ -7677,7 +8127,7 @@
       "attributes": {}
     }
   },
-  "total_flos": 1.1396175021686784e+16,
+  "total_flos": 1.2065001216479232e+16,
   "train_batch_size": 8,
   "trial_name": null,
   "trial_params": null