Training in progress, step 8000, checkpoint
Browse files
last-checkpoint/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 536223056
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:872e13706948c7a141e635bc023a52fbe531ae28f59acde5c4f237db2a94c6b1
|
| 3 |
size 536223056
|
last-checkpoint/optimizer.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1072594443
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:c368469d799ee657aa6f345b72b1b063d1207badee5ef2708584fc5b29dd1fa0
|
| 3 |
size 1072594443
|
last-checkpoint/scheduler.pt
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1465
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6f8c5daae46e22d0555f52515cb826d70a09c178d27140188b1fd68ded8645a9
|
| 3 |
size 1465
|
last-checkpoint/trainer_state.json
CHANGED
|
@@ -2,9 +2,9 @@
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
-
"epoch": 1.
|
| 6 |
"eval_steps": 500,
|
| 7 |
-
"global_step":
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
@@ -6758,6 +6758,456 @@
|
|
| 6758 |
"mean_token_accuracy": 0.8061196208000183,
|
| 6759 |
"num_tokens": 8312344.0,
|
| 6760 |
"step": 7500
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6761 |
}
|
| 6762 |
],
|
| 6763 |
"logging_steps": 10,
|
|
@@ -6777,7 +7227,7 @@
|
|
| 6777 |
"attributes": {}
|
| 6778 |
}
|
| 6779 |
},
|
| 6780 |
-
"total_flos": 1.
|
| 6781 |
"train_batch_size": 8,
|
| 6782 |
"trial_name": null,
|
| 6783 |
"trial_params": null
|
|
|
|
| 2 |
"best_global_step": null,
|
| 3 |
"best_metric": null,
|
| 4 |
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 1.611928269192021,
|
| 6 |
"eval_steps": 500,
|
| 7 |
+
"global_step": 8000,
|
| 8 |
"is_hyper_param_search": false,
|
| 9 |
"is_local_process_zero": true,
|
| 10 |
"is_world_process_zero": true,
|
|
|
|
| 6758 |
"mean_token_accuracy": 0.8061196208000183,
|
| 6759 |
"num_tokens": 8312344.0,
|
| 6760 |
"step": 7500
|
| 6761 |
+
},
|
| 6762 |
+
{
|
| 6763 |
+
"epoch": 1.5131976627040098,
|
| 6764 |
+
"grad_norm": 12.5625,
|
| 6765 |
+
"learning_rate": 9.91335885553093e-06,
|
| 6766 |
+
"loss": 0.7742,
|
| 6767 |
+
"mean_token_accuracy": 0.8027086615562439,
|
| 6768 |
+
"num_tokens": 8323025.0,
|
| 6769 |
+
"step": 7510
|
| 6770 |
+
},
|
| 6771 |
+
{
|
| 6772 |
+
"epoch": 1.5152125730404997,
|
| 6773 |
+
"grad_norm": 12.5,
|
| 6774 |
+
"learning_rate": 9.89992611995433e-06,
|
| 6775 |
+
"loss": 0.7713,
|
| 6776 |
+
"mean_token_accuracy": 0.8100184202194214,
|
| 6777 |
+
"num_tokens": 8333331.0,
|
| 6778 |
+
"step": 7520
|
| 6779 |
+
},
|
| 6780 |
+
{
|
| 6781 |
+
"epoch": 1.5172274833769896,
|
| 6782 |
+
"grad_norm": 12.8125,
|
| 6783 |
+
"learning_rate": 9.886493384377729e-06,
|
| 6784 |
+
"loss": 0.8154,
|
| 6785 |
+
"mean_token_accuracy": 0.7895949363708497,
|
| 6786 |
+
"num_tokens": 8344471.0,
|
| 6787 |
+
"step": 7530
|
| 6788 |
+
},
|
| 6789 |
+
{
|
| 6790 |
+
"epoch": 1.5192423937134798,
|
| 6791 |
+
"grad_norm": 12.3125,
|
| 6792 |
+
"learning_rate": 9.87306064880113e-06,
|
| 6793 |
+
"loss": 0.8998,
|
| 6794 |
+
"mean_token_accuracy": 0.7804327428340911,
|
| 6795 |
+
"num_tokens": 8356442.0,
|
| 6796 |
+
"step": 7540
|
| 6797 |
+
},
|
| 6798 |
+
{
|
| 6799 |
+
"epoch": 1.5212573040499697,
|
| 6800 |
+
"grad_norm": 11.9375,
|
| 6801 |
+
"learning_rate": 9.859627913224528e-06,
|
| 6802 |
+
"loss": 0.8346,
|
| 6803 |
+
"mean_token_accuracy": 0.7952861070632935,
|
| 6804 |
+
"num_tokens": 8367610.0,
|
| 6805 |
+
"step": 7550
|
| 6806 |
+
},
|
| 6807 |
+
{
|
| 6808 |
+
"epoch": 1.5232722143864597,
|
| 6809 |
+
"grad_norm": 11.6875,
|
| 6810 |
+
"learning_rate": 9.846195177647929e-06,
|
| 6811 |
+
"loss": 0.8045,
|
| 6812 |
+
"mean_token_accuracy": 0.8029259443283081,
|
| 6813 |
+
"num_tokens": 8378828.0,
|
| 6814 |
+
"step": 7560
|
| 6815 |
+
},
|
| 6816 |
+
{
|
| 6817 |
+
"epoch": 1.5252871247229498,
|
| 6818 |
+
"grad_norm": 12.5625,
|
| 6819 |
+
"learning_rate": 9.832762442071328e-06,
|
| 6820 |
+
"loss": 0.7931,
|
| 6821 |
+
"mean_token_accuracy": 0.8047609508037568,
|
| 6822 |
+
"num_tokens": 8389283.0,
|
| 6823 |
+
"step": 7570
|
| 6824 |
+
},
|
| 6825 |
+
{
|
| 6826 |
+
"epoch": 1.52730203505944,
|
| 6827 |
+
"grad_norm": 11.375,
|
| 6828 |
+
"learning_rate": 9.819329706494728e-06,
|
| 6829 |
+
"loss": 0.8274,
|
| 6830 |
+
"mean_token_accuracy": 0.7971135258674622,
|
| 6831 |
+
"num_tokens": 8401265.0,
|
| 6832 |
+
"step": 7580
|
| 6833 |
+
},
|
| 6834 |
+
{
|
| 6835 |
+
"epoch": 1.5293169453959299,
|
| 6836 |
+
"grad_norm": 13.3125,
|
| 6837 |
+
"learning_rate": 9.805896970918129e-06,
|
| 6838 |
+
"loss": 0.7739,
|
| 6839 |
+
"mean_token_accuracy": 0.803637433052063,
|
| 6840 |
+
"num_tokens": 8411843.0,
|
| 6841 |
+
"step": 7590
|
| 6842 |
+
},
|
| 6843 |
+
{
|
| 6844 |
+
"epoch": 1.5313318557324198,
|
| 6845 |
+
"grad_norm": 10.375,
|
| 6846 |
+
"learning_rate": 9.792464235341528e-06,
|
| 6847 |
+
"loss": 0.882,
|
| 6848 |
+
"mean_token_accuracy": 0.7834074079990387,
|
| 6849 |
+
"num_tokens": 8421967.0,
|
| 6850 |
+
"step": 7600
|
| 6851 |
+
},
|
| 6852 |
+
{
|
| 6853 |
+
"epoch": 1.53334676606891,
|
| 6854 |
+
"grad_norm": 11.9375,
|
| 6855 |
+
"learning_rate": 9.779031499764928e-06,
|
| 6856 |
+
"loss": 0.8852,
|
| 6857 |
+
"mean_token_accuracy": 0.7851302027702332,
|
| 6858 |
+
"num_tokens": 8435114.0,
|
| 6859 |
+
"step": 7610
|
| 6860 |
+
},
|
| 6861 |
+
{
|
| 6862 |
+
"epoch": 1.5353616764054,
|
| 6863 |
+
"grad_norm": 9.5625,
|
| 6864 |
+
"learning_rate": 9.765598764188327e-06,
|
| 6865 |
+
"loss": 0.7346,
|
| 6866 |
+
"mean_token_accuracy": 0.8161056697368622,
|
| 6867 |
+
"num_tokens": 8446359.0,
|
| 6868 |
+
"step": 7620
|
| 6869 |
+
},
|
| 6870 |
+
{
|
| 6871 |
+
"epoch": 1.53737658674189,
|
| 6872 |
+
"grad_norm": 12.25,
|
| 6873 |
+
"learning_rate": 9.752166028611728e-06,
|
| 6874 |
+
"loss": 0.828,
|
| 6875 |
+
"mean_token_accuracy": 0.7961056709289551,
|
| 6876 |
+
"num_tokens": 8456390.0,
|
| 6877 |
+
"step": 7630
|
| 6878 |
+
},
|
| 6879 |
+
{
|
| 6880 |
+
"epoch": 1.53939149707838,
|
| 6881 |
+
"grad_norm": 9.625,
|
| 6882 |
+
"learning_rate": 9.738733293035128e-06,
|
| 6883 |
+
"loss": 0.7805,
|
| 6884 |
+
"mean_token_accuracy": 0.8055654644966126,
|
| 6885 |
+
"num_tokens": 8467737.0,
|
| 6886 |
+
"step": 7640
|
| 6887 |
+
},
|
| 6888 |
+
{
|
| 6889 |
+
"epoch": 1.5414064074148701,
|
| 6890 |
+
"grad_norm": 11.5625,
|
| 6891 |
+
"learning_rate": 9.725300557458527e-06,
|
| 6892 |
+
"loss": 0.784,
|
| 6893 |
+
"mean_token_accuracy": 0.8105962395668029,
|
| 6894 |
+
"num_tokens": 8478450.0,
|
| 6895 |
+
"step": 7650
|
| 6896 |
+
},
|
| 6897 |
+
{
|
| 6898 |
+
"epoch": 1.54342131775136,
|
| 6899 |
+
"grad_norm": 10.625,
|
| 6900 |
+
"learning_rate": 9.711867821881928e-06,
|
| 6901 |
+
"loss": 0.8922,
|
| 6902 |
+
"mean_token_accuracy": 0.7858581006526947,
|
| 6903 |
+
"num_tokens": 8489331.0,
|
| 6904 |
+
"step": 7660
|
| 6905 |
+
},
|
| 6906 |
+
{
|
| 6907 |
+
"epoch": 1.54543622808785,
|
| 6908 |
+
"grad_norm": 11.625,
|
| 6909 |
+
"learning_rate": 9.698435086305326e-06,
|
| 6910 |
+
"loss": 0.7907,
|
| 6911 |
+
"mean_token_accuracy": 0.8015705049037933,
|
| 6912 |
+
"num_tokens": 8499569.0,
|
| 6913 |
+
"step": 7670
|
| 6914 |
+
},
|
| 6915 |
+
{
|
| 6916 |
+
"epoch": 1.5474511384243401,
|
| 6917 |
+
"grad_norm": 13.9375,
|
| 6918 |
+
"learning_rate": 9.685002350728727e-06,
|
| 6919 |
+
"loss": 0.9439,
|
| 6920 |
+
"mean_token_accuracy": 0.7698397815227509,
|
| 6921 |
+
"num_tokens": 8510367.0,
|
| 6922 |
+
"step": 7680
|
| 6923 |
+
},
|
| 6924 |
+
{
|
| 6925 |
+
"epoch": 1.5494660487608303,
|
| 6926 |
+
"grad_norm": 11.5,
|
| 6927 |
+
"learning_rate": 9.671569615152127e-06,
|
| 6928 |
+
"loss": 0.7814,
|
| 6929 |
+
"mean_token_accuracy": 0.8014598250389099,
|
| 6930 |
+
"num_tokens": 8521247.0,
|
| 6931 |
+
"step": 7690
|
| 6932 |
+
},
|
| 6933 |
+
{
|
| 6934 |
+
"epoch": 1.5514809590973202,
|
| 6935 |
+
"grad_norm": 11.25,
|
| 6936 |
+
"learning_rate": 9.658136879575526e-06,
|
| 6937 |
+
"loss": 0.7568,
|
| 6938 |
+
"mean_token_accuracy": 0.8163803517818451,
|
| 6939 |
+
"num_tokens": 8532047.0,
|
| 6940 |
+
"step": 7700
|
| 6941 |
+
},
|
| 6942 |
+
{
|
| 6943 |
+
"epoch": 1.5534958694338101,
|
| 6944 |
+
"grad_norm": 11.6875,
|
| 6945 |
+
"learning_rate": 9.644704143998927e-06,
|
| 6946 |
+
"loss": 0.7684,
|
| 6947 |
+
"mean_token_accuracy": 0.8017966628074646,
|
| 6948 |
+
"num_tokens": 8543022.0,
|
| 6949 |
+
"step": 7710
|
| 6950 |
+
},
|
| 6951 |
+
{
|
| 6952 |
+
"epoch": 1.5555107797703003,
|
| 6953 |
+
"grad_norm": 11.1875,
|
| 6954 |
+
"learning_rate": 9.631271408422326e-06,
|
| 6955 |
+
"loss": 0.7742,
|
| 6956 |
+
"mean_token_accuracy": 0.8069942653179168,
|
| 6957 |
+
"num_tokens": 8554086.0,
|
| 6958 |
+
"step": 7720
|
| 6959 |
+
},
|
| 6960 |
+
{
|
| 6961 |
+
"epoch": 1.5575256901067902,
|
| 6962 |
+
"grad_norm": 10.9375,
|
| 6963 |
+
"learning_rate": 9.617838672845726e-06,
|
| 6964 |
+
"loss": 0.8395,
|
| 6965 |
+
"mean_token_accuracy": 0.7957546770572662,
|
| 6966 |
+
"num_tokens": 8565626.0,
|
| 6967 |
+
"step": 7730
|
| 6968 |
+
},
|
| 6969 |
+
{
|
| 6970 |
+
"epoch": 1.5595406004432801,
|
| 6971 |
+
"grad_norm": 13.0625,
|
| 6972 |
+
"learning_rate": 9.604405937269125e-06,
|
| 6973 |
+
"loss": 0.7229,
|
| 6974 |
+
"mean_token_accuracy": 0.8145296096801757,
|
| 6975 |
+
"num_tokens": 8576046.0,
|
| 6976 |
+
"step": 7740
|
| 6977 |
+
},
|
| 6978 |
+
{
|
| 6979 |
+
"epoch": 1.5615555107797703,
|
| 6980 |
+
"grad_norm": 10.3125,
|
| 6981 |
+
"learning_rate": 9.590973201692525e-06,
|
| 6982 |
+
"loss": 0.8449,
|
| 6983 |
+
"mean_token_accuracy": 0.793234920501709,
|
| 6984 |
+
"num_tokens": 8586936.0,
|
| 6985 |
+
"step": 7750
|
| 6986 |
+
},
|
| 6987 |
+
{
|
| 6988 |
+
"epoch": 1.5635704211162604,
|
| 6989 |
+
"grad_norm": 14.125,
|
| 6990 |
+
"learning_rate": 9.577540466115926e-06,
|
| 6991 |
+
"loss": 0.8077,
|
| 6992 |
+
"mean_token_accuracy": 0.7942093849182129,
|
| 6993 |
+
"num_tokens": 8599134.0,
|
| 6994 |
+
"step": 7760
|
| 6995 |
+
},
|
| 6996 |
+
{
|
| 6997 |
+
"epoch": 1.5655853314527504,
|
| 6998 |
+
"grad_norm": 12.5,
|
| 6999 |
+
"learning_rate": 9.564107730539325e-06,
|
| 7000 |
+
"loss": 0.7583,
|
| 7001 |
+
"mean_token_accuracy": 0.8089915156364441,
|
| 7002 |
+
"num_tokens": 8609584.0,
|
| 7003 |
+
"step": 7770
|
| 7004 |
+
},
|
| 7005 |
+
{
|
| 7006 |
+
"epoch": 1.5676002417892403,
|
| 7007 |
+
"grad_norm": 11.1875,
|
| 7008 |
+
"learning_rate": 9.550674994962725e-06,
|
| 7009 |
+
"loss": 0.7924,
|
| 7010 |
+
"mean_token_accuracy": 0.804536098241806,
|
| 7011 |
+
"num_tokens": 8621578.0,
|
| 7012 |
+
"step": 7780
|
| 7013 |
+
},
|
| 7014 |
+
{
|
| 7015 |
+
"epoch": 1.5696151521257304,
|
| 7016 |
+
"grad_norm": 13.5625,
|
| 7017 |
+
"learning_rate": 9.537242259386124e-06,
|
| 7018 |
+
"loss": 0.7905,
|
| 7019 |
+
"mean_token_accuracy": 0.798646092414856,
|
| 7020 |
+
"num_tokens": 8632953.0,
|
| 7021 |
+
"step": 7790
|
| 7022 |
+
},
|
| 7023 |
+
{
|
| 7024 |
+
"epoch": 1.5716300624622206,
|
| 7025 |
+
"grad_norm": 11.125,
|
| 7026 |
+
"learning_rate": 9.523809523809525e-06,
|
| 7027 |
+
"loss": 0.7543,
|
| 7028 |
+
"mean_token_accuracy": 0.8105603516101837,
|
| 7029 |
+
"num_tokens": 8643817.0,
|
| 7030 |
+
"step": 7800
|
| 7031 |
+
},
|
| 7032 |
+
{
|
| 7033 |
+
"epoch": 1.5736449727987103,
|
| 7034 |
+
"grad_norm": 10.75,
|
| 7035 |
+
"learning_rate": 9.510376788232925e-06,
|
| 7036 |
+
"loss": 0.8613,
|
| 7037 |
+
"mean_token_accuracy": 0.7865113198757172,
|
| 7038 |
+
"num_tokens": 8654921.0,
|
| 7039 |
+
"step": 7810
|
| 7040 |
+
},
|
| 7041 |
+
{
|
| 7042 |
+
"epoch": 1.5756598831352004,
|
| 7043 |
+
"grad_norm": 13.375,
|
| 7044 |
+
"learning_rate": 9.496944052656324e-06,
|
| 7045 |
+
"loss": 0.7682,
|
| 7046 |
+
"mean_token_accuracy": 0.8063505351543426,
|
| 7047 |
+
"num_tokens": 8664722.0,
|
| 7048 |
+
"step": 7820
|
| 7049 |
+
},
|
| 7050 |
+
{
|
| 7051 |
+
"epoch": 1.5776747934716906,
|
| 7052 |
+
"grad_norm": 13.125,
|
| 7053 |
+
"learning_rate": 9.483511317079725e-06,
|
| 7054 |
+
"loss": 0.8011,
|
| 7055 |
+
"mean_token_accuracy": 0.8007908463478088,
|
| 7056 |
+
"num_tokens": 8675437.0,
|
| 7057 |
+
"step": 7830
|
| 7058 |
+
},
|
| 7059 |
+
{
|
| 7060 |
+
"epoch": 1.5796897038081805,
|
| 7061 |
+
"grad_norm": 15.3125,
|
| 7062 |
+
"learning_rate": 9.470078581503123e-06,
|
| 7063 |
+
"loss": 0.769,
|
| 7064 |
+
"mean_token_accuracy": 0.8038370370864868,
|
| 7065 |
+
"num_tokens": 8685254.0,
|
| 7066 |
+
"step": 7840
|
| 7067 |
+
},
|
| 7068 |
+
{
|
| 7069 |
+
"epoch": 1.5817046141446705,
|
| 7070 |
+
"grad_norm": 12.875,
|
| 7071 |
+
"learning_rate": 9.456645845926524e-06,
|
| 7072 |
+
"loss": 0.8023,
|
| 7073 |
+
"mean_token_accuracy": 0.8047023892402649,
|
| 7074 |
+
"num_tokens": 8695435.0,
|
| 7075 |
+
"step": 7850
|
| 7076 |
+
},
|
| 7077 |
+
{
|
| 7078 |
+
"epoch": 1.5837195244811606,
|
| 7079 |
+
"grad_norm": 12.3125,
|
| 7080 |
+
"learning_rate": 9.443213110349923e-06,
|
| 7081 |
+
"loss": 0.7938,
|
| 7082 |
+
"mean_token_accuracy": 0.7964716255664825,
|
| 7083 |
+
"num_tokens": 8706838.0,
|
| 7084 |
+
"step": 7860
|
| 7085 |
+
},
|
| 7086 |
+
{
|
| 7087 |
+
"epoch": 1.5857344348176508,
|
| 7088 |
+
"grad_norm": 10.875,
|
| 7089 |
+
"learning_rate": 9.429780374773323e-06,
|
| 7090 |
+
"loss": 0.8388,
|
| 7091 |
+
"mean_token_accuracy": 0.7962932288646698,
|
| 7092 |
+
"num_tokens": 8718011.0,
|
| 7093 |
+
"step": 7870
|
| 7094 |
+
},
|
| 7095 |
+
{
|
| 7096 |
+
"epoch": 1.5877493451541407,
|
| 7097 |
+
"grad_norm": 9.75,
|
| 7098 |
+
"learning_rate": 9.416347639196724e-06,
|
| 7099 |
+
"loss": 0.8319,
|
| 7100 |
+
"mean_token_accuracy": 0.788075852394104,
|
| 7101 |
+
"num_tokens": 8729277.0,
|
| 7102 |
+
"step": 7880
|
| 7103 |
+
},
|
| 7104 |
+
{
|
| 7105 |
+
"epoch": 1.5897642554906306,
|
| 7106 |
+
"grad_norm": 10.75,
|
| 7107 |
+
"learning_rate": 9.402914903620123e-06,
|
| 7108 |
+
"loss": 0.751,
|
| 7109 |
+
"mean_token_accuracy": 0.8099392414093017,
|
| 7110 |
+
"num_tokens": 8739674.0,
|
| 7111 |
+
"step": 7890
|
| 7112 |
+
},
|
| 7113 |
+
{
|
| 7114 |
+
"epoch": 1.5917791658271208,
|
| 7115 |
+
"grad_norm": 9.9375,
|
| 7116 |
+
"learning_rate": 9.389482168043523e-06,
|
| 7117 |
+
"loss": 0.7676,
|
| 7118 |
+
"mean_token_accuracy": 0.8102536201477051,
|
| 7119 |
+
"num_tokens": 8750307.0,
|
| 7120 |
+
"step": 7900
|
| 7121 |
+
},
|
| 7122 |
+
{
|
| 7123 |
+
"epoch": 1.5937940761636107,
|
| 7124 |
+
"grad_norm": 8.8125,
|
| 7125 |
+
"learning_rate": 9.376049432466922e-06,
|
| 7126 |
+
"loss": 0.7677,
|
| 7127 |
+
"mean_token_accuracy": 0.8090816259384155,
|
| 7128 |
+
"num_tokens": 8760932.0,
|
| 7129 |
+
"step": 7910
|
| 7130 |
+
},
|
| 7131 |
+
{
|
| 7132 |
+
"epoch": 1.5958089865001006,
|
| 7133 |
+
"grad_norm": 11.8125,
|
| 7134 |
+
"learning_rate": 9.362616696890323e-06,
|
| 7135 |
+
"loss": 0.9654,
|
| 7136 |
+
"mean_token_accuracy": 0.7688835144042969,
|
| 7137 |
+
"num_tokens": 8772244.0,
|
| 7138 |
+
"step": 7920
|
| 7139 |
+
},
|
| 7140 |
+
{
|
| 7141 |
+
"epoch": 1.5978238968365908,
|
| 7142 |
+
"grad_norm": 10.4375,
|
| 7143 |
+
"learning_rate": 9.349183961313723e-06,
|
| 7144 |
+
"loss": 0.7429,
|
| 7145 |
+
"mean_token_accuracy": 0.8144657909870148,
|
| 7146 |
+
"num_tokens": 8783351.0,
|
| 7147 |
+
"step": 7930
|
| 7148 |
+
},
|
| 7149 |
+
{
|
| 7150 |
+
"epoch": 1.599838807173081,
|
| 7151 |
+
"grad_norm": 11.5625,
|
| 7152 |
+
"learning_rate": 9.335751225737122e-06,
|
| 7153 |
+
"loss": 0.82,
|
| 7154 |
+
"mean_token_accuracy": 0.7950020253658294,
|
| 7155 |
+
"num_tokens": 8793990.0,
|
| 7156 |
+
"step": 7940
|
| 7157 |
+
},
|
| 7158 |
+
{
|
| 7159 |
+
"epoch": 1.6018537175095708,
|
| 7160 |
+
"grad_norm": 13.0625,
|
| 7161 |
+
"learning_rate": 9.322318490160523e-06,
|
| 7162 |
+
"loss": 0.7849,
|
| 7163 |
+
"mean_token_accuracy": 0.8066163957118988,
|
| 7164 |
+
"num_tokens": 8804970.0,
|
| 7165 |
+
"step": 7950
|
| 7166 |
+
},
|
| 7167 |
+
{
|
| 7168 |
+
"epoch": 1.6038686278460608,
|
| 7169 |
+
"grad_norm": 11.75,
|
| 7170 |
+
"learning_rate": 9.308885754583921e-06,
|
| 7171 |
+
"loss": 0.8965,
|
| 7172 |
+
"mean_token_accuracy": 0.7794711530208588,
|
| 7173 |
+
"num_tokens": 8816123.0,
|
| 7174 |
+
"step": 7960
|
| 7175 |
+
},
|
| 7176 |
+
{
|
| 7177 |
+
"epoch": 1.605883538182551,
|
| 7178 |
+
"grad_norm": 11.9375,
|
| 7179 |
+
"learning_rate": 9.295453019007322e-06,
|
| 7180 |
+
"loss": 0.7398,
|
| 7181 |
+
"mean_token_accuracy": 0.8103044688701629,
|
| 7182 |
+
"num_tokens": 8826861.0,
|
| 7183 |
+
"step": 7970
|
| 7184 |
+
},
|
| 7185 |
+
{
|
| 7186 |
+
"epoch": 1.607898448519041,
|
| 7187 |
+
"grad_norm": 11.5625,
|
| 7188 |
+
"learning_rate": 9.282020283430722e-06,
|
| 7189 |
+
"loss": 0.7859,
|
| 7190 |
+
"mean_token_accuracy": 0.8065201163291931,
|
| 7191 |
+
"num_tokens": 8837870.0,
|
| 7192 |
+
"step": 7980
|
| 7193 |
+
},
|
| 7194 |
+
{
|
| 7195 |
+
"epoch": 1.6099133588555308,
|
| 7196 |
+
"grad_norm": 10.875,
|
| 7197 |
+
"learning_rate": 9.268587547854121e-06,
|
| 7198 |
+
"loss": 0.7387,
|
| 7199 |
+
"mean_token_accuracy": 0.8145683348178864,
|
| 7200 |
+
"num_tokens": 8848365.0,
|
| 7201 |
+
"step": 7990
|
| 7202 |
+
},
|
| 7203 |
+
{
|
| 7204 |
+
"epoch": 1.611928269192021,
|
| 7205 |
+
"grad_norm": 10.9375,
|
| 7206 |
+
"learning_rate": 9.255154812277522e-06,
|
| 7207 |
+
"loss": 0.9003,
|
| 7208 |
+
"mean_token_accuracy": 0.7778131783008575,
|
| 7209 |
+
"num_tokens": 8860114.0,
|
| 7210 |
+
"step": 8000
|
| 7211 |
}
|
| 7212 |
],
|
| 7213 |
"logging_steps": 10,
|
|
|
|
| 7227 |
"attributes": {}
|
| 7228 |
}
|
| 7229 |
},
|
| 7230 |
+
"total_flos": 1.0725865607073792e+16,
|
| 7231 |
"train_batch_size": 8,
|
| 7232 |
"trial_name": null,
|
| 7233 |
"trial_params": null
|