Batch upload part 8
Browse files- nl_tasks/exps/run_ex28/ft/adapter_config.json +18 -0
- nl_tasks/exps/run_ex28/ft/special_tokens_map.json +24 -0
- nl_tasks/exps/run_ex28/ft/tokenizer.json +0 -0
- nl_tasks/exps/run_ex28/ft/tokenizer.model +3 -0
- nl_tasks/exps/run_ex28/ft/tokenizer_config.json +43 -0
- nl_tasks/exps/run_ex28/ft2/adapter_config.json +18 -0
- nl_tasks/exps/run_ex28/ft2/adapter_model.bin +3 -0
- nl_tasks/exps/run_ex29/ft/adapter_config.json +18 -0
- nl_tasks/exps/run_ex29/ft/special_tokens_map.json +24 -0
- nl_tasks/exps/run_ex29/ft/tokenizer.json +0 -0
- nl_tasks/exps/run_ex29/ft/tokenizer.model +3 -0
- nl_tasks/exps/run_ex29/ft/tokenizer_config.json +43 -0
- nl_tasks/exps/run_ex29/ft2/adapter_config.json +18 -0
- nl_tasks/exps/run_ex29/ft2/adapter_model.bin +3 -0
- nl_tasks/exps/run_ex29/trainer_state.json +505 -0
- nl_tasks/exps/run_ex30/ft/adapter_config.json +18 -0
- nl_tasks/exps/run_ex30/ft/special_tokens_map.json +24 -0
- nl_tasks/exps/run_ex30/ft/tokenizer.json +0 -0
- nl_tasks/exps/run_ex30/ft/tokenizer.model +3 -0
- nl_tasks/exps/run_ex30/ft/tokenizer_config.json +43 -0
- nl_tasks/exps/run_ex30/ft2/adapter_config.json +18 -0
- nl_tasks/exps/run_ex30/ft2/adapter_model.bin +3 -0
- nl_tasks/exps/run_ex30/trainer_state.json +505 -0
- nl_tasks/exps/run_ex31/ft/adapter_config.json +18 -0
- nl_tasks/exps/run_ex31/ft/special_tokens_map.json +24 -0
- nl_tasks/exps/run_ex31/ft/tokenizer.json +0 -0
- nl_tasks/exps/run_ex31/ft/tokenizer.model +3 -0
- nl_tasks/exps/run_ex31/ft/tokenizer_config.json +43 -0
- nl_tasks/exps/run_ex31/ft2/adapter_config.json +18 -0
- nl_tasks/exps/run_ex31/ft2/adapter_model.bin +3 -0
- nl_tasks/exps/run_ex31/trainer_state.json +743 -0
- nl_tasks/exps/run_ex32/ft/adapter_config.json +18 -0
- nl_tasks/exps/run_ex32/ft/special_tokens_map.json +24 -0
- nl_tasks/exps/run_ex32/ft/tokenizer.json +0 -0
- nl_tasks/exps/run_ex32/ft/tokenizer.model +3 -0
- nl_tasks/exps/run_ex32/ft/tokenizer_config.json +43 -0
- nl_tasks/exps/run_ex32/ft2/adapter_config.json +18 -0
- nl_tasks/exps/run_ex32/ft2/adapter_model.bin +3 -0
- nl_tasks/exps/run_ex32/trainer_state.json +743 -0
- nl_tasks/exps/run_ex33/ft/adapter_config.json +18 -0
- nl_tasks/exps/run_ex33/ft/special_tokens_map.json +24 -0
- nl_tasks/exps/run_ex33/ft/tokenizer.json +0 -0
- nl_tasks/exps/run_ex33/ft/tokenizer.model +3 -0
- nl_tasks/exps/run_ex33/ft/tokenizer_config.json +43 -0
- nl_tasks/exps/run_ex33/ft2/adapter_config.json +18 -0
- nl_tasks/exps/run_ex33/ft2/adapter_model.bin +3 -0
- nl_tasks/exps/run_ex33/trainer_state.json +743 -0
- nl_tasks/exps/run_ex34/gsm8k.txt +1 -0
- nl_tasks/exps/run_ex34/math.txt +1 -0
- nl_tasks/exps/run_ex34/trainer_state.json +743 -0
nl_tasks/exps/run_ex28/ft/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": false,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"v_proj",
|
| 14 |
+
"q_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exps/run_ex28/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/exps/run_ex28/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/exps/run_ex28/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/exps/run_ex28/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/exps/run_ex28/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": true,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"v_proj",
|
| 14 |
+
"q_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exps/run_ex28/ft2/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4b2ff3c37a243e0a7907b8e6da8bde1c03c0404c3c881e0b71b1698879447d68
|
| 3 |
+
size 33602915
|
nl_tasks/exps/run_ex29/ft/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": false,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"v_proj",
|
| 14 |
+
"q_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exps/run_ex29/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/exps/run_ex29/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/exps/run_ex29/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/exps/run_ex29/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/exps/run_ex29/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": true,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"v_proj",
|
| 14 |
+
"q_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exps/run_ex29/ft2/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:84e3c739b20c3790118a8b7ea87a0218b5c9c9e771866690dea91b3c76edfd03
|
| 3 |
+
size 33602915
|
nl_tasks/exps/run_ex29/trainer_state.json
ADDED
|
@@ -0,0 +1,505 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 1668,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.02997601918465228,
|
| 14 |
+
"grad_norm": 0.26481908559799194,
|
| 15 |
+
"learning_rate": 0.000718562874251497,
|
| 16 |
+
"loss": 0.5019,
|
| 17 |
+
"step": 25
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.05995203836930456,
|
| 21 |
+
"grad_norm": 0.21658311784267426,
|
| 22 |
+
"learning_rate": 0.001467065868263473,
|
| 23 |
+
"loss": 0.3441,
|
| 24 |
+
"step": 50
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.08992805755395683,
|
| 28 |
+
"grad_norm": 0.4752499461174011,
|
| 29 |
+
"learning_rate": 0.002215568862275449,
|
| 30 |
+
"loss": 0.3298,
|
| 31 |
+
"step": 75
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.11990407673860912,
|
| 35 |
+
"grad_norm": 56.11571502685547,
|
| 36 |
+
"learning_rate": 0.002964071856287425,
|
| 37 |
+
"loss": 0.3863,
|
| 38 |
+
"step": 100
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.1498800959232614,
|
| 42 |
+
"grad_norm": 0.24988949298858643,
|
| 43 |
+
"learning_rate": 0.003712574850299401,
|
| 44 |
+
"loss": 0.3536,
|
| 45 |
+
"step": 125
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.17985611510791366,
|
| 49 |
+
"grad_norm": 0.23253102600574493,
|
| 50 |
+
"learning_rate": 0.004461077844311378,
|
| 51 |
+
"loss": 0.3441,
|
| 52 |
+
"step": 150
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.20983213429256595,
|
| 56 |
+
"grad_norm": 0.20779232680797577,
|
| 57 |
+
"learning_rate": 0.0049997316901074056,
|
| 58 |
+
"loss": 0.3304,
|
| 59 |
+
"step": 175
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.23980815347721823,
|
| 63 |
+
"grad_norm": 0.14326857030391693,
|
| 64 |
+
"learning_rate": 0.004994394866271345,
|
| 65 |
+
"loss": 0.3232,
|
| 66 |
+
"step": 200
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.2697841726618705,
|
| 70 |
+
"grad_norm": 0.1106962114572525,
|
| 71 |
+
"learning_rate": 0.004982230184254933,
|
| 72 |
+
"loss": 0.3079,
|
| 73 |
+
"step": 225
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.2997601918465228,
|
| 77 |
+
"grad_norm": 0.10388347506523132,
|
| 78 |
+
"learning_rate": 0.004963270942203842,
|
| 79 |
+
"loss": 0.2993,
|
| 80 |
+
"step": 250
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.32973621103117506,
|
| 84 |
+
"grad_norm": 0.10831473022699356,
|
| 85 |
+
"learning_rate": 0.004937569036879761,
|
| 86 |
+
"loss": 0.289,
|
| 87 |
+
"step": 275
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.3597122302158273,
|
| 91 |
+
"grad_norm": 0.10159999877214432,
|
| 92 |
+
"learning_rate": 0.004905194821604405,
|
| 93 |
+
"loss": 0.2792,
|
| 94 |
+
"step": 300
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 0.38968824940047964,
|
| 98 |
+
"grad_norm": 0.09414353221654892,
|
| 99 |
+
"learning_rate": 0.004866236913682755,
|
| 100 |
+
"loss": 0.2742,
|
| 101 |
+
"step": 325
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 0.4196642685851319,
|
| 105 |
+
"grad_norm": 0.08423851430416107,
|
| 106 |
+
"learning_rate": 0.004820801951832635,
|
| 107 |
+
"loss": 0.2746,
|
| 108 |
+
"step": 350
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 0.44964028776978415,
|
| 112 |
+
"grad_norm": 0.10220842808485031,
|
| 113 |
+
"learning_rate": 0.004769014304284648,
|
| 114 |
+
"loss": 0.2689,
|
| 115 |
+
"step": 375
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 0.47961630695443647,
|
| 119 |
+
"grad_norm": 0.07861992716789246,
|
| 120 |
+
"learning_rate": 0.0047110157283514545,
|
| 121 |
+
"loss": 0.2684,
|
| 122 |
+
"step": 400
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.5095923261390888,
|
| 126 |
+
"grad_norm": 0.09534072130918503,
|
| 127 |
+
"learning_rate": 0.004646964982398253,
|
| 128 |
+
"loss": 0.2748,
|
| 129 |
+
"step": 425
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 0.539568345323741,
|
| 133 |
+
"grad_norm": 0.06600063294172287,
|
| 134 |
+
"learning_rate": 0.0045770373912766265,
|
| 135 |
+
"loss": 0.2578,
|
| 136 |
+
"step": 450
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 0.5695443645083933,
|
| 140 |
+
"grad_norm": 0.08592315763235092,
|
| 141 |
+
"learning_rate": 0.004501424366411254,
|
| 142 |
+
"loss": 0.2567,
|
| 143 |
+
"step": 475
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 0.5995203836930456,
|
| 147 |
+
"grad_norm": 0.08367173373699188,
|
| 148 |
+
"learning_rate": 0.00442033288185318,
|
| 149 |
+
"loss": 0.2631,
|
| 150 |
+
"step": 500
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 0.6294964028776978,
|
| 154 |
+
"grad_norm": 0.08196345716714859,
|
| 155 |
+
"learning_rate": 0.004333984907733788,
|
| 156 |
+
"loss": 0.2505,
|
| 157 |
+
"step": 525
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 0.6594724220623501,
|
| 161 |
+
"grad_norm": 0.07102052867412567,
|
| 162 |
+
"learning_rate": 0.004242616802670323,
|
| 163 |
+
"loss": 0.2464,
|
| 164 |
+
"step": 550
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 0.6894484412470024,
|
| 168 |
+
"grad_norm": 0.07556530088186264,
|
| 169 |
+
"learning_rate": 0.00414647866678607,
|
| 170 |
+
"loss": 0.2542,
|
| 171 |
+
"step": 575
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 0.7194244604316546,
|
| 175 |
+
"grad_norm": 0.0706329271197319,
|
| 176 |
+
"learning_rate": 0.004045833657116195,
|
| 177 |
+
"loss": 0.2484,
|
| 178 |
+
"step": 600
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 0.749400479616307,
|
| 182 |
+
"grad_norm": 0.07402704656124115,
|
| 183 |
+
"learning_rate": 0.003940957267273149,
|
| 184 |
+
"loss": 0.2453,
|
| 185 |
+
"step": 625
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 0.7793764988009593,
|
| 189 |
+
"grad_norm": 0.06807030737400055,
|
| 190 |
+
"learning_rate": 0.0038321365733434,
|
| 191 |
+
"loss": 0.2431,
|
| 192 |
+
"step": 650
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 0.8093525179856115,
|
| 196 |
+
"grad_norm": 0.07543069124221802,
|
| 197 |
+
"learning_rate": 0.0037196694480796876,
|
| 198 |
+
"loss": 0.2497,
|
| 199 |
+
"step": 675
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 0.8393285371702638,
|
| 203 |
+
"grad_norm": 0.06862358748912811,
|
| 204 |
+
"learning_rate": 0.0036038637455397798,
|
| 205 |
+
"loss": 0.238,
|
| 206 |
+
"step": 700
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 0.8693045563549161,
|
| 210 |
+
"grad_norm": 0.09762419760227203,
|
| 211 |
+
"learning_rate": 0.0034850364584035876,
|
| 212 |
+
"loss": 0.2339,
|
| 213 |
+
"step": 725
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 0.8992805755395683,
|
| 217 |
+
"grad_norm": 0.0853116512298584,
|
| 218 |
+
"learning_rate": 0.0033635128502753193,
|
| 219 |
+
"loss": 0.241,
|
| 220 |
+
"step": 750
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 0.9292565947242206,
|
| 224 |
+
"grad_norm": 0.05775105208158493,
|
| 225 |
+
"learning_rate": 0.00323962556534579,
|
| 226 |
+
"loss": 0.2377,
|
| 227 |
+
"step": 775
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 0.9592326139088729,
|
| 231 |
+
"grad_norm": 0.06312242150306702,
|
| 232 |
+
"learning_rate": 0.003113713717851998,
|
| 233 |
+
"loss": 0.2371,
|
| 234 |
+
"step": 800
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 0.9892086330935251,
|
| 238 |
+
"grad_norm": 0.06418934464454651,
|
| 239 |
+
"learning_rate": 0.0029861219638263694,
|
| 240 |
+
"loss": 0.2313,
|
| 241 |
+
"step": 825
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"epoch": 1.0191846522781776,
|
| 245 |
+
"grad_norm": 0.06555480509996414,
|
| 246 |
+
"learning_rate": 0.002857199557676555,
|
| 247 |
+
"loss": 0.2148,
|
| 248 |
+
"step": 850
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"epoch": 1.0491606714628297,
|
| 252 |
+
"grad_norm": 0.061830855906009674,
|
| 253 |
+
"learning_rate": 0.00272729939617819,
|
| 254 |
+
"loss": 0.203,
|
| 255 |
+
"step": 875
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"epoch": 1.079136690647482,
|
| 259 |
+
"grad_norm": 0.07122394442558289,
|
| 260 |
+
"learning_rate": 0.002596777052497456,
|
| 261 |
+
"loss": 0.2041,
|
| 262 |
+
"step": 900
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 1.1091127098321343,
|
| 266 |
+
"grad_norm": 0.06675304472446442,
|
| 267 |
+
"learning_rate": 0.002465989802887632,
|
| 268 |
+
"loss": 0.21,
|
| 269 |
+
"step": 925
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"epoch": 1.1390887290167866,
|
| 273 |
+
"grad_norm": 0.06000453978776932,
|
| 274 |
+
"learning_rate": 0.0023352956487238063,
|
| 275 |
+
"loss": 0.2003,
|
| 276 |
+
"step": 950
|
| 277 |
+
},
|
| 278 |
+
{
|
| 279 |
+
"epoch": 1.169064748201439,
|
| 280 |
+
"grad_norm": 0.05904003605246544,
|
| 281 |
+
"learning_rate": 0.002205052336552725,
|
| 282 |
+
"loss": 0.2035,
|
| 283 |
+
"step": 975
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"epoch": 1.1990407673860912,
|
| 287 |
+
"grad_norm": 0.07205251604318619,
|
| 288 |
+
"learning_rate": 0.0020756163788401825,
|
| 289 |
+
"loss": 0.205,
|
| 290 |
+
"step": 1000
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 1.2290167865707433,
|
| 294 |
+
"grad_norm": 0.06704974919557571,
|
| 295 |
+
"learning_rate": 0.0019473420780964405,
|
| 296 |
+
"loss": 0.2069,
|
| 297 |
+
"step": 1025
|
| 298 |
+
},
|
| 299 |
+
{
|
| 300 |
+
"epoch": 1.2589928057553956,
|
| 301 |
+
"grad_norm": 0.060501646250486374,
|
| 302 |
+
"learning_rate": 0.0018205805570509052,
|
| 303 |
+
"loss": 0.198,
|
| 304 |
+
"step": 1050
|
| 305 |
+
},
|
| 306 |
+
{
|
| 307 |
+
"epoch": 1.288968824940048,
|
| 308 |
+
"grad_norm": 0.05758596956729889,
|
| 309 |
+
"learning_rate": 0.0016956787975307614,
|
| 310 |
+
"loss": 0.1917,
|
| 311 |
+
"step": 1075
|
| 312 |
+
},
|
| 313 |
+
{
|
| 314 |
+
"epoch": 1.3189448441247003,
|
| 315 |
+
"grad_norm": 0.05682109296321869,
|
| 316 |
+
"learning_rate": 0.0015729786906744237,
|
| 317 |
+
"loss": 0.1914,
|
| 318 |
+
"step": 1100
|
| 319 |
+
},
|
| 320 |
+
{
|
| 321 |
+
"epoch": 1.3489208633093526,
|
| 322 |
+
"grad_norm": 0.06109858676791191,
|
| 323 |
+
"learning_rate": 0.0014528161010796171,
|
| 324 |
+
"loss": 0.196,
|
| 325 |
+
"step": 1125
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"epoch": 1.3788968824940047,
|
| 329 |
+
"grad_norm": 0.06597461551427841,
|
| 330 |
+
"learning_rate": 0.0013355199474478,
|
| 331 |
+
"loss": 0.1897,
|
| 332 |
+
"step": 1150
|
| 333 |
+
},
|
| 334 |
+
{
|
| 335 |
+
"epoch": 1.4088729016786572,
|
| 336 |
+
"grad_norm": 0.060266848653554916,
|
| 337 |
+
"learning_rate": 0.0012214113022414447,
|
| 338 |
+
"loss": 0.1965,
|
| 339 |
+
"step": 1175
|
| 340 |
+
},
|
| 341 |
+
{
|
| 342 |
+
"epoch": 1.4388489208633093,
|
| 343 |
+
"grad_norm": 0.05543503537774086,
|
| 344 |
+
"learning_rate": 0.0011108025128186872,
|
| 345 |
+
"loss": 0.1816,
|
| 346 |
+
"step": 1200
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"epoch": 1.4688249400479616,
|
| 350 |
+
"grad_norm": 0.06788609176874161,
|
| 351 |
+
"learning_rate": 0.001003996346451016,
|
| 352 |
+
"loss": 0.1887,
|
| 353 |
+
"step": 1225
|
| 354 |
+
},
|
| 355 |
+
{
|
| 356 |
+
"epoch": 1.498800959232614,
|
| 357 |
+
"grad_norm": 0.05910054221749306,
|
| 358 |
+
"learning_rate": 0.0009012851615643594,
|
| 359 |
+
"loss": 0.1916,
|
| 360 |
+
"step": 1250
|
| 361 |
+
},
|
| 362 |
+
{
|
| 363 |
+
"epoch": 1.5287769784172662,
|
| 364 |
+
"grad_norm": 0.06214448809623718,
|
| 365 |
+
"learning_rate": 0.0008029501074720933,
|
| 366 |
+
"loss": 0.1897,
|
| 367 |
+
"step": 1275
|
| 368 |
+
},
|
| 369 |
+
{
|
| 370 |
+
"epoch": 1.5587529976019185,
|
| 371 |
+
"grad_norm": 0.05667509138584137,
|
| 372 |
+
"learning_rate": 0.0007092603547905377,
|
| 373 |
+
"loss": 0.1823,
|
| 374 |
+
"step": 1300
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"epoch": 1.5887290167865706,
|
| 378 |
+
"grad_norm": 0.0649266168475151,
|
| 379 |
+
"learning_rate": 0.000620472358643503,
|
| 380 |
+
"loss": 0.1877,
|
| 381 |
+
"step": 1325
|
| 382 |
+
},
|
| 383 |
+
{
|
| 384 |
+
"epoch": 1.6187050359712232,
|
| 385 |
+
"grad_norm": 0.054551344364881516,
|
| 386 |
+
"learning_rate": 0.000536829156672706,
|
| 387 |
+
"loss": 0.1821,
|
| 388 |
+
"step": 1350
|
| 389 |
+
},
|
| 390 |
+
{
|
| 391 |
+
"epoch": 1.6486810551558753,
|
| 392 |
+
"grad_norm": 0.060151200741529465,
|
| 393 |
+
"learning_rate": 0.00045855970377559676,
|
| 394 |
+
"loss": 0.188,
|
| 395 |
+
"step": 1375
|
| 396 |
+
},
|
| 397 |
+
{
|
| 398 |
+
"epoch": 1.6786570743405276,
|
| 399 |
+
"grad_norm": 0.05992837995290756,
|
| 400 |
+
"learning_rate": 0.00038587824539160486,
|
| 401 |
+
"loss": 0.185,
|
| 402 |
+
"step": 1400
|
| 403 |
+
},
|
| 404 |
+
{
|
| 405 |
+
"epoch": 1.70863309352518,
|
| 406 |
+
"grad_norm": 0.06002328544855118,
|
| 407 |
+
"learning_rate": 0.00031898373105229694,
|
| 408 |
+
"loss": 0.1823,
|
| 409 |
+
"step": 1425
|
| 410 |
+
},
|
| 411 |
+
{
|
| 412 |
+
"epoch": 1.738609112709832,
|
| 413 |
+
"grad_norm": 0.06145670637488365,
|
| 414 |
+
"learning_rate": 0.00025805926980072337,
|
| 415 |
+
"loss": 0.1877,
|
| 416 |
+
"step": 1450
|
| 417 |
+
},
|
| 418 |
+
{
|
| 419 |
+
"epoch": 1.7685851318944845,
|
| 420 |
+
"grad_norm": 0.051237791776657104,
|
| 421 |
+
"learning_rate": 0.00020327162897062267,
|
| 422 |
+
"loss": 0.1826,
|
| 423 |
+
"step": 1475
|
| 424 |
+
},
|
| 425 |
+
{
|
| 426 |
+
"epoch": 1.7985611510791366,
|
| 427 |
+
"grad_norm": 0.059376440942287445,
|
| 428 |
+
"learning_rate": 0.00015477077769746855,
|
| 429 |
+
"loss": 0.1837,
|
| 430 |
+
"step": 1500
|
| 431 |
+
},
|
| 432 |
+
{
|
| 433 |
+
"epoch": 1.828537170263789,
|
| 434 |
+
"grad_norm": 0.05060333386063576,
|
| 435 |
+
"learning_rate": 0.00011268947641089322,
|
| 436 |
+
"loss": 0.1786,
|
| 437 |
+
"step": 1525
|
| 438 |
+
},
|
| 439 |
+
{
|
| 440 |
+
"epoch": 1.8585131894484412,
|
| 441 |
+
"grad_norm": 0.06010892242193222,
|
| 442 |
+
"learning_rate": 7.714291343216635e-05,
|
| 443 |
+
"loss": 0.1854,
|
| 444 |
+
"step": 1550
|
| 445 |
+
},
|
| 446 |
+
{
|
| 447 |
+
"epoch": 1.8884892086330936,
|
| 448 |
+
"grad_norm": 0.06022082641720772,
|
| 449 |
+
"learning_rate": 4.822838967146054e-05,
|
| 450 |
+
"loss": 0.184,
|
| 451 |
+
"step": 1575
|
| 452 |
+
},
|
| 453 |
+
{
|
| 454 |
+
"epoch": 1.9184652278177459,
|
| 455 |
+
"grad_norm": 0.05504591017961502,
|
| 456 |
+
"learning_rate": 2.6025052287976248e-05,
|
| 457 |
+
"loss": 0.19,
|
| 458 |
+
"step": 1600
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"epoch": 1.948441247002398,
|
| 462 |
+
"grad_norm": 0.0550151988863945,
|
| 463 |
+
"learning_rate": 1.0593678041975475e-05,
|
| 464 |
+
"loss": 0.1808,
|
| 465 |
+
"step": 1625
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"epoch": 1.9784172661870505,
|
| 469 |
+
"grad_norm": 0.05710240826010704,
|
| 470 |
+
"learning_rate": 1.9765069317453923e-06,
|
| 471 |
+
"loss": 0.1844,
|
| 472 |
+
"step": 1650
|
| 473 |
+
},
|
| 474 |
+
{
|
| 475 |
+
"epoch": 2.0,
|
| 476 |
+
"step": 1668,
|
| 477 |
+
"total_flos": 1.62588235137024e+18,
|
| 478 |
+
"train_loss": 0.2374439179468498,
|
| 479 |
+
"train_runtime": 2227.387,
|
| 480 |
+
"train_samples_per_second": 35.917,
|
| 481 |
+
"train_steps_per_second": 0.749
|
| 482 |
+
}
|
| 483 |
+
],
|
| 484 |
+
"logging_steps": 25,
|
| 485 |
+
"max_steps": 1668,
|
| 486 |
+
"num_input_tokens_seen": 0,
|
| 487 |
+
"num_train_epochs": 2,
|
| 488 |
+
"save_steps": 0,
|
| 489 |
+
"stateful_callbacks": {
|
| 490 |
+
"TrainerControl": {
|
| 491 |
+
"args": {
|
| 492 |
+
"should_epoch_stop": false,
|
| 493 |
+
"should_evaluate": false,
|
| 494 |
+
"should_log": false,
|
| 495 |
+
"should_save": true,
|
| 496 |
+
"should_training_stop": true
|
| 497 |
+
},
|
| 498 |
+
"attributes": {}
|
| 499 |
+
}
|
| 500 |
+
},
|
| 501 |
+
"total_flos": 1.62588235137024e+18,
|
| 502 |
+
"train_batch_size": 48,
|
| 503 |
+
"trial_name": null,
|
| 504 |
+
"trial_params": null
|
| 505 |
+
}
|
nl_tasks/exps/run_ex30/ft/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": false,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exps/run_ex30/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/exps/run_ex30/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/exps/run_ex30/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/exps/run_ex30/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/exps/run_ex30/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": true,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exps/run_ex30/ft2/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:37abc3b7865aedfe138803a372ca6148e64aa7084b6ae523203860321f217145
|
| 3 |
+
size 33602915
|
nl_tasks/exps/run_ex30/trainer_state.json
ADDED
|
@@ -0,0 +1,505 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 2.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 1668,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.02997601918465228,
|
| 14 |
+
"grad_norm": 0.2027871459722519,
|
| 15 |
+
"learning_rate": 0.0001437125748502994,
|
| 16 |
+
"loss": 0.6037,
|
| 17 |
+
"step": 25
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.05995203836930456,
|
| 21 |
+
"grad_norm": 0.2463991791009903,
|
| 22 |
+
"learning_rate": 0.0002934131736526946,
|
| 23 |
+
"loss": 0.3853,
|
| 24 |
+
"step": 50
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.08992805755395683,
|
| 28 |
+
"grad_norm": 0.16277779638767242,
|
| 29 |
+
"learning_rate": 0.0004431137724550898,
|
| 30 |
+
"loss": 0.3367,
|
| 31 |
+
"step": 75
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.11990407673860912,
|
| 35 |
+
"grad_norm": 0.19866418838500977,
|
| 36 |
+
"learning_rate": 0.000592814371257485,
|
| 37 |
+
"loss": 0.3121,
|
| 38 |
+
"step": 100
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.1498800959232614,
|
| 42 |
+
"grad_norm": 0.1782834231853485,
|
| 43 |
+
"learning_rate": 0.0007425149700598802,
|
| 44 |
+
"loss": 0.3089,
|
| 45 |
+
"step": 125
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.17985611510791366,
|
| 49 |
+
"grad_norm": 0.19668474793434143,
|
| 50 |
+
"learning_rate": 0.0008922155688622756,
|
| 51 |
+
"loss": 0.2998,
|
| 52 |
+
"step": 150
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.20983213429256595,
|
| 56 |
+
"grad_norm": 0.20847776532173157,
|
| 57 |
+
"learning_rate": 0.000999946338021481,
|
| 58 |
+
"loss": 0.2978,
|
| 59 |
+
"step": 175
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.23980815347721823,
|
| 63 |
+
"grad_norm": 0.24161750078201294,
|
| 64 |
+
"learning_rate": 0.000998878973254269,
|
| 65 |
+
"loss": 0.304,
|
| 66 |
+
"step": 200
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.2697841726618705,
|
| 70 |
+
"grad_norm": 0.20520828664302826,
|
| 71 |
+
"learning_rate": 0.0009964460368509867,
|
| 72 |
+
"loss": 0.2982,
|
| 73 |
+
"step": 225
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.2997601918465228,
|
| 77 |
+
"grad_norm": 0.205276221036911,
|
| 78 |
+
"learning_rate": 0.0009926541884407686,
|
| 79 |
+
"loss": 0.2948,
|
| 80 |
+
"step": 250
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.32973621103117506,
|
| 84 |
+
"grad_norm": 0.1710120588541031,
|
| 85 |
+
"learning_rate": 0.000987513807375952,
|
| 86 |
+
"loss": 0.2866,
|
| 87 |
+
"step": 275
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.3597122302158273,
|
| 91 |
+
"grad_norm": 0.18962617218494415,
|
| 92 |
+
"learning_rate": 0.000981038964320881,
|
| 93 |
+
"loss": 0.2766,
|
| 94 |
+
"step": 300
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 0.38968824940047964,
|
| 98 |
+
"grad_norm": 0.19223880767822266,
|
| 99 |
+
"learning_rate": 0.0009732473827365509,
|
| 100 |
+
"loss": 0.2738,
|
| 101 |
+
"step": 325
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 0.4196642685851319,
|
| 105 |
+
"grad_norm": 0.17323505878448486,
|
| 106 |
+
"learning_rate": 0.0009641603903665269,
|
| 107 |
+
"loss": 0.2747,
|
| 108 |
+
"step": 350
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 0.44964028776978415,
|
| 112 |
+
"grad_norm": 0.2111186534166336,
|
| 113 |
+
"learning_rate": 0.0009538028608569297,
|
| 114 |
+
"loss": 0.2687,
|
| 115 |
+
"step": 375
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 0.47961630695443647,
|
| 119 |
+
"grad_norm": 0.16343681514263153,
|
| 120 |
+
"learning_rate": 0.0009422031456702909,
|
| 121 |
+
"loss": 0.2695,
|
| 122 |
+
"step": 400
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.5095923261390888,
|
| 126 |
+
"grad_norm": 0.166376531124115,
|
| 127 |
+
"learning_rate": 0.0009293929964796506,
|
| 128 |
+
"loss": 0.2764,
|
| 129 |
+
"step": 425
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 0.539568345323741,
|
| 133 |
+
"grad_norm": 0.15445727109909058,
|
| 134 |
+
"learning_rate": 0.0009154074782553252,
|
| 135 |
+
"loss": 0.2592,
|
| 136 |
+
"step": 450
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 0.5695443645083933,
|
| 140 |
+
"grad_norm": 0.19298841059207916,
|
| 141 |
+
"learning_rate": 0.0009002848732822509,
|
| 142 |
+
"loss": 0.2586,
|
| 143 |
+
"step": 475
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 0.5995203836930456,
|
| 147 |
+
"grad_norm": 0.15150733292102814,
|
| 148 |
+
"learning_rate": 0.0008840665763706359,
|
| 149 |
+
"loss": 0.2642,
|
| 150 |
+
"step": 500
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 0.6294964028776978,
|
| 154 |
+
"grad_norm": 0.1794758439064026,
|
| 155 |
+
"learning_rate": 0.0008667969815467577,
|
| 156 |
+
"loss": 0.2519,
|
| 157 |
+
"step": 525
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 0.6594724220623501,
|
| 161 |
+
"grad_norm": 0.17440396547317505,
|
| 162 |
+
"learning_rate": 0.0008485233605340645,
|
| 163 |
+
"loss": 0.2473,
|
| 164 |
+
"step": 550
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 0.6894484412470024,
|
| 168 |
+
"grad_norm": 0.1693456918001175,
|
| 169 |
+
"learning_rate": 0.000829295733357214,
|
| 170 |
+
"loss": 0.2554,
|
| 171 |
+
"step": 575
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 0.7194244604316546,
|
| 175 |
+
"grad_norm": 0.21234950423240662,
|
| 176 |
+
"learning_rate": 0.0008091667314232391,
|
| 177 |
+
"loss": 0.2509,
|
| 178 |
+
"step": 600
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 0.749400479616307,
|
| 182 |
+
"grad_norm": 0.16216659545898438,
|
| 183 |
+
"learning_rate": 0.0007881914534546298,
|
| 184 |
+
"loss": 0.2539,
|
| 185 |
+
"step": 625
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 0.7793764988009593,
|
| 189 |
+
"grad_norm": 0.1589777022600174,
|
| 190 |
+
"learning_rate": 0.00076642731466868,
|
| 191 |
+
"loss": 0.2478,
|
| 192 |
+
"step": 650
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 0.8093525179856115,
|
| 196 |
+
"grad_norm": 0.17090196907520294,
|
| 197 |
+
"learning_rate": 0.0007439338896159376,
|
| 198 |
+
"loss": 0.2526,
|
| 199 |
+
"step": 675
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 0.8393285371702638,
|
| 203 |
+
"grad_norm": 0.1454530507326126,
|
| 204 |
+
"learning_rate": 0.000720772749107956,
|
| 205 |
+
"loss": 0.2407,
|
| 206 |
+
"step": 700
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 0.8693045563549161,
|
| 210 |
+
"grad_norm": 0.1544404923915863,
|
| 211 |
+
"learning_rate": 0.0006970072916807175,
|
| 212 |
+
"loss": 0.2358,
|
| 213 |
+
"step": 725
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 0.8992805755395683,
|
| 217 |
+
"grad_norm": 0.15039412677288055,
|
| 218 |
+
"learning_rate": 0.0006727025700550639,
|
| 219 |
+
"loss": 0.2416,
|
| 220 |
+
"step": 750
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 0.9292565947242206,
|
| 224 |
+
"grad_norm": 0.13531458377838135,
|
| 225 |
+
"learning_rate": 0.000647925113069158,
|
| 226 |
+
"loss": 0.2396,
|
| 227 |
+
"step": 775
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 0.9592326139088729,
|
| 231 |
+
"grad_norm": 0.13535469770431519,
|
| 232 |
+
"learning_rate": 0.0006227427435703996,
|
| 233 |
+
"loss": 0.2382,
|
| 234 |
+
"step": 800
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 0.9892086330935251,
|
| 238 |
+
"grad_norm": 0.13635869324207306,
|
| 239 |
+
"learning_rate": 0.0005972243927652738,
|
| 240 |
+
"loss": 0.234,
|
| 241 |
+
"step": 825
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"epoch": 1.0191846522781776,
|
| 245 |
+
"grad_norm": 0.16282866895198822,
|
| 246 |
+
"learning_rate": 0.0005714399115353111,
|
| 247 |
+
"loss": 0.2181,
|
| 248 |
+
"step": 850
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"epoch": 1.0491606714628297,
|
| 252 |
+
"grad_norm": 0.15078669786453247,
|
| 253 |
+
"learning_rate": 0.0005454598792356381,
|
| 254 |
+
"loss": 0.2082,
|
| 255 |
+
"step": 875
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"epoch": 1.079136690647482,
|
| 259 |
+
"grad_norm": 0.14040178060531616,
|
| 260 |
+
"learning_rate": 0.0005193554104994912,
|
| 261 |
+
"loss": 0.2083,
|
| 262 |
+
"step": 900
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 1.1091127098321343,
|
| 266 |
+
"grad_norm": 0.14513766765594482,
|
| 267 |
+
"learning_rate": 0.0004931979605775264,
|
| 268 |
+
"loss": 0.2137,
|
| 269 |
+
"step": 925
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"epoch": 1.1390887290167866,
|
| 273 |
+
"grad_norm": 0.14192743599414825,
|
| 274 |
+
"learning_rate": 0.0004670591297447613,
|
| 275 |
+
"loss": 0.2039,
|
| 276 |
+
"step": 950
|
| 277 |
+
},
|
| 278 |
+
{
|
| 279 |
+
"epoch": 1.169064748201439,
|
| 280 |
+
"grad_norm": 0.14158278703689575,
|
| 281 |
+
"learning_rate": 0.00044101046731054495,
|
| 282 |
+
"loss": 0.2073,
|
| 283 |
+
"step": 975
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"epoch": 1.1990407673860912,
|
| 287 |
+
"grad_norm": 0.15080343186855316,
|
| 288 |
+
"learning_rate": 0.0004151232757680365,
|
| 289 |
+
"loss": 0.2089,
|
| 290 |
+
"step": 1000
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 1.2290167865707433,
|
| 294 |
+
"grad_norm": 0.16032980382442474,
|
| 295 |
+
"learning_rate": 0.0003894684156192881,
|
| 296 |
+
"loss": 0.2097,
|
| 297 |
+
"step": 1025
|
| 298 |
+
},
|
| 299 |
+
{
|
| 300 |
+
"epoch": 1.2589928057553956,
|
| 301 |
+
"grad_norm": 0.14257696270942688,
|
| 302 |
+
"learning_rate": 0.00036411611141018104,
|
| 303 |
+
"loss": 0.2013,
|
| 304 |
+
"step": 1050
|
| 305 |
+
},
|
| 306 |
+
{
|
| 307 |
+
"epoch": 1.288968824940048,
|
| 308 |
+
"grad_norm": 0.1491222381591797,
|
| 309 |
+
"learning_rate": 0.00033913575950615226,
|
| 310 |
+
"loss": 0.1949,
|
| 311 |
+
"step": 1075
|
| 312 |
+
},
|
| 313 |
+
{
|
| 314 |
+
"epoch": 1.3189448441247003,
|
| 315 |
+
"grad_norm": 0.13092097640037537,
|
| 316 |
+
"learning_rate": 0.00031459573813488474,
|
| 317 |
+
"loss": 0.1946,
|
| 318 |
+
"step": 1100
|
| 319 |
+
},
|
| 320 |
+
{
|
| 321 |
+
"epoch": 1.3489208633093526,
|
| 322 |
+
"grad_norm": 0.13922549784183502,
|
| 323 |
+
"learning_rate": 0.0002905632202159234,
|
| 324 |
+
"loss": 0.1991,
|
| 325 |
+
"step": 1125
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"epoch": 1.3788968824940047,
|
| 329 |
+
"grad_norm": 0.13861505687236786,
|
| 330 |
+
"learning_rate": 0.00026710398948956,
|
| 331 |
+
"loss": 0.1921,
|
| 332 |
+
"step": 1150
|
| 333 |
+
},
|
| 334 |
+
{
|
| 335 |
+
"epoch": 1.4088729016786572,
|
| 336 |
+
"grad_norm": 0.14462125301361084,
|
| 337 |
+
"learning_rate": 0.00024428226044828893,
|
| 338 |
+
"loss": 0.1992,
|
| 339 |
+
"step": 1175
|
| 340 |
+
},
|
| 341 |
+
{
|
| 342 |
+
"epoch": 1.4388489208633093,
|
| 343 |
+
"grad_norm": 0.13747504353523254,
|
| 344 |
+
"learning_rate": 0.00022216050256373743,
|
| 345 |
+
"loss": 0.1848,
|
| 346 |
+
"step": 1200
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"epoch": 1.4688249400479616,
|
| 350 |
+
"grad_norm": 0.1536317616701126,
|
| 351 |
+
"learning_rate": 0.00020079926929020321,
|
| 352 |
+
"loss": 0.1914,
|
| 353 |
+
"step": 1225
|
| 354 |
+
},
|
| 355 |
+
{
|
| 356 |
+
"epoch": 1.498800959232614,
|
| 357 |
+
"grad_norm": 0.1415477842092514,
|
| 358 |
+
"learning_rate": 0.00018025703231287188,
|
| 359 |
+
"loss": 0.1937,
|
| 360 |
+
"step": 1250
|
| 361 |
+
},
|
| 362 |
+
{
|
| 363 |
+
"epoch": 1.5287769784172662,
|
| 364 |
+
"grad_norm": 0.14675097167491913,
|
| 365 |
+
"learning_rate": 0.00016059002149441864,
|
| 366 |
+
"loss": 0.1934,
|
| 367 |
+
"step": 1275
|
| 368 |
+
},
|
| 369 |
+
{
|
| 370 |
+
"epoch": 1.5587529976019185,
|
| 371 |
+
"grad_norm": 0.13264699280261993,
|
| 372 |
+
"learning_rate": 0.00014185207095810754,
|
| 373 |
+
"loss": 0.1848,
|
| 374 |
+
"step": 1300
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"epoch": 1.5887290167865706,
|
| 378 |
+
"grad_norm": 0.15923435986042023,
|
| 379 |
+
"learning_rate": 0.00012409447172870058,
|
| 380 |
+
"loss": 0.1909,
|
| 381 |
+
"step": 1325
|
| 382 |
+
},
|
| 383 |
+
{
|
| 384 |
+
"epoch": 1.6187050359712232,
|
| 385 |
+
"grad_norm": 0.12699192762374878,
|
| 386 |
+
"learning_rate": 0.00010736583133454119,
|
| 387 |
+
"loss": 0.1853,
|
| 388 |
+
"step": 1350
|
| 389 |
+
},
|
| 390 |
+
{
|
| 391 |
+
"epoch": 1.6486810551558753,
|
| 392 |
+
"grad_norm": 0.14546607434749603,
|
| 393 |
+
"learning_rate": 9.171194075511934e-05,
|
| 394 |
+
"loss": 0.1919,
|
| 395 |
+
"step": 1375
|
| 396 |
+
},
|
| 397 |
+
{
|
| 398 |
+
"epoch": 1.6786570743405276,
|
| 399 |
+
"grad_norm": 0.14308422803878784,
|
| 400 |
+
"learning_rate": 7.717564907832098e-05,
|
| 401 |
+
"loss": 0.1886,
|
| 402 |
+
"step": 1400
|
| 403 |
+
},
|
| 404 |
+
{
|
| 405 |
+
"epoch": 1.70863309352518,
|
| 406 |
+
"grad_norm": 0.1330956369638443,
|
| 407 |
+
"learning_rate": 6.379674621045939e-05,
|
| 408 |
+
"loss": 0.1856,
|
| 409 |
+
"step": 1425
|
| 410 |
+
},
|
| 411 |
+
{
|
| 412 |
+
"epoch": 1.738609112709832,
|
| 413 |
+
"grad_norm": 0.14610905945301056,
|
| 414 |
+
"learning_rate": 5.1611853960144674e-05,
|
| 415 |
+
"loss": 0.1923,
|
| 416 |
+
"step": 1450
|
| 417 |
+
},
|
| 418 |
+
{
|
| 419 |
+
"epoch": 1.7685851318944845,
|
| 420 |
+
"grad_norm": 0.12671244144439697,
|
| 421 |
+
"learning_rate": 4.0654325794124535e-05,
|
| 422 |
+
"loss": 0.1853,
|
| 423 |
+
"step": 1475
|
| 424 |
+
},
|
| 425 |
+
{
|
| 426 |
+
"epoch": 1.7985611510791366,
|
| 427 |
+
"grad_norm": 0.14322331547737122,
|
| 428 |
+
"learning_rate": 3.095415553949371e-05,
|
| 429 |
+
"loss": 0.1868,
|
| 430 |
+
"step": 1500
|
| 431 |
+
},
|
| 432 |
+
{
|
| 433 |
+
"epoch": 1.828537170263789,
|
| 434 |
+
"grad_norm": 0.1249406635761261,
|
| 435 |
+
"learning_rate": 2.2537895282178645e-05,
|
| 436 |
+
"loss": 0.1829,
|
| 437 |
+
"step": 1525
|
| 438 |
+
},
|
| 439 |
+
{
|
| 440 |
+
"epoch": 1.8585131894484412,
|
| 441 |
+
"grad_norm": 0.1507108360528946,
|
| 442 |
+
"learning_rate": 1.542858268643327e-05,
|
| 443 |
+
"loss": 0.1888,
|
| 444 |
+
"step": 1550
|
| 445 |
+
},
|
| 446 |
+
{
|
| 447 |
+
"epoch": 1.8884892086330936,
|
| 448 |
+
"grad_norm": 0.13775067031383514,
|
| 449 |
+
"learning_rate": 9.645677934292108e-06,
|
| 450 |
+
"loss": 0.1881,
|
| 451 |
+
"step": 1575
|
| 452 |
+
},
|
| 453 |
+
{
|
| 454 |
+
"epoch": 1.9184652278177459,
|
| 455 |
+
"grad_norm": 0.13463319838047028,
|
| 456 |
+
"learning_rate": 5.205010457595249e-06,
|
| 457 |
+
"loss": 0.1937,
|
| 458 |
+
"step": 1600
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"epoch": 1.948441247002398,
|
| 462 |
+
"grad_norm": 0.13453663885593414,
|
| 463 |
+
"learning_rate": 2.118735608395095e-06,
|
| 464 |
+
"loss": 0.1846,
|
| 465 |
+
"step": 1625
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"epoch": 1.9784172661870505,
|
| 469 |
+
"grad_norm": 0.15357571840286255,
|
| 470 |
+
"learning_rate": 3.953013863490784e-07,
|
| 471 |
+
"loss": 0.1877,
|
| 472 |
+
"step": 1650
|
| 473 |
+
},
|
| 474 |
+
{
|
| 475 |
+
"epoch": 2.0,
|
| 476 |
+
"step": 1668,
|
| 477 |
+
"total_flos": 1.62588235137024e+18,
|
| 478 |
+
"train_loss": 0.23850378386980053,
|
| 479 |
+
"train_runtime": 2220.9851,
|
| 480 |
+
"train_samples_per_second": 36.02,
|
| 481 |
+
"train_steps_per_second": 0.751
|
| 482 |
+
}
|
| 483 |
+
],
|
| 484 |
+
"logging_steps": 25,
|
| 485 |
+
"max_steps": 1668,
|
| 486 |
+
"num_input_tokens_seen": 0,
|
| 487 |
+
"num_train_epochs": 2,
|
| 488 |
+
"save_steps": 0,
|
| 489 |
+
"stateful_callbacks": {
|
| 490 |
+
"TrainerControl": {
|
| 491 |
+
"args": {
|
| 492 |
+
"should_epoch_stop": false,
|
| 493 |
+
"should_evaluate": false,
|
| 494 |
+
"should_log": false,
|
| 495 |
+
"should_save": true,
|
| 496 |
+
"should_training_stop": true
|
| 497 |
+
},
|
| 498 |
+
"attributes": {}
|
| 499 |
+
}
|
| 500 |
+
},
|
| 501 |
+
"total_flos": 1.62588235137024e+18,
|
| 502 |
+
"train_batch_size": 48,
|
| 503 |
+
"trial_name": null,
|
| 504 |
+
"trial_params": null
|
| 505 |
+
}
|
nl_tasks/exps/run_ex31/ft/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": false,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"v_proj",
|
| 14 |
+
"q_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exps/run_ex31/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/exps/run_ex31/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/exps/run_ex31/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/exps/run_ex31/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/exps/run_ex31/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": true,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"v_proj",
|
| 14 |
+
"q_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exps/run_ex31/ft2/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a02d042f5103673b43fce0e75a90ffc2bbd4c2dd3f028a6db285cf34c732bb6f
|
| 3 |
+
size 33602915
|
nl_tasks/exps/run_ex31/trainer_state.json
ADDED
|
@@ -0,0 +1,743 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 2502,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.02997601918465228,
|
| 14 |
+
"grad_norm": 0.237616166472435,
|
| 15 |
+
"learning_rate": 0.00047808764940239046,
|
| 16 |
+
"loss": 0.523,
|
| 17 |
+
"step": 25
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.05995203836930456,
|
| 21 |
+
"grad_norm": 0.20422297716140747,
|
| 22 |
+
"learning_rate": 0.0009760956175298805,
|
| 23 |
+
"loss": 0.3493,
|
| 24 |
+
"step": 50
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.08992805755395683,
|
| 28 |
+
"grad_norm": 0.21873657405376434,
|
| 29 |
+
"learning_rate": 0.0014741035856573707,
|
| 30 |
+
"loss": 0.3229,
|
| 31 |
+
"step": 75
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.11990407673860912,
|
| 35 |
+
"grad_norm": 0.24540852010250092,
|
| 36 |
+
"learning_rate": 0.0019721115537848603,
|
| 37 |
+
"loss": 0.314,
|
| 38 |
+
"step": 100
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.1498800959232614,
|
| 42 |
+
"grad_norm": 1.2509855031967163,
|
| 43 |
+
"learning_rate": 0.002470119521912351,
|
| 44 |
+
"loss": 0.3362,
|
| 45 |
+
"step": 125
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.17985611510791366,
|
| 49 |
+
"grad_norm": 0.3144875168800354,
|
| 50 |
+
"learning_rate": 0.002968127490039841,
|
| 51 |
+
"loss": 0.3425,
|
| 52 |
+
"step": 150
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.20983213429256595,
|
| 56 |
+
"grad_norm": 0.3264140486717224,
|
| 57 |
+
"learning_rate": 0.003466135458167331,
|
| 58 |
+
"loss": 0.3266,
|
| 59 |
+
"step": 175
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.23980815347721823,
|
| 63 |
+
"grad_norm": 0.18573451042175293,
|
| 64 |
+
"learning_rate": 0.0039641434262948205,
|
| 65 |
+
"loss": 0.3298,
|
| 66 |
+
"step": 200
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.2697841726618705,
|
| 70 |
+
"grad_norm": 0.18408645689487457,
|
| 71 |
+
"learning_rate": 0.004462151394422311,
|
| 72 |
+
"loss": 0.3179,
|
| 73 |
+
"step": 225
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.2997601918465228,
|
| 77 |
+
"grad_norm": 0.15508218109607697,
|
| 78 |
+
"learning_rate": 0.0049601593625498005,
|
| 79 |
+
"loss": 0.3138,
|
| 80 |
+
"step": 250
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.32973621103117506,
|
| 84 |
+
"grad_norm": 0.12099787592887878,
|
| 85 |
+
"learning_rate": 0.004998712114810764,
|
| 86 |
+
"loss": 0.3034,
|
| 87 |
+
"step": 275
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.3597122302158273,
|
| 91 |
+
"grad_norm": 0.15490184724330902,
|
| 92 |
+
"learning_rate": 0.004994392376862353,
|
| 93 |
+
"loss": 0.2906,
|
| 94 |
+
"step": 300
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 0.38968824940047964,
|
| 98 |
+
"grad_norm": 0.12329553812742233,
|
| 99 |
+
"learning_rate": 0.004987036305323271,
|
| 100 |
+
"loss": 0.283,
|
| 101 |
+
"step": 325
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 0.4196642685851319,
|
| 105 |
+
"grad_norm": 0.1184345930814743,
|
| 106 |
+
"learning_rate": 0.0049766528544732515,
|
| 107 |
+
"loss": 0.2827,
|
| 108 |
+
"step": 350
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 0.44964028776978415,
|
| 112 |
+
"grad_norm": 0.11834505200386047,
|
| 113 |
+
"learning_rate": 0.00496325466371133,
|
| 114 |
+
"loss": 0.2732,
|
| 115 |
+
"step": 375
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 0.47961630695443647,
|
| 119 |
+
"grad_norm": 0.07786522805690765,
|
| 120 |
+
"learning_rate": 0.004946858042170361,
|
| 121 |
+
"loss": 0.2735,
|
| 122 |
+
"step": 400
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.5095923261390888,
|
| 126 |
+
"grad_norm": 0.08665332198143005,
|
| 127 |
+
"learning_rate": 0.0049274829488645,
|
| 128 |
+
"loss": 0.2795,
|
| 129 |
+
"step": 425
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 0.539568345323741,
|
| 133 |
+
"grad_norm": 0.07928116619586945,
|
| 134 |
+
"learning_rate": 0.004905152968393817,
|
| 135 |
+
"loss": 0.2609,
|
| 136 |
+
"step": 450
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 0.5695443645083933,
|
| 140 |
+
"grad_norm": 0.12693190574645996,
|
| 141 |
+
"learning_rate": 0.004879895282235616,
|
| 142 |
+
"loss": 0.2617,
|
| 143 |
+
"step": 475
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 0.5995203836930456,
|
| 147 |
+
"grad_norm": 0.07392635196447372,
|
| 148 |
+
"learning_rate": 0.0048517406356574115,
|
| 149 |
+
"loss": 0.2672,
|
| 150 |
+
"step": 500
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 0.6294964028776978,
|
| 154 |
+
"grad_norm": 0.091416135430336,
|
| 155 |
+
"learning_rate": 0.0048207233002918164,
|
| 156 |
+
"loss": 0.256,
|
| 157 |
+
"step": 525
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 0.6594724220623501,
|
| 161 |
+
"grad_norm": 0.08377746492624283,
|
| 162 |
+
"learning_rate": 0.004786881032418933,
|
| 163 |
+
"loss": 0.2511,
|
| 164 |
+
"step": 550
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 0.6894484412470024,
|
| 168 |
+
"grad_norm": 0.06915393471717834,
|
| 169 |
+
"learning_rate": 0.004750255027006994,
|
| 170 |
+
"loss": 0.2589,
|
| 171 |
+
"step": 575
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 0.7194244604316546,
|
| 175 |
+
"grad_norm": 0.07373099029064178,
|
| 176 |
+
"learning_rate": 0.004710889867567222,
|
| 177 |
+
"loss": 0.2518,
|
| 178 |
+
"step": 600
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 0.749400479616307,
|
| 182 |
+
"grad_norm": 0.06648170202970505,
|
| 183 |
+
"learning_rate": 0.004668833471883931,
|
| 184 |
+
"loss": 0.249,
|
| 185 |
+
"step": 625
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 0.7793764988009593,
|
| 189 |
+
"grad_norm": 0.06580448895692825,
|
| 190 |
+
"learning_rate": 0.0046241370336859424,
|
| 191 |
+
"loss": 0.2481,
|
| 192 |
+
"step": 650
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 0.8093525179856115,
|
| 196 |
+
"grad_norm": 0.07079949229955673,
|
| 197 |
+
"learning_rate": 0.004576854960330311,
|
| 198 |
+
"loss": 0.2543,
|
| 199 |
+
"step": 675
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 0.8393285371702638,
|
| 203 |
+
"grad_norm": 0.06271594017744064,
|
| 204 |
+
"learning_rate": 0.004527044806574219,
|
| 205 |
+
"loss": 0.2422,
|
| 206 |
+
"step": 700
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 0.8693045563549161,
|
| 210 |
+
"grad_norm": 0.0618261955678463,
|
| 211 |
+
"learning_rate": 0.004474767204515652,
|
| 212 |
+
"loss": 0.2386,
|
| 213 |
+
"step": 725
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 0.8992805755395683,
|
| 217 |
+
"grad_norm": 0.06375081837177277,
|
| 218 |
+
"learning_rate": 0.004420085789788137,
|
| 219 |
+
"loss": 0.2445,
|
| 220 |
+
"step": 750
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 0.9292565947242206,
|
| 224 |
+
"grad_norm": 0.05429168790578842,
|
| 225 |
+
"learning_rate": 0.0043630671240993905,
|
| 226 |
+
"loss": 0.2422,
|
| 227 |
+
"step": 775
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 0.9592326139088729,
|
| 231 |
+
"grad_norm": 0.05972912162542343,
|
| 232 |
+
"learning_rate": 0.0043037806142081645,
|
| 233 |
+
"loss": 0.2418,
|
| 234 |
+
"step": 800
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 0.9892086330935251,
|
| 238 |
+
"grad_norm": 0.05979093909263611,
|
| 239 |
+
"learning_rate": 0.004242298427437903,
|
| 240 |
+
"loss": 0.2361,
|
| 241 |
+
"step": 825
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"epoch": 1.0191846522781776,
|
| 245 |
+
"grad_norm": 0.07171288132667542,
|
| 246 |
+
"learning_rate": 0.00417869540383007,
|
| 247 |
+
"loss": 0.2221,
|
| 248 |
+
"step": 850
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"epoch": 1.0491606714628297,
|
| 252 |
+
"grad_norm": 0.06082647666335106,
|
| 253 |
+
"learning_rate": 0.0041130489650440805,
|
| 254 |
+
"loss": 0.211,
|
| 255 |
+
"step": 875
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"epoch": 1.079136690647482,
|
| 259 |
+
"grad_norm": 0.05869077146053314,
|
| 260 |
+
"learning_rate": 0.004045439020114715,
|
| 261 |
+
"loss": 0.2123,
|
| 262 |
+
"step": 900
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 1.1091127098321343,
|
| 266 |
+
"grad_norm": 0.059404339641332626,
|
| 267 |
+
"learning_rate": 0.003975947868181739,
|
| 268 |
+
"loss": 0.2193,
|
| 269 |
+
"step": 925
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"epoch": 1.1390887290167866,
|
| 273 |
+
"grad_norm": 0.05453066527843475,
|
| 274 |
+
"learning_rate": 0.0039046600983101355,
|
| 275 |
+
"loss": 0.2105,
|
| 276 |
+
"step": 950
|
| 277 |
+
},
|
| 278 |
+
{
|
| 279 |
+
"epoch": 1.169064748201439,
|
| 280 |
+
"grad_norm": 0.05172204226255417,
|
| 281 |
+
"learning_rate": 0.0038316624865229088,
|
| 282 |
+
"loss": 0.2142,
|
| 283 |
+
"step": 975
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"epoch": 1.1990407673860912,
|
| 287 |
+
"grad_norm": 0.060828547924757004,
|
| 288 |
+
"learning_rate": 0.003757043890171755,
|
| 289 |
+
"loss": 0.2165,
|
| 290 |
+
"step": 1000
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 1.2290167865707433,
|
| 294 |
+
"grad_norm": 0.0641385167837143,
|
| 295 |
+
"learning_rate": 0.0036808951397742378,
|
| 296 |
+
"loss": 0.218,
|
| 297 |
+
"step": 1025
|
| 298 |
+
},
|
| 299 |
+
{
|
| 300 |
+
"epoch": 1.2589928057553956,
|
| 301 |
+
"grad_norm": 0.05649897828698158,
|
| 302 |
+
"learning_rate": 0.0036033089284490745,
|
| 303 |
+
"loss": 0.2094,
|
| 304 |
+
"step": 1050
|
| 305 |
+
},
|
| 306 |
+
{
|
| 307 |
+
"epoch": 1.288968824940048,
|
| 308 |
+
"grad_norm": 0.05249471217393875,
|
| 309 |
+
"learning_rate": 0.003524379699084162,
|
| 310 |
+
"loss": 0.2028,
|
| 311 |
+
"step": 1075
|
| 312 |
+
},
|
| 313 |
+
{
|
| 314 |
+
"epoch": 1.3189448441247003,
|
| 315 |
+
"grad_norm": 0.05551101639866829,
|
| 316 |
+
"learning_rate": 0.0034442035293746655,
|
| 317 |
+
"loss": 0.2037,
|
| 318 |
+
"step": 1100
|
| 319 |
+
},
|
| 320 |
+
{
|
| 321 |
+
"epoch": 1.3489208633093526,
|
| 322 |
+
"grad_norm": 0.05503613501787186,
|
| 323 |
+
"learning_rate": 0.003362878014871117,
|
| 324 |
+
"loss": 0.208,
|
| 325 |
+
"step": 1125
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"epoch": 1.3788968824940047,
|
| 329 |
+
"grad_norm": 0.05301510915160179,
|
| 330 |
+
"learning_rate": 0.0032805021501798805,
|
| 331 |
+
"loss": 0.2012,
|
| 332 |
+
"step": 1150
|
| 333 |
+
},
|
| 334 |
+
{
|
| 335 |
+
"epoch": 1.4088729016786572,
|
| 336 |
+
"grad_norm": 0.056410036981105804,
|
| 337 |
+
"learning_rate": 0.0031971762084606003,
|
| 338 |
+
"loss": 0.2095,
|
| 339 |
+
"step": 1175
|
| 340 |
+
},
|
| 341 |
+
{
|
| 342 |
+
"epoch": 1.4388489208633093,
|
| 343 |
+
"grad_norm": 0.05041206628084183,
|
| 344 |
+
"learning_rate": 0.0031130016193673137,
|
| 345 |
+
"loss": 0.1943,
|
| 346 |
+
"step": 1200
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"epoch": 1.4688249400479616,
|
| 350 |
+
"grad_norm": 0.05369720607995987,
|
| 351 |
+
"learning_rate": 0.003028080845581801,
|
| 352 |
+
"loss": 0.2021,
|
| 353 |
+
"step": 1225
|
| 354 |
+
},
|
| 355 |
+
{
|
| 356 |
+
"epoch": 1.498800959232614,
|
| 357 |
+
"grad_norm": 0.05225532501935959,
|
| 358 |
+
"learning_rate": 0.00294251725808947,
|
| 359 |
+
"loss": 0.2058,
|
| 360 |
+
"step": 1250
|
| 361 |
+
},
|
| 362 |
+
{
|
| 363 |
+
"epoch": 1.5287769784172662,
|
| 364 |
+
"grad_norm": 0.052846185863018036,
|
| 365 |
+
"learning_rate": 0.0028564150103495963,
|
| 366 |
+
"loss": 0.204,
|
| 367 |
+
"step": 1275
|
| 368 |
+
},
|
| 369 |
+
{
|
| 370 |
+
"epoch": 1.5587529976019185,
|
| 371 |
+
"grad_norm": 0.051086682826280594,
|
| 372 |
+
"learning_rate": 0.002769878911513086,
|
| 373 |
+
"loss": 0.1961,
|
| 374 |
+
"step": 1300
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"epoch": 1.5887290167865706,
|
| 378 |
+
"grad_norm": 0.055970534682273865,
|
| 379 |
+
"learning_rate": 0.0026830142988420866,
|
| 380 |
+
"loss": 0.2012,
|
| 381 |
+
"step": 1325
|
| 382 |
+
},
|
| 383 |
+
{
|
| 384 |
+
"epoch": 1.6187050359712232,
|
| 385 |
+
"grad_norm": 0.0505131334066391,
|
| 386 |
+
"learning_rate": 0.0025959269094867525,
|
| 387 |
+
"loss": 0.1975,
|
| 388 |
+
"step": 1350
|
| 389 |
+
},
|
| 390 |
+
{
|
| 391 |
+
"epoch": 1.6486810551558753,
|
| 392 |
+
"grad_norm": 0.050360601395368576,
|
| 393 |
+
"learning_rate": 0.0025087227517752355,
|
| 394 |
+
"loss": 0.2029,
|
| 395 |
+
"step": 1375
|
| 396 |
+
},
|
| 397 |
+
{
|
| 398 |
+
"epoch": 1.6786570743405276,
|
| 399 |
+
"grad_norm": 0.0552959144115448,
|
| 400 |
+
"learning_rate": 0.0024215079761735793,
|
| 401 |
+
"loss": 0.1986,
|
| 402 |
+
"step": 1400
|
| 403 |
+
},
|
| 404 |
+
{
|
| 405 |
+
"epoch": 1.70863309352518,
|
| 406 |
+
"grad_norm": 0.04918622598052025,
|
| 407 |
+
"learning_rate": 0.0023343887460726058,
|
| 408 |
+
"loss": 0.1966,
|
| 409 |
+
"step": 1425
|
| 410 |
+
},
|
| 411 |
+
{
|
| 412 |
+
"epoch": 1.738609112709832,
|
| 413 |
+
"grad_norm": 0.05339549854397774,
|
| 414 |
+
"learning_rate": 0.0022474711085590524,
|
| 415 |
+
"loss": 0.2022,
|
| 416 |
+
"step": 1450
|
| 417 |
+
},
|
| 418 |
+
{
|
| 419 |
+
"epoch": 1.7685851318944845,
|
| 420 |
+
"grad_norm": 0.044814374297857285,
|
| 421 |
+
"learning_rate": 0.002160860865328295,
|
| 422 |
+
"loss": 0.1953,
|
| 423 |
+
"step": 1475
|
| 424 |
+
},
|
| 425 |
+
{
|
| 426 |
+
"epoch": 1.7985611510791366,
|
| 427 |
+
"grad_norm": 0.05152401328086853,
|
| 428 |
+
"learning_rate": 0.002074663443895771,
|
| 429 |
+
"loss": 0.1974,
|
| 430 |
+
"step": 1500
|
| 431 |
+
},
|
| 432 |
+
{
|
| 433 |
+
"epoch": 1.828537170263789,
|
| 434 |
+
"grad_norm": 0.04496421292424202,
|
| 435 |
+
"learning_rate": 0.001988983769263877,
|
| 436 |
+
"loss": 0.1926,
|
| 437 |
+
"step": 1525
|
| 438 |
+
},
|
| 439 |
+
{
|
| 440 |
+
"epoch": 1.8585131894484412,
|
| 441 |
+
"grad_norm": 0.0547536201775074,
|
| 442 |
+
"learning_rate": 0.001903926136200566,
|
| 443 |
+
"loss": 0.1992,
|
| 444 |
+
"step": 1550
|
| 445 |
+
},
|
| 446 |
+
{
|
| 447 |
+
"epoch": 1.8884892086330936,
|
| 448 |
+
"grad_norm": 0.04525403305888176,
|
| 449 |
+
"learning_rate": 0.0018195940822850927,
|
| 450 |
+
"loss": 0.1976,
|
| 451 |
+
"step": 1575
|
| 452 |
+
},
|
| 453 |
+
{
|
| 454 |
+
"epoch": 1.9184652278177459,
|
| 455 |
+
"grad_norm": 0.045630406588315964,
|
| 456 |
+
"learning_rate": 0.0017360902618754664,
|
| 457 |
+
"loss": 0.2022,
|
| 458 |
+
"step": 1600
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"epoch": 1.948441247002398,
|
| 462 |
+
"grad_norm": 0.046241626143455505,
|
| 463 |
+
"learning_rate": 0.0016535163211510203,
|
| 464 |
+
"loss": 0.1926,
|
| 465 |
+
"step": 1625
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"epoch": 1.9784172661870505,
|
| 469 |
+
"grad_norm": 0.04715004190802574,
|
| 470 |
+
"learning_rate": 0.0015719727743821854,
|
| 471 |
+
"loss": 0.1947,
|
| 472 |
+
"step": 1650
|
| 473 |
+
},
|
| 474 |
+
{
|
| 475 |
+
"epoch": 2.0083932853717026,
|
| 476 |
+
"grad_norm": 0.046405646950006485,
|
| 477 |
+
"learning_rate": 0.0014915588815781152,
|
| 478 |
+
"loss": 0.1849,
|
| 479 |
+
"step": 1675
|
| 480 |
+
},
|
| 481 |
+
{
|
| 482 |
+
"epoch": 2.038369304556355,
|
| 483 |
+
"grad_norm": 0.04901168495416641,
|
| 484 |
+
"learning_rate": 0.0014123725276610638,
|
| 485 |
+
"loss": 0.1587,
|
| 486 |
+
"step": 1700
|
| 487 |
+
},
|
| 488 |
+
{
|
| 489 |
+
"epoch": 2.068345323741007,
|
| 490 |
+
"grad_norm": 0.05803445354104042,
|
| 491 |
+
"learning_rate": 0.0013345101033146085,
|
| 492 |
+
"loss": 0.1605,
|
| 493 |
+
"step": 1725
|
| 494 |
+
},
|
| 495 |
+
{
|
| 496 |
+
"epoch": 2.0983213429256593,
|
| 497 |
+
"grad_norm": 0.05447980388998985,
|
| 498 |
+
"learning_rate": 0.0012580663876507647,
|
| 499 |
+
"loss": 0.1601,
|
| 500 |
+
"step": 1750
|
| 501 |
+
},
|
| 502 |
+
{
|
| 503 |
+
"epoch": 2.128297362110312,
|
| 504 |
+
"grad_norm": 0.05418948829174042,
|
| 505 |
+
"learning_rate": 0.0011831344328387986,
|
| 506 |
+
"loss": 0.1577,
|
| 507 |
+
"step": 1775
|
| 508 |
+
},
|
| 509 |
+
{
|
| 510 |
+
"epoch": 2.158273381294964,
|
| 511 |
+
"grad_norm": 0.055272314697504044,
|
| 512 |
+
"learning_rate": 0.0011098054508361854,
|
| 513 |
+
"loss": 0.1596,
|
| 514 |
+
"step": 1800
|
| 515 |
+
},
|
| 516 |
+
{
|
| 517 |
+
"epoch": 2.1882494004796165,
|
| 518 |
+
"grad_norm": 0.04978896677494049,
|
| 519 |
+
"learning_rate": 0.0010381687023596014,
|
| 520 |
+
"loss": 0.1634,
|
| 521 |
+
"step": 1825
|
| 522 |
+
},
|
| 523 |
+
{
|
| 524 |
+
"epoch": 2.2182254196642686,
|
| 525 |
+
"grad_norm": 0.052053723484277725,
|
| 526 |
+
"learning_rate": 0.0009683113882310735,
|
| 527 |
+
"loss": 0.1565,
|
| 528 |
+
"step": 1850
|
| 529 |
+
},
|
| 530 |
+
{
|
| 531 |
+
"epoch": 2.2482014388489207,
|
| 532 |
+
"grad_norm": 0.04874909296631813,
|
| 533 |
+
"learning_rate": 0.0009003185432315822,
|
| 534 |
+
"loss": 0.1597,
|
| 535 |
+
"step": 1875
|
| 536 |
+
},
|
| 537 |
+
{
|
| 538 |
+
"epoch": 2.278177458033573,
|
| 539 |
+
"grad_norm": 0.04999493435025215,
|
| 540 |
+
"learning_rate": 0.0008342729325912946,
|
| 541 |
+
"loss": 0.1554,
|
| 542 |
+
"step": 1900
|
| 543 |
+
},
|
| 544 |
+
{
|
| 545 |
+
"epoch": 2.3081534772182253,
|
| 546 |
+
"grad_norm": 0.051304448395967484,
|
| 547 |
+
"learning_rate": 0.0007702549512424437,
|
| 548 |
+
"loss": 0.1617,
|
| 549 |
+
"step": 1925
|
| 550 |
+
},
|
| 551 |
+
{
|
| 552 |
+
"epoch": 2.338129496402878,
|
| 553 |
+
"grad_norm": 0.04773577302694321,
|
| 554 |
+
"learning_rate": 0.0007083425259574896,
|
| 555 |
+
"loss": 0.1563,
|
| 556 |
+
"step": 1950
|
| 557 |
+
},
|
| 558 |
+
{
|
| 559 |
+
"epoch": 2.36810551558753,
|
| 560 |
+
"grad_norm": 0.04622693732380867,
|
| 561 |
+
"learning_rate": 0.0006486110204916776,
|
| 562 |
+
"loss": 0.1582,
|
| 563 |
+
"step": 1975
|
| 564 |
+
},
|
| 565 |
+
{
|
| 566 |
+
"epoch": 2.3980815347721824,
|
| 567 |
+
"grad_norm": 0.05061614140868187,
|
| 568 |
+
"learning_rate": 0.000591133143845462,
|
| 569 |
+
"loss": 0.1544,
|
| 570 |
+
"step": 2000
|
| 571 |
+
},
|
| 572 |
+
{
|
| 573 |
+
"epoch": 2.4280575539568345,
|
| 574 |
+
"grad_norm": 0.05210672691464424,
|
| 575 |
+
"learning_rate": 0.0005359788617584769,
|
| 576 |
+
"loss": 0.1575,
|
| 577 |
+
"step": 2025
|
| 578 |
+
},
|
| 579 |
+
{
|
| 580 |
+
"epoch": 2.4580335731414866,
|
| 581 |
+
"grad_norm": 0.049017682671546936,
|
| 582 |
+
"learning_rate": 0.00048321531154276706,
|
| 583 |
+
"loss": 0.1578,
|
| 584 |
+
"step": 2050
|
| 585 |
+
},
|
| 586 |
+
{
|
| 587 |
+
"epoch": 2.488009592326139,
|
| 588 |
+
"grad_norm": 0.061639346182346344,
|
| 589 |
+
"learning_rate": 0.0004329067203589709,
|
| 590 |
+
"loss": 0.1544,
|
| 591 |
+
"step": 2075
|
| 592 |
+
},
|
| 593 |
+
{
|
| 594 |
+
"epoch": 2.5179856115107913,
|
| 595 |
+
"grad_norm": 0.05240131914615631,
|
| 596 |
+
"learning_rate": 0.00038511432703492083,
|
| 597 |
+
"loss": 0.1568,
|
| 598 |
+
"step": 2100
|
| 599 |
+
},
|
| 600 |
+
{
|
| 601 |
+
"epoch": 2.547961630695444,
|
| 602 |
+
"grad_norm": 0.05182984471321106,
|
| 603 |
+
"learning_rate": 0.0003398963075218309,
|
| 604 |
+
"loss": 0.1567,
|
| 605 |
+
"step": 2125
|
| 606 |
+
},
|
| 607 |
+
{
|
| 608 |
+
"epoch": 2.577937649880096,
|
| 609 |
+
"grad_norm": 0.04628787934780121,
|
| 610 |
+
"learning_rate": 0.0002973077040788205,
|
| 611 |
+
"loss": 0.1528,
|
| 612 |
+
"step": 2150
|
| 613 |
+
},
|
| 614 |
+
{
|
| 615 |
+
"epoch": 2.6079136690647484,
|
| 616 |
+
"grad_norm": 0.056303806602954865,
|
| 617 |
+
"learning_rate": 0.00025740035827196165,
|
| 618 |
+
"loss": 0.1515,
|
| 619 |
+
"step": 2175
|
| 620 |
+
},
|
| 621 |
+
{
|
| 622 |
+
"epoch": 2.6378896882494005,
|
| 623 |
+
"grad_norm": 0.05215095728635788,
|
| 624 |
+
"learning_rate": 0.00022022284786941544,
|
| 625 |
+
"loss": 0.1527,
|
| 626 |
+
"step": 2200
|
| 627 |
+
},
|
| 628 |
+
{
|
| 629 |
+
"epoch": 2.6678657074340526,
|
| 630 |
+
"grad_norm": 0.04626120626926422,
|
| 631 |
+
"learning_rate": 0.00018582042770947467,
|
| 632 |
+
"loss": 0.1541,
|
| 633 |
+
"step": 2225
|
| 634 |
+
},
|
| 635 |
+
{
|
| 636 |
+
"epoch": 2.697841726618705,
|
| 637 |
+
"grad_norm": 0.04964889958500862,
|
| 638 |
+
"learning_rate": 0.0001542349746134855,
|
| 639 |
+
"loss": 0.1574,
|
| 640 |
+
"step": 2250
|
| 641 |
+
},
|
| 642 |
+
{
|
| 643 |
+
"epoch": 2.7278177458033572,
|
| 644 |
+
"grad_norm": 0.05072702094912529,
|
| 645 |
+
"learning_rate": 0.00012550493641070665,
|
| 646 |
+
"loss": 0.1609,
|
| 647 |
+
"step": 2275
|
| 648 |
+
},
|
| 649 |
+
{
|
| 650 |
+
"epoch": 2.7577937649880093,
|
| 651 |
+
"grad_norm": 0.048793647438287735,
|
| 652 |
+
"learning_rate": 9.966528513716072e-05,
|
| 653 |
+
"loss": 0.151,
|
| 654 |
+
"step": 2300
|
| 655 |
+
},
|
| 656 |
+
{
|
| 657 |
+
"epoch": 2.787769784172662,
|
| 658 |
+
"grad_norm": 0.04410620033740997,
|
| 659 |
+
"learning_rate": 7.674747446543756e-05,
|
| 660 |
+
"loss": 0.149,
|
| 661 |
+
"step": 2325
|
| 662 |
+
},
|
| 663 |
+
{
|
| 664 |
+
"epoch": 2.8177458033573144,
|
| 665 |
+
"grad_norm": 0.04964963719248772,
|
| 666 |
+
"learning_rate": 5.677940141727761e-05,
|
| 667 |
+
"loss": 0.1524,
|
| 668 |
+
"step": 2350
|
| 669 |
+
},
|
| 670 |
+
{
|
| 671 |
+
"epoch": 2.8477218225419665,
|
| 672 |
+
"grad_norm": 0.052818212658166885,
|
| 673 |
+
"learning_rate": 3.9785372405537756e-05,
|
| 674 |
+
"loss": 0.149,
|
| 675 |
+
"step": 2375
|
| 676 |
+
},
|
| 677 |
+
{
|
| 678 |
+
"epoch": 2.8776978417266186,
|
| 679 |
+
"grad_norm": 0.07118818908929825,
|
| 680 |
+
"learning_rate": 2.5786073646871523e-05,
|
| 681 |
+
"loss": 0.1528,
|
| 682 |
+
"step": 2400
|
| 683 |
+
},
|
| 684 |
+
{
|
| 685 |
+
"epoch": 2.907673860911271,
|
| 686 |
+
"grad_norm": 0.04659281671047211,
|
| 687 |
+
"learning_rate": 1.479854598114977e-05,
|
| 688 |
+
"loss": 0.1544,
|
| 689 |
+
"step": 2425
|
| 690 |
+
},
|
| 691 |
+
{
|
| 692 |
+
"epoch": 2.937649880095923,
|
| 693 |
+
"grad_norm": 0.04493272304534912,
|
| 694 |
+
"learning_rate": 6.836164128259103e-06,
|
| 695 |
+
"loss": 0.1504,
|
| 696 |
+
"step": 2450
|
| 697 |
+
},
|
| 698 |
+
{
|
| 699 |
+
"epoch": 2.9676258992805753,
|
| 700 |
+
"grad_norm": 0.06299802660942078,
|
| 701 |
+
"learning_rate": 1.908620407542472e-06,
|
| 702 |
+
"loss": 0.1501,
|
| 703 |
+
"step": 2475
|
| 704 |
+
},
|
| 705 |
+
{
|
| 706 |
+
"epoch": 2.997601918465228,
|
| 707 |
+
"grad_norm": 0.053529493510723114,
|
| 708 |
+
"learning_rate": 2.191293968722974e-08,
|
| 709 |
+
"loss": 0.152,
|
| 710 |
+
"step": 2500
|
| 711 |
+
},
|
| 712 |
+
{
|
| 713 |
+
"epoch": 3.0,
|
| 714 |
+
"step": 2502,
|
| 715 |
+
"total_flos": 2.43882352705536e+18,
|
| 716 |
+
"train_loss": 0.215125925130219,
|
| 717 |
+
"train_runtime": 3297.3883,
|
| 718 |
+
"train_samples_per_second": 36.392,
|
| 719 |
+
"train_steps_per_second": 0.759
|
| 720 |
+
}
|
| 721 |
+
],
|
| 722 |
+
"logging_steps": 25,
|
| 723 |
+
"max_steps": 2502,
|
| 724 |
+
"num_input_tokens_seen": 0,
|
| 725 |
+
"num_train_epochs": 3,
|
| 726 |
+
"save_steps": 0,
|
| 727 |
+
"stateful_callbacks": {
|
| 728 |
+
"TrainerControl": {
|
| 729 |
+
"args": {
|
| 730 |
+
"should_epoch_stop": false,
|
| 731 |
+
"should_evaluate": false,
|
| 732 |
+
"should_log": false,
|
| 733 |
+
"should_save": true,
|
| 734 |
+
"should_training_stop": true
|
| 735 |
+
},
|
| 736 |
+
"attributes": {}
|
| 737 |
+
}
|
| 738 |
+
},
|
| 739 |
+
"total_flos": 2.43882352705536e+18,
|
| 740 |
+
"train_batch_size": 48,
|
| 741 |
+
"trial_name": null,
|
| 742 |
+
"trial_params": null
|
| 743 |
+
}
|
nl_tasks/exps/run_ex32/ft/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": false,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exps/run_ex32/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/exps/run_ex32/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/exps/run_ex32/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/exps/run_ex32/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/exps/run_ex32/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": true,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exps/run_ex32/ft2/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5e8b473fff55419f14a36da83dfd8d8f05944a51103982531702ea9c3fdd5c0c
|
| 3 |
+
size 33602915
|
nl_tasks/exps/run_ex32/trainer_state.json
ADDED
|
@@ -0,0 +1,743 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 2502,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.02997601918465228,
|
| 14 |
+
"grad_norm": 0.22439797222614288,
|
| 15 |
+
"learning_rate": 9.56175298804781e-05,
|
| 16 |
+
"loss": 0.634,
|
| 17 |
+
"step": 25
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.05995203836930456,
|
| 21 |
+
"grad_norm": 0.21105553209781647,
|
| 22 |
+
"learning_rate": 0.0001952191235059761,
|
| 23 |
+
"loss": 0.4028,
|
| 24 |
+
"step": 50
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.08992805755395683,
|
| 28 |
+
"grad_norm": 0.18460212647914886,
|
| 29 |
+
"learning_rate": 0.0002948207171314741,
|
| 30 |
+
"loss": 0.346,
|
| 31 |
+
"step": 75
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.11990407673860912,
|
| 35 |
+
"grad_norm": 0.19811777770519257,
|
| 36 |
+
"learning_rate": 0.0003944223107569721,
|
| 37 |
+
"loss": 0.3192,
|
| 38 |
+
"step": 100
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.1498800959232614,
|
| 42 |
+
"grad_norm": 0.18307138979434967,
|
| 43 |
+
"learning_rate": 0.0004940239043824702,
|
| 44 |
+
"loss": 0.3131,
|
| 45 |
+
"step": 125
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.17985611510791366,
|
| 49 |
+
"grad_norm": 0.19494092464447021,
|
| 50 |
+
"learning_rate": 0.0005936254980079682,
|
| 51 |
+
"loss": 0.3019,
|
| 52 |
+
"step": 150
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.20983213429256595,
|
| 56 |
+
"grad_norm": 0.19441217184066772,
|
| 57 |
+
"learning_rate": 0.0006932270916334662,
|
| 58 |
+
"loss": 0.2973,
|
| 59 |
+
"step": 175
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.23980815347721823,
|
| 63 |
+
"grad_norm": 0.1927807629108429,
|
| 64 |
+
"learning_rate": 0.0007928286852589641,
|
| 65 |
+
"loss": 0.3038,
|
| 66 |
+
"step": 200
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.2697841726618705,
|
| 70 |
+
"grad_norm": 0.17632770538330078,
|
| 71 |
+
"learning_rate": 0.0008924302788844621,
|
| 72 |
+
"loss": 0.2981,
|
| 73 |
+
"step": 225
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.2997601918465228,
|
| 77 |
+
"grad_norm": 0.19236312806606293,
|
| 78 |
+
"learning_rate": 0.00099203187250996,
|
| 79 |
+
"loss": 0.2966,
|
| 80 |
+
"step": 250
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.32973621103117506,
|
| 84 |
+
"grad_norm": 0.21083885431289673,
|
| 85 |
+
"learning_rate": 0.0009997424229621528,
|
| 86 |
+
"loss": 0.2933,
|
| 87 |
+
"step": 275
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.3597122302158273,
|
| 91 |
+
"grad_norm": 0.21011164784431458,
|
| 92 |
+
"learning_rate": 0.0009988784753724707,
|
| 93 |
+
"loss": 0.343,
|
| 94 |
+
"step": 300
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 0.38968824940047964,
|
| 98 |
+
"grad_norm": 0.2327512800693512,
|
| 99 |
+
"learning_rate": 0.0009974072610646543,
|
| 100 |
+
"loss": 0.2838,
|
| 101 |
+
"step": 325
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 0.4196642685851319,
|
| 105 |
+
"grad_norm": 0.25379207730293274,
|
| 106 |
+
"learning_rate": 0.0009953305708946503,
|
| 107 |
+
"loss": 0.2835,
|
| 108 |
+
"step": 350
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 0.44964028776978415,
|
| 112 |
+
"grad_norm": 0.21607662737369537,
|
| 113 |
+
"learning_rate": 0.000992650932742266,
|
| 114 |
+
"loss": 0.2739,
|
| 115 |
+
"step": 375
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 0.47961630695443647,
|
| 119 |
+
"grad_norm": 0.17558318376541138,
|
| 120 |
+
"learning_rate": 0.0009893716084340722,
|
| 121 |
+
"loss": 0.2751,
|
| 122 |
+
"step": 400
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.5095923261390888,
|
| 126 |
+
"grad_norm": 0.17286434769630432,
|
| 127 |
+
"learning_rate": 0.0009854965897729,
|
| 128 |
+
"loss": 0.2812,
|
| 129 |
+
"step": 425
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 0.539568345323741,
|
| 133 |
+
"grad_norm": 0.15521785616874695,
|
| 134 |
+
"learning_rate": 0.0009810305936787634,
|
| 135 |
+
"loss": 0.2631,
|
| 136 |
+
"step": 450
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 0.5695443645083933,
|
| 140 |
+
"grad_norm": 0.19885142147541046,
|
| 141 |
+
"learning_rate": 0.0009759790564471232,
|
| 142 |
+
"loss": 0.2634,
|
| 143 |
+
"step": 475
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 0.5995203836930456,
|
| 147 |
+
"grad_norm": 0.16864049434661865,
|
| 148 |
+
"learning_rate": 0.0009703481271314822,
|
| 149 |
+
"loss": 0.2686,
|
| 150 |
+
"step": 500
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 0.6294964028776978,
|
| 154 |
+
"grad_norm": 0.18147310614585876,
|
| 155 |
+
"learning_rate": 0.0009641446600583632,
|
| 156 |
+
"loss": 0.2565,
|
| 157 |
+
"step": 525
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 0.6594724220623501,
|
| 161 |
+
"grad_norm": 0.1791866272687912,
|
| 162 |
+
"learning_rate": 0.0009573762064837866,
|
| 163 |
+
"loss": 0.2525,
|
| 164 |
+
"step": 550
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 0.6894484412470024,
|
| 168 |
+
"grad_norm": 0.16333305835723877,
|
| 169 |
+
"learning_rate": 0.0009500510054013988,
|
| 170 |
+
"loss": 0.2599,
|
| 171 |
+
"step": 575
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 0.7194244604316546,
|
| 175 |
+
"grad_norm": 0.15634645521640778,
|
| 176 |
+
"learning_rate": 0.0009421779735134444,
|
| 177 |
+
"loss": 0.2551,
|
| 178 |
+
"step": 600
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 0.749400479616307,
|
| 182 |
+
"grad_norm": 0.16058474779129028,
|
| 183 |
+
"learning_rate": 0.0009337666943767861,
|
| 184 |
+
"loss": 0.2518,
|
| 185 |
+
"step": 625
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 0.7793764988009593,
|
| 189 |
+
"grad_norm": 0.15423771739006042,
|
| 190 |
+
"learning_rate": 0.0009248274067371884,
|
| 191 |
+
"loss": 0.2507,
|
| 192 |
+
"step": 650
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 0.8093525179856115,
|
| 196 |
+
"grad_norm": 0.16649393737316132,
|
| 197 |
+
"learning_rate": 0.0009153709920660622,
|
| 198 |
+
"loss": 0.256,
|
| 199 |
+
"step": 675
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 0.8393285371702638,
|
| 203 |
+
"grad_norm": 0.14274722337722778,
|
| 204 |
+
"learning_rate": 0.0009054089613148438,
|
| 205 |
+
"loss": 0.2444,
|
| 206 |
+
"step": 700
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 0.8693045563549161,
|
| 210 |
+
"grad_norm": 0.14566783607006073,
|
| 211 |
+
"learning_rate": 0.0008949534409031304,
|
| 212 |
+
"loss": 0.2406,
|
| 213 |
+
"step": 725
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 0.8992805755395683,
|
| 217 |
+
"grad_norm": 0.16103540360927582,
|
| 218 |
+
"learning_rate": 0.0008840171579576273,
|
| 219 |
+
"loss": 0.2476,
|
| 220 |
+
"step": 750
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 0.9292565947242206,
|
| 224 |
+
"grad_norm": 0.13278396427631378,
|
| 225 |
+
"learning_rate": 0.0008726134248198781,
|
| 226 |
+
"loss": 0.2444,
|
| 227 |
+
"step": 775
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 0.9592326139088729,
|
| 231 |
+
"grad_norm": 0.13953395187854767,
|
| 232 |
+
"learning_rate": 0.000860756122841633,
|
| 233 |
+
"loss": 0.2429,
|
| 234 |
+
"step": 800
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 0.9892086330935251,
|
| 238 |
+
"grad_norm": 0.15767574310302734,
|
| 239 |
+
"learning_rate": 0.0008484596854875805,
|
| 240 |
+
"loss": 0.2382,
|
| 241 |
+
"step": 825
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"epoch": 1.0191846522781776,
|
| 245 |
+
"grad_norm": 0.17023757100105286,
|
| 246 |
+
"learning_rate": 0.0008357390807660139,
|
| 247 |
+
"loss": 0.2247,
|
| 248 |
+
"step": 850
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"epoch": 1.0491606714628297,
|
| 252 |
+
"grad_norm": 0.1432039439678192,
|
| 253 |
+
"learning_rate": 0.0008226097930088161,
|
| 254 |
+
"loss": 0.2148,
|
| 255 |
+
"step": 875
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"epoch": 1.079136690647482,
|
| 259 |
+
"grad_norm": 0.14797061681747437,
|
| 260 |
+
"learning_rate": 0.0008090878040229431,
|
| 261 |
+
"loss": 0.2162,
|
| 262 |
+
"step": 900
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 1.1091127098321343,
|
| 266 |
+
"grad_norm": 0.15065963566303253,
|
| 267 |
+
"learning_rate": 0.0007951895736363477,
|
| 268 |
+
"loss": 0.2234,
|
| 269 |
+
"step": 925
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"epoch": 1.1390887290167866,
|
| 273 |
+
"grad_norm": 0.13475900888442993,
|
| 274 |
+
"learning_rate": 0.0007809320196620271,
|
| 275 |
+
"loss": 0.2137,
|
| 276 |
+
"step": 950
|
| 277 |
+
},
|
| 278 |
+
{
|
| 279 |
+
"epoch": 1.169064748201439,
|
| 280 |
+
"grad_norm": 0.1358369141817093,
|
| 281 |
+
"learning_rate": 0.0007663324973045817,
|
| 282 |
+
"loss": 0.2167,
|
| 283 |
+
"step": 975
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"epoch": 1.1990407673860912,
|
| 287 |
+
"grad_norm": 0.15405559539794922,
|
| 288 |
+
"learning_rate": 0.000751408778034351,
|
| 289 |
+
"loss": 0.2193,
|
| 290 |
+
"step": 1000
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 1.2290167865707433,
|
| 294 |
+
"grad_norm": 0.15572060644626617,
|
| 295 |
+
"learning_rate": 0.0007361790279548476,
|
| 296 |
+
"loss": 0.2207,
|
| 297 |
+
"step": 1025
|
| 298 |
+
},
|
| 299 |
+
{
|
| 300 |
+
"epoch": 1.2589928057553956,
|
| 301 |
+
"grad_norm": 0.14885199069976807,
|
| 302 |
+
"learning_rate": 0.0007206617856898149,
|
| 303 |
+
"loss": 0.2122,
|
| 304 |
+
"step": 1050
|
| 305 |
+
},
|
| 306 |
+
{
|
| 307 |
+
"epoch": 1.288968824940048,
|
| 308 |
+
"grad_norm": 0.13450340926647186,
|
| 309 |
+
"learning_rate": 0.0007048759398168324,
|
| 310 |
+
"loss": 0.2053,
|
| 311 |
+
"step": 1075
|
| 312 |
+
},
|
| 313 |
+
{
|
| 314 |
+
"epoch": 1.3189448441247003,
|
| 315 |
+
"grad_norm": 0.12750637531280518,
|
| 316 |
+
"learning_rate": 0.0006888407058749331,
|
| 317 |
+
"loss": 0.2059,
|
| 318 |
+
"step": 1100
|
| 319 |
+
},
|
| 320 |
+
{
|
| 321 |
+
"epoch": 1.3489208633093526,
|
| 322 |
+
"grad_norm": 0.1344868540763855,
|
| 323 |
+
"learning_rate": 0.0006725756029742234,
|
| 324 |
+
"loss": 0.2108,
|
| 325 |
+
"step": 1125
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"epoch": 1.3788968824940047,
|
| 329 |
+
"grad_norm": 0.12938323616981506,
|
| 330 |
+
"learning_rate": 0.0006561004300359761,
|
| 331 |
+
"loss": 0.2038,
|
| 332 |
+
"step": 1150
|
| 333 |
+
},
|
| 334 |
+
{
|
| 335 |
+
"epoch": 1.4088729016786572,
|
| 336 |
+
"grad_norm": 0.13532765209674835,
|
| 337 |
+
"learning_rate": 0.00063943524169212,
|
| 338 |
+
"loss": 0.2107,
|
| 339 |
+
"step": 1175
|
| 340 |
+
},
|
| 341 |
+
{
|
| 342 |
+
"epoch": 1.4388489208633093,
|
| 343 |
+
"grad_norm": 0.1328439861536026,
|
| 344 |
+
"learning_rate": 0.0006226003238734627,
|
| 345 |
+
"loss": 0.1973,
|
| 346 |
+
"step": 1200
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"epoch": 1.4688249400479616,
|
| 350 |
+
"grad_norm": 0.14724420011043549,
|
| 351 |
+
"learning_rate": 0.0006056161691163601,
|
| 352 |
+
"loss": 0.2038,
|
| 353 |
+
"step": 1225
|
| 354 |
+
},
|
| 355 |
+
{
|
| 356 |
+
"epoch": 1.498800959232614,
|
| 357 |
+
"grad_norm": 0.1372281312942505,
|
| 358 |
+
"learning_rate": 0.000588503451617894,
|
| 359 |
+
"loss": 0.2063,
|
| 360 |
+
"step": 1250
|
| 361 |
+
},
|
| 362 |
+
{
|
| 363 |
+
"epoch": 1.5287769784172662,
|
| 364 |
+
"grad_norm": 0.13836322724819183,
|
| 365 |
+
"learning_rate": 0.0005712830020699192,
|
| 366 |
+
"loss": 0.2056,
|
| 367 |
+
"step": 1275
|
| 368 |
+
},
|
| 369 |
+
{
|
| 370 |
+
"epoch": 1.5587529976019185,
|
| 371 |
+
"grad_norm": 0.12976917624473572,
|
| 372 |
+
"learning_rate": 0.0005539757823026172,
|
| 373 |
+
"loss": 0.1973,
|
| 374 |
+
"step": 1300
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"epoch": 1.5887290167865706,
|
| 378 |
+
"grad_norm": 0.1546931117773056,
|
| 379 |
+
"learning_rate": 0.0005366028597684172,
|
| 380 |
+
"loss": 0.2028,
|
| 381 |
+
"step": 1325
|
| 382 |
+
},
|
| 383 |
+
{
|
| 384 |
+
"epoch": 1.6187050359712232,
|
| 385 |
+
"grad_norm": 0.12072357535362244,
|
| 386 |
+
"learning_rate": 0.0005191853818973506,
|
| 387 |
+
"loss": 0.1979,
|
| 388 |
+
"step": 1350
|
| 389 |
+
},
|
| 390 |
+
{
|
| 391 |
+
"epoch": 1.6486810551558753,
|
| 392 |
+
"grad_norm": 0.13212819397449493,
|
| 393 |
+
"learning_rate": 0.0005017445503550471,
|
| 394 |
+
"loss": 0.2051,
|
| 395 |
+
"step": 1375
|
| 396 |
+
},
|
| 397 |
+
{
|
| 398 |
+
"epoch": 1.6786570743405276,
|
| 399 |
+
"grad_norm": 0.13972344994544983,
|
| 400 |
+
"learning_rate": 0.00048430159523471587,
|
| 401 |
+
"loss": 0.1999,
|
| 402 |
+
"step": 1400
|
| 403 |
+
},
|
| 404 |
+
{
|
| 405 |
+
"epoch": 1.70863309352518,
|
| 406 |
+
"grad_norm": 0.13032999634742737,
|
| 407 |
+
"learning_rate": 0.00046687774921452113,
|
| 408 |
+
"loss": 0.1975,
|
| 409 |
+
"step": 1425
|
| 410 |
+
},
|
| 411 |
+
{
|
| 412 |
+
"epoch": 1.738609112709832,
|
| 413 |
+
"grad_norm": 0.1394297480583191,
|
| 414 |
+
"learning_rate": 0.00044949422171181047,
|
| 415 |
+
"loss": 0.2031,
|
| 416 |
+
"step": 1450
|
| 417 |
+
},
|
| 418 |
+
{
|
| 419 |
+
"epoch": 1.7685851318944845,
|
| 420 |
+
"grad_norm": 0.12275266647338867,
|
| 421 |
+
"learning_rate": 0.0004321721730656589,
|
| 422 |
+
"loss": 0.1966,
|
| 423 |
+
"step": 1475
|
| 424 |
+
},
|
| 425 |
+
{
|
| 426 |
+
"epoch": 1.7985611510791366,
|
| 427 |
+
"grad_norm": 0.12649331986904144,
|
| 428 |
+
"learning_rate": 0.0004149326887791541,
|
| 429 |
+
"loss": 0.1983,
|
| 430 |
+
"step": 1500
|
| 431 |
+
},
|
| 432 |
+
{
|
| 433 |
+
"epoch": 1.828537170263789,
|
| 434 |
+
"grad_norm": 0.11251500993967056,
|
| 435 |
+
"learning_rate": 0.0003977967538527754,
|
| 436 |
+
"loss": 0.1932,
|
| 437 |
+
"step": 1525
|
| 438 |
+
},
|
| 439 |
+
{
|
| 440 |
+
"epoch": 1.8585131894484412,
|
| 441 |
+
"grad_norm": 0.13119769096374512,
|
| 442 |
+
"learning_rate": 0.0003807852272401132,
|
| 443 |
+
"loss": 0.1995,
|
| 444 |
+
"step": 1550
|
| 445 |
+
},
|
| 446 |
+
{
|
| 447 |
+
"epoch": 1.8884892086330936,
|
| 448 |
+
"grad_norm": 0.12021032720804214,
|
| 449 |
+
"learning_rate": 0.0003639188164570185,
|
| 450 |
+
"loss": 0.1985,
|
| 451 |
+
"step": 1575
|
| 452 |
+
},
|
| 453 |
+
{
|
| 454 |
+
"epoch": 1.9184652278177459,
|
| 455 |
+
"grad_norm": 0.12251219153404236,
|
| 456 |
+
"learning_rate": 0.0003472180523750933,
|
| 457 |
+
"loss": 0.2041,
|
| 458 |
+
"step": 1600
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"epoch": 1.948441247002398,
|
| 462 |
+
"grad_norm": 0.11931514739990234,
|
| 463 |
+
"learning_rate": 0.0003307032642302041,
|
| 464 |
+
"loss": 0.1933,
|
| 465 |
+
"step": 1625
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"epoch": 1.9784172661870505,
|
| 469 |
+
"grad_norm": 0.12238750606775284,
|
| 470 |
+
"learning_rate": 0.0003143945548764371,
|
| 471 |
+
"loss": 0.195,
|
| 472 |
+
"step": 1650
|
| 473 |
+
},
|
| 474 |
+
{
|
| 475 |
+
"epoch": 2.0083932853717026,
|
| 476 |
+
"grad_norm": 0.13976894319057465,
|
| 477 |
+
"learning_rate": 0.00029831177631562306,
|
| 478 |
+
"loss": 0.1858,
|
| 479 |
+
"step": 1675
|
| 480 |
+
},
|
| 481 |
+
{
|
| 482 |
+
"epoch": 2.038369304556355,
|
| 483 |
+
"grad_norm": 0.12677723169326782,
|
| 484 |
+
"learning_rate": 0.0002824745055322128,
|
| 485 |
+
"loss": 0.1608,
|
| 486 |
+
"step": 1700
|
| 487 |
+
},
|
| 488 |
+
{
|
| 489 |
+
"epoch": 2.068345323741007,
|
| 490 |
+
"grad_norm": 0.13730570673942566,
|
| 491 |
+
"learning_rate": 0.0002669020206629217,
|
| 492 |
+
"loss": 0.1632,
|
| 493 |
+
"step": 1725
|
| 494 |
+
},
|
| 495 |
+
{
|
| 496 |
+
"epoch": 2.0983213429256593,
|
| 497 |
+
"grad_norm": 0.1422010064125061,
|
| 498 |
+
"learning_rate": 0.00025161327753015297,
|
| 499 |
+
"loss": 0.1619,
|
| 500 |
+
"step": 1750
|
| 501 |
+
},
|
| 502 |
+
{
|
| 503 |
+
"epoch": 2.128297362110312,
|
| 504 |
+
"grad_norm": 0.14339914917945862,
|
| 505 |
+
"learning_rate": 0.00023662688656775972,
|
| 506 |
+
"loss": 0.1607,
|
| 507 |
+
"step": 1775
|
| 508 |
+
},
|
| 509 |
+
{
|
| 510 |
+
"epoch": 2.158273381294964,
|
| 511 |
+
"grad_norm": 0.14340265095233917,
|
| 512 |
+
"learning_rate": 0.00022196109016723708,
|
| 513 |
+
"loss": 0.1611,
|
| 514 |
+
"step": 1800
|
| 515 |
+
},
|
| 516 |
+
{
|
| 517 |
+
"epoch": 2.1882494004796165,
|
| 518 |
+
"grad_norm": 0.14029954373836517,
|
| 519 |
+
"learning_rate": 0.0002076337404719203,
|
| 520 |
+
"loss": 0.1657,
|
| 521 |
+
"step": 1825
|
| 522 |
+
},
|
| 523 |
+
{
|
| 524 |
+
"epoch": 2.2182254196642686,
|
| 525 |
+
"grad_norm": 0.13461889326572418,
|
| 526 |
+
"learning_rate": 0.00019366227764621468,
|
| 527 |
+
"loss": 0.1584,
|
| 528 |
+
"step": 1850
|
| 529 |
+
},
|
| 530 |
+
{
|
| 531 |
+
"epoch": 2.2482014388489207,
|
| 532 |
+
"grad_norm": 0.1285167932510376,
|
| 533 |
+
"learning_rate": 0.00018006370864631643,
|
| 534 |
+
"loss": 0.1622,
|
| 535 |
+
"step": 1875
|
| 536 |
+
},
|
| 537 |
+
{
|
| 538 |
+
"epoch": 2.278177458033573,
|
| 539 |
+
"grad_norm": 0.13294167816638947,
|
| 540 |
+
"learning_rate": 0.0001668545865182589,
|
| 541 |
+
"loss": 0.1577,
|
| 542 |
+
"step": 1900
|
| 543 |
+
},
|
| 544 |
+
{
|
| 545 |
+
"epoch": 2.3081534772182253,
|
| 546 |
+
"grad_norm": 0.1409018188714981,
|
| 547 |
+
"learning_rate": 0.00015405099024848874,
|
| 548 |
+
"loss": 0.1637,
|
| 549 |
+
"step": 1925
|
| 550 |
+
},
|
| 551 |
+
{
|
| 552 |
+
"epoch": 2.338129496402878,
|
| 553 |
+
"grad_norm": 0.13364772498607635,
|
| 554 |
+
"learning_rate": 0.00014166850519149794,
|
| 555 |
+
"loss": 0.1579,
|
| 556 |
+
"step": 1950
|
| 557 |
+
},
|
| 558 |
+
{
|
| 559 |
+
"epoch": 2.36810551558753,
|
| 560 |
+
"grad_norm": 0.12694032490253448,
|
| 561 |
+
"learning_rate": 0.0001297222040983355,
|
| 562 |
+
"loss": 0.1597,
|
| 563 |
+
"step": 1975
|
| 564 |
+
},
|
| 565 |
+
{
|
| 566 |
+
"epoch": 2.3980815347721824,
|
| 567 |
+
"grad_norm": 0.12603726983070374,
|
| 568 |
+
"learning_rate": 0.0001182266287690924,
|
| 569 |
+
"loss": 0.1569,
|
| 570 |
+
"step": 2000
|
| 571 |
+
},
|
| 572 |
+
{
|
| 573 |
+
"epoch": 2.4280575539568345,
|
| 574 |
+
"grad_norm": 0.12928339838981628,
|
| 575 |
+
"learning_rate": 0.00010719577235169537,
|
| 576 |
+
"loss": 0.1592,
|
| 577 |
+
"step": 2025
|
| 578 |
+
},
|
| 579 |
+
{
|
| 580 |
+
"epoch": 2.4580335731414866,
|
| 581 |
+
"grad_norm": 0.12346290051937103,
|
| 582 |
+
"learning_rate": 9.664306230855341e-05,
|
| 583 |
+
"loss": 0.1596,
|
| 584 |
+
"step": 2050
|
| 585 |
+
},
|
| 586 |
+
{
|
| 587 |
+
"epoch": 2.488009592326139,
|
| 588 |
+
"grad_norm": 0.1295640915632248,
|
| 589 |
+
"learning_rate": 8.658134407179418e-05,
|
| 590 |
+
"loss": 0.1561,
|
| 591 |
+
"step": 2075
|
| 592 |
+
},
|
| 593 |
+
{
|
| 594 |
+
"epoch": 2.5179856115107913,
|
| 595 |
+
"grad_norm": 0.13177761435508728,
|
| 596 |
+
"learning_rate": 7.702286540698416e-05,
|
| 597 |
+
"loss": 0.1597,
|
| 598 |
+
"step": 2100
|
| 599 |
+
},
|
| 600 |
+
{
|
| 601 |
+
"epoch": 2.547961630695444,
|
| 602 |
+
"grad_norm": 0.13077791035175323,
|
| 603 |
+
"learning_rate": 6.797926150436617e-05,
|
| 604 |
+
"loss": 0.1586,
|
| 605 |
+
"step": 2125
|
| 606 |
+
},
|
| 607 |
+
{
|
| 608 |
+
"epoch": 2.577937649880096,
|
| 609 |
+
"grad_norm": 0.117102712392807,
|
| 610 |
+
"learning_rate": 5.9461540815764105e-05,
|
| 611 |
+
"loss": 0.1551,
|
| 612 |
+
"step": 2150
|
| 613 |
+
},
|
| 614 |
+
{
|
| 615 |
+
"epoch": 2.6079136690647484,
|
| 616 |
+
"grad_norm": 0.14442621171474457,
|
| 617 |
+
"learning_rate": 5.1480071654392335e-05,
|
| 618 |
+
"loss": 0.1543,
|
| 619 |
+
"step": 2175
|
| 620 |
+
},
|
| 621 |
+
{
|
| 622 |
+
"epoch": 2.6378896882494005,
|
| 623 |
+
"grad_norm": 0.13530191779136658,
|
| 624 |
+
"learning_rate": 4.404456957388309e-05,
|
| 625 |
+
"loss": 0.1547,
|
| 626 |
+
"step": 2200
|
| 627 |
+
},
|
| 628 |
+
{
|
| 629 |
+
"epoch": 2.6678657074340526,
|
| 630 |
+
"grad_norm": 0.11663298308849335,
|
| 631 |
+
"learning_rate": 3.716408554189493e-05,
|
| 632 |
+
"loss": 0.1567,
|
| 633 |
+
"step": 2225
|
| 634 |
+
},
|
| 635 |
+
{
|
| 636 |
+
"epoch": 2.697841726618705,
|
| 637 |
+
"grad_norm": 0.13254733383655548,
|
| 638 |
+
"learning_rate": 3.08469949226971e-05,
|
| 639 |
+
"loss": 0.1605,
|
| 640 |
+
"step": 2250
|
| 641 |
+
},
|
| 642 |
+
{
|
| 643 |
+
"epoch": 2.7278177458033572,
|
| 644 |
+
"grad_norm": 0.13481509685516357,
|
| 645 |
+
"learning_rate": 2.510098728214133e-05,
|
| 646 |
+
"loss": 0.1638,
|
| 647 |
+
"step": 2275
|
| 648 |
+
},
|
| 649 |
+
{
|
| 650 |
+
"epoch": 2.7577937649880093,
|
| 651 |
+
"grad_norm": 0.12630033493041992,
|
| 652 |
+
"learning_rate": 1.9933057027432144e-05,
|
| 653 |
+
"loss": 0.1544,
|
| 654 |
+
"step": 2300
|
| 655 |
+
},
|
| 656 |
+
{
|
| 657 |
+
"epoch": 2.787769784172662,
|
| 658 |
+
"grad_norm": 0.11446399986743927,
|
| 659 |
+
"learning_rate": 1.5349494893087514e-05,
|
| 660 |
+
"loss": 0.1519,
|
| 661 |
+
"step": 2325
|
| 662 |
+
},
|
| 663 |
+
{
|
| 664 |
+
"epoch": 2.8177458033573144,
|
| 665 |
+
"grad_norm": 0.1315765082836151,
|
| 666 |
+
"learning_rate": 1.1355880283455521e-05,
|
| 667 |
+
"loss": 0.1559,
|
| 668 |
+
"step": 2350
|
| 669 |
+
},
|
| 670 |
+
{
|
| 671 |
+
"epoch": 2.8477218225419665,
|
| 672 |
+
"grad_norm": 0.13976338505744934,
|
| 673 |
+
"learning_rate": 7.95707448110755e-06,
|
| 674 |
+
"loss": 0.1513,
|
| 675 |
+
"step": 2375
|
| 676 |
+
},
|
| 677 |
+
{
|
| 678 |
+
"epoch": 2.8776978417266186,
|
| 679 |
+
"grad_norm": 0.15874944627285004,
|
| 680 |
+
"learning_rate": 5.157214729374305e-06,
|
| 681 |
+
"loss": 0.1558,
|
| 682 |
+
"step": 2400
|
| 683 |
+
},
|
| 684 |
+
{
|
| 685 |
+
"epoch": 2.907673860911271,
|
| 686 |
+
"grad_norm": 0.12649980187416077,
|
| 687 |
+
"learning_rate": 2.959709196229954e-06,
|
| 688 |
+
"loss": 0.1576,
|
| 689 |
+
"step": 2425
|
| 690 |
+
},
|
| 691 |
+
{
|
| 692 |
+
"epoch": 2.937649880095923,
|
| 693 |
+
"grad_norm": 0.121933713555336,
|
| 694 |
+
"learning_rate": 1.3672328256518206e-06,
|
| 695 |
+
"loss": 0.1541,
|
| 696 |
+
"step": 2450
|
| 697 |
+
},
|
| 698 |
+
{
|
| 699 |
+
"epoch": 2.9676258992805753,
|
| 700 |
+
"grad_norm": 0.14589641988277435,
|
| 701 |
+
"learning_rate": 3.8172408150849435e-07,
|
| 702 |
+
"loss": 0.1534,
|
| 703 |
+
"step": 2475
|
| 704 |
+
},
|
| 705 |
+
{
|
| 706 |
+
"epoch": 2.997601918465228,
|
| 707 |
+
"grad_norm": 0.14007411897182465,
|
| 708 |
+
"learning_rate": 4.382587937445947e-09,
|
| 709 |
+
"loss": 0.1536,
|
| 710 |
+
"step": 2500
|
| 711 |
+
},
|
| 712 |
+
{
|
| 713 |
+
"epoch": 3.0,
|
| 714 |
+
"step": 2502,
|
| 715 |
+
"total_flos": 2.43882352705536e+18,
|
| 716 |
+
"train_loss": 0.217716321051359,
|
| 717 |
+
"train_runtime": 3304.2612,
|
| 718 |
+
"train_samples_per_second": 36.317,
|
| 719 |
+
"train_steps_per_second": 0.757
|
| 720 |
+
}
|
| 721 |
+
],
|
| 722 |
+
"logging_steps": 25,
|
| 723 |
+
"max_steps": 2502,
|
| 724 |
+
"num_input_tokens_seen": 0,
|
| 725 |
+
"num_train_epochs": 3,
|
| 726 |
+
"save_steps": 0,
|
| 727 |
+
"stateful_callbacks": {
|
| 728 |
+
"TrainerControl": {
|
| 729 |
+
"args": {
|
| 730 |
+
"should_epoch_stop": false,
|
| 731 |
+
"should_evaluate": false,
|
| 732 |
+
"should_log": false,
|
| 733 |
+
"should_save": true,
|
| 734 |
+
"should_training_stop": true
|
| 735 |
+
},
|
| 736 |
+
"attributes": {}
|
| 737 |
+
}
|
| 738 |
+
},
|
| 739 |
+
"total_flos": 2.43882352705536e+18,
|
| 740 |
+
"train_batch_size": 48,
|
| 741 |
+
"trial_name": null,
|
| 742 |
+
"trial_params": null
|
| 743 |
+
}
|
nl_tasks/exps/run_ex33/ft/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": false,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exps/run_ex33/ft/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<s>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": false,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "</s>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": false,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "<unk>",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<unk>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
nl_tasks/exps/run_ex33/ft/tokenizer.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
nl_tasks/exps/run_ex33/ft/tokenizer.model
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
|
| 3 |
+
size 499723
|
nl_tasks/exps/run_ex33/ft/tokenizer_config.json
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": true,
|
| 3 |
+
"add_eos_token": false,
|
| 4 |
+
"add_prefix_space": null,
|
| 5 |
+
"added_tokens_decoder": {
|
| 6 |
+
"0": {
|
| 7 |
+
"content": "<unk>",
|
| 8 |
+
"lstrip": false,
|
| 9 |
+
"normalized": false,
|
| 10 |
+
"rstrip": false,
|
| 11 |
+
"single_word": false,
|
| 12 |
+
"special": true
|
| 13 |
+
},
|
| 14 |
+
"1": {
|
| 15 |
+
"content": "<s>",
|
| 16 |
+
"lstrip": false,
|
| 17 |
+
"normalized": false,
|
| 18 |
+
"rstrip": false,
|
| 19 |
+
"single_word": false,
|
| 20 |
+
"special": true
|
| 21 |
+
},
|
| 22 |
+
"2": {
|
| 23 |
+
"content": "</s>",
|
| 24 |
+
"lstrip": false,
|
| 25 |
+
"normalized": false,
|
| 26 |
+
"rstrip": false,
|
| 27 |
+
"single_word": false,
|
| 28 |
+
"special": true
|
| 29 |
+
}
|
| 30 |
+
},
|
| 31 |
+
"bos_token": "<s>",
|
| 32 |
+
"clean_up_tokenization_spaces": false,
|
| 33 |
+
"eos_token": "</s>",
|
| 34 |
+
"extra_special_tokens": {},
|
| 35 |
+
"legacy": false,
|
| 36 |
+
"model_max_length": 512,
|
| 37 |
+
"pad_token": "<unk>",
|
| 38 |
+
"padding_side": "right",
|
| 39 |
+
"sp_model_kwargs": {},
|
| 40 |
+
"tokenizer_class": "LlamaTokenizer",
|
| 41 |
+
"unk_token": "<unk>",
|
| 42 |
+
"use_default_system_prompt": false
|
| 43 |
+
}
|
nl_tasks/exps/run_ex33/ft2/adapter_config.json
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"T": 1.0,
|
| 3 |
+
"base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
|
| 4 |
+
"bias": "none",
|
| 5 |
+
"inference_mode": true,
|
| 6 |
+
"layers_to_transform": null,
|
| 7 |
+
"modules_to_save": null,
|
| 8 |
+
"num_rotations": 1,
|
| 9 |
+
"peft_type": "ROTATION",
|
| 10 |
+
"r": 16,
|
| 11 |
+
"revision": null,
|
| 12 |
+
"target_modules": [
|
| 13 |
+
"q_proj",
|
| 14 |
+
"v_proj"
|
| 15 |
+
],
|
| 16 |
+
"target_modules_to_skip": null,
|
| 17 |
+
"task_type": "CAUSAL_LM"
|
| 18 |
+
}
|
nl_tasks/exps/run_ex33/ft2/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:50b23d0f92322496640c91535debe14bd2599b7c55d1720ecbeff4fd370d1495
|
| 3 |
+
size 33602915
|
nl_tasks/exps/run_ex33/trainer_state.json
ADDED
|
@@ -0,0 +1,743 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 2502,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.02997601918465228,
|
| 14 |
+
"grad_norm": 0.24060821533203125,
|
| 15 |
+
"learning_rate": 0.0009561752988047809,
|
| 16 |
+
"loss": 0.4891,
|
| 17 |
+
"step": 25
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.05995203836930456,
|
| 21 |
+
"grad_norm": 0.2504737377166748,
|
| 22 |
+
"learning_rate": 0.001952191235059761,
|
| 23 |
+
"loss": 0.3442,
|
| 24 |
+
"step": 50
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.08992805755395683,
|
| 28 |
+
"grad_norm": 0.3224866986274719,
|
| 29 |
+
"learning_rate": 0.0029482071713147415,
|
| 30 |
+
"loss": 0.3373,
|
| 31 |
+
"step": 75
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.11990407673860912,
|
| 35 |
+
"grad_norm": 1.9630478620529175,
|
| 36 |
+
"learning_rate": 0.003944223107569721,
|
| 37 |
+
"loss": 0.367,
|
| 38 |
+
"step": 100
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.1498800959232614,
|
| 42 |
+
"grad_norm": 2.872677803039551,
|
| 43 |
+
"learning_rate": 0.004940239043824702,
|
| 44 |
+
"loss": 0.3491,
|
| 45 |
+
"step": 125
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.17985611510791366,
|
| 49 |
+
"grad_norm": 0.3211333751678467,
|
| 50 |
+
"learning_rate": 0.005936254980079682,
|
| 51 |
+
"loss": 0.3578,
|
| 52 |
+
"step": 150
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.20983213429256595,
|
| 56 |
+
"grad_norm": 0.14077529311180115,
|
| 57 |
+
"learning_rate": 0.006932270916334662,
|
| 58 |
+
"loss": 0.3252,
|
| 59 |
+
"step": 175
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.23980815347721823,
|
| 63 |
+
"grad_norm": 0.1346043050289154,
|
| 64 |
+
"learning_rate": 0.007928286852589641,
|
| 65 |
+
"loss": 0.3208,
|
| 66 |
+
"step": 200
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.2697841726618705,
|
| 70 |
+
"grad_norm": 0.12695597112178802,
|
| 71 |
+
"learning_rate": 0.008924302788844622,
|
| 72 |
+
"loss": 0.3113,
|
| 73 |
+
"step": 225
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.2997601918465228,
|
| 77 |
+
"grad_norm": 0.09868916869163513,
|
| 78 |
+
"learning_rate": 0.009920318725099601,
|
| 79 |
+
"loss": 0.3067,
|
| 80 |
+
"step": 250
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.32973621103117506,
|
| 84 |
+
"grad_norm": 0.10491207242012024,
|
| 85 |
+
"learning_rate": 0.009997424229621528,
|
| 86 |
+
"loss": 0.2987,
|
| 87 |
+
"step": 275
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.3597122302158273,
|
| 91 |
+
"grad_norm": 0.08867417275905609,
|
| 92 |
+
"learning_rate": 0.009988784753724706,
|
| 93 |
+
"loss": 0.2875,
|
| 94 |
+
"step": 300
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 0.38968824940047964,
|
| 98 |
+
"grad_norm": 0.07948382943868637,
|
| 99 |
+
"learning_rate": 0.009974072610646543,
|
| 100 |
+
"loss": 0.2814,
|
| 101 |
+
"step": 325
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 0.4196642685851319,
|
| 105 |
+
"grad_norm": 0.1311594694852829,
|
| 106 |
+
"learning_rate": 0.009953305708946503,
|
| 107 |
+
"loss": 0.2815,
|
| 108 |
+
"step": 350
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 0.44964028776978415,
|
| 112 |
+
"grad_norm": 0.09352317452430725,
|
| 113 |
+
"learning_rate": 0.00992650932742266,
|
| 114 |
+
"loss": 0.274,
|
| 115 |
+
"step": 375
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 0.47961630695443647,
|
| 119 |
+
"grad_norm": 0.06186804547905922,
|
| 120 |
+
"learning_rate": 0.009893716084340722,
|
| 121 |
+
"loss": 0.2739,
|
| 122 |
+
"step": 400
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.5095923261390888,
|
| 126 |
+
"grad_norm": 0.057951804250478745,
|
| 127 |
+
"learning_rate": 0.009854965897729,
|
| 128 |
+
"loss": 0.2802,
|
| 129 |
+
"step": 425
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 0.539568345323741,
|
| 133 |
+
"grad_norm": 0.05402874946594238,
|
| 134 |
+
"learning_rate": 0.009810305936787634,
|
| 135 |
+
"loss": 0.2619,
|
| 136 |
+
"step": 450
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 0.5695443645083933,
|
| 140 |
+
"grad_norm": 0.06987974047660828,
|
| 141 |
+
"learning_rate": 0.009759790564471233,
|
| 142 |
+
"loss": 0.2596,
|
| 143 |
+
"step": 475
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 0.5995203836930456,
|
| 147 |
+
"grad_norm": 0.055585265159606934,
|
| 148 |
+
"learning_rate": 0.009703481271314823,
|
| 149 |
+
"loss": 0.2663,
|
| 150 |
+
"step": 500
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 0.6294964028776978,
|
| 154 |
+
"grad_norm": 0.06418672204017639,
|
| 155 |
+
"learning_rate": 0.009641446600583633,
|
| 156 |
+
"loss": 0.2537,
|
| 157 |
+
"step": 525
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 0.6594724220623501,
|
| 161 |
+
"grad_norm": 0.05453021079301834,
|
| 162 |
+
"learning_rate": 0.009573762064837866,
|
| 163 |
+
"loss": 0.2506,
|
| 164 |
+
"step": 550
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 0.6894484412470024,
|
| 168 |
+
"grad_norm": 0.04687987267971039,
|
| 169 |
+
"learning_rate": 0.009500510054013988,
|
| 170 |
+
"loss": 0.2581,
|
| 171 |
+
"step": 575
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 0.7194244604316546,
|
| 175 |
+
"grad_norm": 0.05430283397436142,
|
| 176 |
+
"learning_rate": 0.009421779735134445,
|
| 177 |
+
"loss": 0.2524,
|
| 178 |
+
"step": 600
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 0.749400479616307,
|
| 182 |
+
"grad_norm": 0.04550522193312645,
|
| 183 |
+
"learning_rate": 0.009337666943767862,
|
| 184 |
+
"loss": 0.2492,
|
| 185 |
+
"step": 625
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 0.7793764988009593,
|
| 189 |
+
"grad_norm": 0.04663613811135292,
|
| 190 |
+
"learning_rate": 0.009248274067371885,
|
| 191 |
+
"loss": 0.2476,
|
| 192 |
+
"step": 650
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 0.8093525179856115,
|
| 196 |
+
"grad_norm": 0.05199963226914406,
|
| 197 |
+
"learning_rate": 0.009153709920660622,
|
| 198 |
+
"loss": 0.2548,
|
| 199 |
+
"step": 675
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 0.8393285371702638,
|
| 203 |
+
"grad_norm": 0.043098509311676025,
|
| 204 |
+
"learning_rate": 0.009054089613148438,
|
| 205 |
+
"loss": 0.243,
|
| 206 |
+
"step": 700
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 0.8693045563549161,
|
| 210 |
+
"grad_norm": 0.043702222406864166,
|
| 211 |
+
"learning_rate": 0.008949534409031304,
|
| 212 |
+
"loss": 0.2392,
|
| 213 |
+
"step": 725
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 0.8992805755395683,
|
| 217 |
+
"grad_norm": 0.04471327364444733,
|
| 218 |
+
"learning_rate": 0.008840171579576273,
|
| 219 |
+
"loss": 0.2447,
|
| 220 |
+
"step": 750
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 0.9292565947242206,
|
| 224 |
+
"grad_norm": 0.04158543795347214,
|
| 225 |
+
"learning_rate": 0.008726134248198781,
|
| 226 |
+
"loss": 0.2431,
|
| 227 |
+
"step": 775
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 0.9592326139088729,
|
| 231 |
+
"grad_norm": 0.03955095633864403,
|
| 232 |
+
"learning_rate": 0.008607561228416329,
|
| 233 |
+
"loss": 0.243,
|
| 234 |
+
"step": 800
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 0.9892086330935251,
|
| 238 |
+
"grad_norm": 0.039677463471889496,
|
| 239 |
+
"learning_rate": 0.008484596854875806,
|
| 240 |
+
"loss": 0.2379,
|
| 241 |
+
"step": 825
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"epoch": 1.0191846522781776,
|
| 245 |
+
"grad_norm": 0.04437502473592758,
|
| 246 |
+
"learning_rate": 0.00835739080766014,
|
| 247 |
+
"loss": 0.2221,
|
| 248 |
+
"step": 850
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"epoch": 1.0491606714628297,
|
| 252 |
+
"grad_norm": 0.04531238228082657,
|
| 253 |
+
"learning_rate": 0.008226097930088161,
|
| 254 |
+
"loss": 0.2113,
|
| 255 |
+
"step": 875
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"epoch": 1.079136690647482,
|
| 259 |
+
"grad_norm": 0.04641604423522949,
|
| 260 |
+
"learning_rate": 0.00809087804022943,
|
| 261 |
+
"loss": 0.2146,
|
| 262 |
+
"step": 900
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 1.1091127098321343,
|
| 266 |
+
"grad_norm": 0.0421479269862175,
|
| 267 |
+
"learning_rate": 0.007951895736363478,
|
| 268 |
+
"loss": 0.2209,
|
| 269 |
+
"step": 925
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"epoch": 1.1390887290167866,
|
| 273 |
+
"grad_norm": 0.036580126732587814,
|
| 274 |
+
"learning_rate": 0.007809320196620271,
|
| 275 |
+
"loss": 0.2105,
|
| 276 |
+
"step": 950
|
| 277 |
+
},
|
| 278 |
+
{
|
| 279 |
+
"epoch": 1.169064748201439,
|
| 280 |
+
"grad_norm": 0.03694167360663414,
|
| 281 |
+
"learning_rate": 0.0076633249730458175,
|
| 282 |
+
"loss": 0.2148,
|
| 283 |
+
"step": 975
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"epoch": 1.1990407673860912,
|
| 287 |
+
"grad_norm": 0.03922872990369797,
|
| 288 |
+
"learning_rate": 0.00751408778034351,
|
| 289 |
+
"loss": 0.2171,
|
| 290 |
+
"step": 1000
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 1.2290167865707433,
|
| 294 |
+
"grad_norm": 0.042761627584695816,
|
| 295 |
+
"learning_rate": 0.0073617902795484755,
|
| 296 |
+
"loss": 0.2194,
|
| 297 |
+
"step": 1025
|
| 298 |
+
},
|
| 299 |
+
{
|
| 300 |
+
"epoch": 1.2589928057553956,
|
| 301 |
+
"grad_norm": 0.03898875042796135,
|
| 302 |
+
"learning_rate": 0.007206617856898149,
|
| 303 |
+
"loss": 0.2102,
|
| 304 |
+
"step": 1050
|
| 305 |
+
},
|
| 306 |
+
{
|
| 307 |
+
"epoch": 1.288968824940048,
|
| 308 |
+
"grad_norm": 0.039882808923721313,
|
| 309 |
+
"learning_rate": 0.007048759398168324,
|
| 310 |
+
"loss": 0.2055,
|
| 311 |
+
"step": 1075
|
| 312 |
+
},
|
| 313 |
+
{
|
| 314 |
+
"epoch": 1.3189448441247003,
|
| 315 |
+
"grad_norm": 0.03663821145892143,
|
| 316 |
+
"learning_rate": 0.006888407058749331,
|
| 317 |
+
"loss": 0.2045,
|
| 318 |
+
"step": 1100
|
| 319 |
+
},
|
| 320 |
+
{
|
| 321 |
+
"epoch": 1.3489208633093526,
|
| 322 |
+
"grad_norm": 0.04057300463318825,
|
| 323 |
+
"learning_rate": 0.006725756029742234,
|
| 324 |
+
"loss": 0.2094,
|
| 325 |
+
"step": 1125
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"epoch": 1.3788968824940047,
|
| 329 |
+
"grad_norm": 0.03922704979777336,
|
| 330 |
+
"learning_rate": 0.006561004300359761,
|
| 331 |
+
"loss": 0.2031,
|
| 332 |
+
"step": 1150
|
| 333 |
+
},
|
| 334 |
+
{
|
| 335 |
+
"epoch": 1.4088729016786572,
|
| 336 |
+
"grad_norm": 0.03589184582233429,
|
| 337 |
+
"learning_rate": 0.0063943524169212005,
|
| 338 |
+
"loss": 0.2103,
|
| 339 |
+
"step": 1175
|
| 340 |
+
},
|
| 341 |
+
{
|
| 342 |
+
"epoch": 1.4388489208633093,
|
| 343 |
+
"grad_norm": 0.03473867475986481,
|
| 344 |
+
"learning_rate": 0.0062260032387346275,
|
| 345 |
+
"loss": 0.1965,
|
| 346 |
+
"step": 1200
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"epoch": 1.4688249400479616,
|
| 350 |
+
"grad_norm": 0.041303601115942,
|
| 351 |
+
"learning_rate": 0.006056161691163602,
|
| 352 |
+
"loss": 0.2037,
|
| 353 |
+
"step": 1225
|
| 354 |
+
},
|
| 355 |
+
{
|
| 356 |
+
"epoch": 1.498800959232614,
|
| 357 |
+
"grad_norm": 0.04580175131559372,
|
| 358 |
+
"learning_rate": 0.00588503451617894,
|
| 359 |
+
"loss": 0.2067,
|
| 360 |
+
"step": 1250
|
| 361 |
+
},
|
| 362 |
+
{
|
| 363 |
+
"epoch": 1.5287769784172662,
|
| 364 |
+
"grad_norm": 0.03737177699804306,
|
| 365 |
+
"learning_rate": 0.005712830020699193,
|
| 366 |
+
"loss": 0.2058,
|
| 367 |
+
"step": 1275
|
| 368 |
+
},
|
| 369 |
+
{
|
| 370 |
+
"epoch": 1.5587529976019185,
|
| 371 |
+
"grad_norm": 0.03482063487172127,
|
| 372 |
+
"learning_rate": 0.005539757823026172,
|
| 373 |
+
"loss": 0.1975,
|
| 374 |
+
"step": 1300
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"epoch": 1.5887290167865706,
|
| 378 |
+
"grad_norm": 0.04050496220588684,
|
| 379 |
+
"learning_rate": 0.005366028597684173,
|
| 380 |
+
"loss": 0.2029,
|
| 381 |
+
"step": 1325
|
| 382 |
+
},
|
| 383 |
+
{
|
| 384 |
+
"epoch": 1.6187050359712232,
|
| 385 |
+
"grad_norm": 0.031009122729301453,
|
| 386 |
+
"learning_rate": 0.005191853818973505,
|
| 387 |
+
"loss": 0.1978,
|
| 388 |
+
"step": 1350
|
| 389 |
+
},
|
| 390 |
+
{
|
| 391 |
+
"epoch": 1.6486810551558753,
|
| 392 |
+
"grad_norm": 0.03680524602532387,
|
| 393 |
+
"learning_rate": 0.005017445503550471,
|
| 394 |
+
"loss": 0.2042,
|
| 395 |
+
"step": 1375
|
| 396 |
+
},
|
| 397 |
+
{
|
| 398 |
+
"epoch": 1.6786570743405276,
|
| 399 |
+
"grad_norm": 0.03796203434467316,
|
| 400 |
+
"learning_rate": 0.004843015952347159,
|
| 401 |
+
"loss": 0.2002,
|
| 402 |
+
"step": 1400
|
| 403 |
+
},
|
| 404 |
+
{
|
| 405 |
+
"epoch": 1.70863309352518,
|
| 406 |
+
"grad_norm": 0.03277565911412239,
|
| 407 |
+
"learning_rate": 0.0046687774921452116,
|
| 408 |
+
"loss": 0.1982,
|
| 409 |
+
"step": 1425
|
| 410 |
+
},
|
| 411 |
+
{
|
| 412 |
+
"epoch": 1.738609112709832,
|
| 413 |
+
"grad_norm": 0.03661928325891495,
|
| 414 |
+
"learning_rate": 0.004494942217118105,
|
| 415 |
+
"loss": 0.2034,
|
| 416 |
+
"step": 1450
|
| 417 |
+
},
|
| 418 |
+
{
|
| 419 |
+
"epoch": 1.7685851318944845,
|
| 420 |
+
"grad_norm": 0.02970374934375286,
|
| 421 |
+
"learning_rate": 0.00432172173065659,
|
| 422 |
+
"loss": 0.1973,
|
| 423 |
+
"step": 1475
|
| 424 |
+
},
|
| 425 |
+
{
|
| 426 |
+
"epoch": 1.7985611510791366,
|
| 427 |
+
"grad_norm": 0.03302035480737686,
|
| 428 |
+
"learning_rate": 0.004149326887791542,
|
| 429 |
+
"loss": 0.1989,
|
| 430 |
+
"step": 1500
|
| 431 |
+
},
|
| 432 |
+
{
|
| 433 |
+
"epoch": 1.828537170263789,
|
| 434 |
+
"grad_norm": 0.03104039840400219,
|
| 435 |
+
"learning_rate": 0.003977967538527754,
|
| 436 |
+
"loss": 0.194,
|
| 437 |
+
"step": 1525
|
| 438 |
+
},
|
| 439 |
+
{
|
| 440 |
+
"epoch": 1.8585131894484412,
|
| 441 |
+
"grad_norm": 0.037008076906204224,
|
| 442 |
+
"learning_rate": 0.003807852272401132,
|
| 443 |
+
"loss": 0.2002,
|
| 444 |
+
"step": 1550
|
| 445 |
+
},
|
| 446 |
+
{
|
| 447 |
+
"epoch": 1.8884892086330936,
|
| 448 |
+
"grad_norm": 0.030725648626685143,
|
| 449 |
+
"learning_rate": 0.0036391881645701854,
|
| 450 |
+
"loss": 0.1992,
|
| 451 |
+
"step": 1575
|
| 452 |
+
},
|
| 453 |
+
{
|
| 454 |
+
"epoch": 1.9184652278177459,
|
| 455 |
+
"grad_norm": 0.03207903727889061,
|
| 456 |
+
"learning_rate": 0.003472180523750933,
|
| 457 |
+
"loss": 0.2046,
|
| 458 |
+
"step": 1600
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"epoch": 1.948441247002398,
|
| 462 |
+
"grad_norm": 0.0310438871383667,
|
| 463 |
+
"learning_rate": 0.0033070326423020407,
|
| 464 |
+
"loss": 0.1941,
|
| 465 |
+
"step": 1625
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"epoch": 1.9784172661870505,
|
| 469 |
+
"grad_norm": 0.032687630504369736,
|
| 470 |
+
"learning_rate": 0.0031439455487643707,
|
| 471 |
+
"loss": 0.1962,
|
| 472 |
+
"step": 1650
|
| 473 |
+
},
|
| 474 |
+
{
|
| 475 |
+
"epoch": 2.0083932853717026,
|
| 476 |
+
"grad_norm": 0.0336097776889801,
|
| 477 |
+
"learning_rate": 0.0029831177631562305,
|
| 478 |
+
"loss": 0.1852,
|
| 479 |
+
"step": 1675
|
| 480 |
+
},
|
| 481 |
+
{
|
| 482 |
+
"epoch": 2.038369304556355,
|
| 483 |
+
"grad_norm": 0.034838490188121796,
|
| 484 |
+
"learning_rate": 0.0028247450553221276,
|
| 485 |
+
"loss": 0.1593,
|
| 486 |
+
"step": 1700
|
| 487 |
+
},
|
| 488 |
+
{
|
| 489 |
+
"epoch": 2.068345323741007,
|
| 490 |
+
"grad_norm": 0.03779308870434761,
|
| 491 |
+
"learning_rate": 0.002669020206629217,
|
| 492 |
+
"loss": 0.1617,
|
| 493 |
+
"step": 1725
|
| 494 |
+
},
|
| 495 |
+
{
|
| 496 |
+
"epoch": 2.0983213429256593,
|
| 497 |
+
"grad_norm": 0.03595089539885521,
|
| 498 |
+
"learning_rate": 0.0025161327753015295,
|
| 499 |
+
"loss": 0.1602,
|
| 500 |
+
"step": 1750
|
| 501 |
+
},
|
| 502 |
+
{
|
| 503 |
+
"epoch": 2.128297362110312,
|
| 504 |
+
"grad_norm": 0.03848210349678993,
|
| 505 |
+
"learning_rate": 0.0023662688656775972,
|
| 506 |
+
"loss": 0.159,
|
| 507 |
+
"step": 1775
|
| 508 |
+
},
|
| 509 |
+
{
|
| 510 |
+
"epoch": 2.158273381294964,
|
| 511 |
+
"grad_norm": 0.04832014814019203,
|
| 512 |
+
"learning_rate": 0.0022196109016723708,
|
| 513 |
+
"loss": 0.1593,
|
| 514 |
+
"step": 1800
|
| 515 |
+
},
|
| 516 |
+
{
|
| 517 |
+
"epoch": 2.1882494004796165,
|
| 518 |
+
"grad_norm": 0.03901856020092964,
|
| 519 |
+
"learning_rate": 0.0020763374047192027,
|
| 520 |
+
"loss": 0.1641,
|
| 521 |
+
"step": 1825
|
| 522 |
+
},
|
| 523 |
+
{
|
| 524 |
+
"epoch": 2.2182254196642686,
|
| 525 |
+
"grad_norm": 0.03556321561336517,
|
| 526 |
+
"learning_rate": 0.001936622776462147,
|
| 527 |
+
"loss": 0.1573,
|
| 528 |
+
"step": 1850
|
| 529 |
+
},
|
| 530 |
+
{
|
| 531 |
+
"epoch": 2.2482014388489207,
|
| 532 |
+
"grad_norm": 0.033188410103321075,
|
| 533 |
+
"learning_rate": 0.0018006370864631644,
|
| 534 |
+
"loss": 0.1611,
|
| 535 |
+
"step": 1875
|
| 536 |
+
},
|
| 537 |
+
{
|
| 538 |
+
"epoch": 2.278177458033573,
|
| 539 |
+
"grad_norm": 0.0357639417052269,
|
| 540 |
+
"learning_rate": 0.0016685458651825892,
|
| 541 |
+
"loss": 0.1565,
|
| 542 |
+
"step": 1900
|
| 543 |
+
},
|
| 544 |
+
{
|
| 545 |
+
"epoch": 2.3081534772182253,
|
| 546 |
+
"grad_norm": 0.03594391047954559,
|
| 547 |
+
"learning_rate": 0.0015405099024848874,
|
| 548 |
+
"loss": 0.1629,
|
| 549 |
+
"step": 1925
|
| 550 |
+
},
|
| 551 |
+
{
|
| 552 |
+
"epoch": 2.338129496402878,
|
| 553 |
+
"grad_norm": 0.03396276384592056,
|
| 554 |
+
"learning_rate": 0.0014166850519149793,
|
| 555 |
+
"loss": 0.1566,
|
| 556 |
+
"step": 1950
|
| 557 |
+
},
|
| 558 |
+
{
|
| 559 |
+
"epoch": 2.36810551558753,
|
| 560 |
+
"grad_norm": 0.03651060536503792,
|
| 561 |
+
"learning_rate": 0.0012972220409833552,
|
| 562 |
+
"loss": 0.1587,
|
| 563 |
+
"step": 1975
|
| 564 |
+
},
|
| 565 |
+
{
|
| 566 |
+
"epoch": 2.3980815347721824,
|
| 567 |
+
"grad_norm": 0.03662113845348358,
|
| 568 |
+
"learning_rate": 0.001182266287690924,
|
| 569 |
+
"loss": 0.155,
|
| 570 |
+
"step": 2000
|
| 571 |
+
},
|
| 572 |
+
{
|
| 573 |
+
"epoch": 2.4280575539568345,
|
| 574 |
+
"grad_norm": 0.03555059805512428,
|
| 575 |
+
"learning_rate": 0.0010719577235169537,
|
| 576 |
+
"loss": 0.1587,
|
| 577 |
+
"step": 2025
|
| 578 |
+
},
|
| 579 |
+
{
|
| 580 |
+
"epoch": 2.4580335731414866,
|
| 581 |
+
"grad_norm": 0.03360700234770775,
|
| 582 |
+
"learning_rate": 0.0009664306230855341,
|
| 583 |
+
"loss": 0.1585,
|
| 584 |
+
"step": 2050
|
| 585 |
+
},
|
| 586 |
+
{
|
| 587 |
+
"epoch": 2.488009592326139,
|
| 588 |
+
"grad_norm": 0.033780504018068314,
|
| 589 |
+
"learning_rate": 0.0008658134407179418,
|
| 590 |
+
"loss": 0.1553,
|
| 591 |
+
"step": 2075
|
| 592 |
+
},
|
| 593 |
+
{
|
| 594 |
+
"epoch": 2.5179856115107913,
|
| 595 |
+
"grad_norm": 0.03432834520936012,
|
| 596 |
+
"learning_rate": 0.0007702286540698417,
|
| 597 |
+
"loss": 0.158,
|
| 598 |
+
"step": 2100
|
| 599 |
+
},
|
| 600 |
+
{
|
| 601 |
+
"epoch": 2.547961630695444,
|
| 602 |
+
"grad_norm": 0.035472046583890915,
|
| 603 |
+
"learning_rate": 0.0006797926150436618,
|
| 604 |
+
"loss": 0.1573,
|
| 605 |
+
"step": 2125
|
| 606 |
+
},
|
| 607 |
+
{
|
| 608 |
+
"epoch": 2.577937649880096,
|
| 609 |
+
"grad_norm": 0.03242143243551254,
|
| 610 |
+
"learning_rate": 0.000594615408157641,
|
| 611 |
+
"loss": 0.1537,
|
| 612 |
+
"step": 2150
|
| 613 |
+
},
|
| 614 |
+
{
|
| 615 |
+
"epoch": 2.6079136690647484,
|
| 616 |
+
"grad_norm": 0.03848033398389816,
|
| 617 |
+
"learning_rate": 0.0005148007165439233,
|
| 618 |
+
"loss": 0.1532,
|
| 619 |
+
"step": 2175
|
| 620 |
+
},
|
| 621 |
+
{
|
| 622 |
+
"epoch": 2.6378896882494005,
|
| 623 |
+
"grad_norm": 0.03433902934193611,
|
| 624 |
+
"learning_rate": 0.0004404456957388309,
|
| 625 |
+
"loss": 0.1529,
|
| 626 |
+
"step": 2200
|
| 627 |
+
},
|
| 628 |
+
{
|
| 629 |
+
"epoch": 2.6678657074340526,
|
| 630 |
+
"grad_norm": 0.031391434371471405,
|
| 631 |
+
"learning_rate": 0.00037164085541894934,
|
| 632 |
+
"loss": 0.1546,
|
| 633 |
+
"step": 2225
|
| 634 |
+
},
|
| 635 |
+
{
|
| 636 |
+
"epoch": 2.697841726618705,
|
| 637 |
+
"grad_norm": 0.04350714385509491,
|
| 638 |
+
"learning_rate": 0.000308469949226971,
|
| 639 |
+
"loss": 0.1584,
|
| 640 |
+
"step": 2250
|
| 641 |
+
},
|
| 642 |
+
{
|
| 643 |
+
"epoch": 2.7278177458033572,
|
| 644 |
+
"grad_norm": 0.03302815929055214,
|
| 645 |
+
"learning_rate": 0.0002510098728214133,
|
| 646 |
+
"loss": 0.1614,
|
| 647 |
+
"step": 2275
|
| 648 |
+
},
|
| 649 |
+
{
|
| 650 |
+
"epoch": 2.7577937649880093,
|
| 651 |
+
"grad_norm": 0.03371904045343399,
|
| 652 |
+
"learning_rate": 0.00019933057027432145,
|
| 653 |
+
"loss": 0.1517,
|
| 654 |
+
"step": 2300
|
| 655 |
+
},
|
| 656 |
+
{
|
| 657 |
+
"epoch": 2.787769784172662,
|
| 658 |
+
"grad_norm": 0.031399570405483246,
|
| 659 |
+
"learning_rate": 0.00015349494893087513,
|
| 660 |
+
"loss": 0.1496,
|
| 661 |
+
"step": 2325
|
| 662 |
+
},
|
| 663 |
+
{
|
| 664 |
+
"epoch": 2.8177458033573144,
|
| 665 |
+
"grad_norm": 0.03590654581785202,
|
| 666 |
+
"learning_rate": 0.00011355880283455522,
|
| 667 |
+
"loss": 0.1535,
|
| 668 |
+
"step": 2350
|
| 669 |
+
},
|
| 670 |
+
{
|
| 671 |
+
"epoch": 2.8477218225419665,
|
| 672 |
+
"grad_norm": 0.03726603463292122,
|
| 673 |
+
"learning_rate": 7.957074481107551e-05,
|
| 674 |
+
"loss": 0.1491,
|
| 675 |
+
"step": 2375
|
| 676 |
+
},
|
| 677 |
+
{
|
| 678 |
+
"epoch": 2.8776978417266186,
|
| 679 |
+
"grad_norm": 0.041672345250844955,
|
| 680 |
+
"learning_rate": 5.1572147293743046e-05,
|
| 681 |
+
"loss": 0.1537,
|
| 682 |
+
"step": 2400
|
| 683 |
+
},
|
| 684 |
+
{
|
| 685 |
+
"epoch": 2.907673860911271,
|
| 686 |
+
"grad_norm": 0.03395906835794449,
|
| 687 |
+
"learning_rate": 2.959709196229954e-05,
|
| 688 |
+
"loss": 0.1544,
|
| 689 |
+
"step": 2425
|
| 690 |
+
},
|
| 691 |
+
{
|
| 692 |
+
"epoch": 2.937649880095923,
|
| 693 |
+
"grad_norm": 0.030766665935516357,
|
| 694 |
+
"learning_rate": 1.3672328256518207e-05,
|
| 695 |
+
"loss": 0.1513,
|
| 696 |
+
"step": 2450
|
| 697 |
+
},
|
| 698 |
+
{
|
| 699 |
+
"epoch": 2.9676258992805753,
|
| 700 |
+
"grad_norm": 0.03894634544849396,
|
| 701 |
+
"learning_rate": 3.817240815084944e-06,
|
| 702 |
+
"loss": 0.1504,
|
| 703 |
+
"step": 2475
|
| 704 |
+
},
|
| 705 |
+
{
|
| 706 |
+
"epoch": 2.997601918465228,
|
| 707 |
+
"grad_norm": 0.03591805323958397,
|
| 708 |
+
"learning_rate": 4.382587937445948e-08,
|
| 709 |
+
"loss": 0.1516,
|
| 710 |
+
"step": 2500
|
| 711 |
+
},
|
| 712 |
+
{
|
| 713 |
+
"epoch": 3.0,
|
| 714 |
+
"step": 2502,
|
| 715 |
+
"total_flos": 2.43882352705536e+18,
|
| 716 |
+
"train_loss": 0.21604387271556733,
|
| 717 |
+
"train_runtime": 3302.176,
|
| 718 |
+
"train_samples_per_second": 36.34,
|
| 719 |
+
"train_steps_per_second": 0.758
|
| 720 |
+
}
|
| 721 |
+
],
|
| 722 |
+
"logging_steps": 25,
|
| 723 |
+
"max_steps": 2502,
|
| 724 |
+
"num_input_tokens_seen": 0,
|
| 725 |
+
"num_train_epochs": 3,
|
| 726 |
+
"save_steps": 0,
|
| 727 |
+
"stateful_callbacks": {
|
| 728 |
+
"TrainerControl": {
|
| 729 |
+
"args": {
|
| 730 |
+
"should_epoch_stop": false,
|
| 731 |
+
"should_evaluate": false,
|
| 732 |
+
"should_log": false,
|
| 733 |
+
"should_save": true,
|
| 734 |
+
"should_training_stop": true
|
| 735 |
+
},
|
| 736 |
+
"attributes": {}
|
| 737 |
+
}
|
| 738 |
+
},
|
| 739 |
+
"total_flos": 2.43882352705536e+18,
|
| 740 |
+
"train_batch_size": 48,
|
| 741 |
+
"trial_name": null,
|
| 742 |
+
"trial_params": null
|
| 743 |
+
}
|
nl_tasks/exps/run_ex34/gsm8k.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
gsm8k length==== 1319, gsm8k acc %====, 52.388172858225936
|
nl_tasks/exps/run_ex34/math.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
math length==== 5000, math acc %====, 7.84
|
nl_tasks/exps/run_ex34/trainer_state.json
ADDED
|
@@ -0,0 +1,743 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"best_global_step": null,
|
| 3 |
+
"best_metric": null,
|
| 4 |
+
"best_model_checkpoint": null,
|
| 5 |
+
"epoch": 3.0,
|
| 6 |
+
"eval_steps": 500,
|
| 7 |
+
"global_step": 2502,
|
| 8 |
+
"is_hyper_param_search": false,
|
| 9 |
+
"is_local_process_zero": true,
|
| 10 |
+
"is_world_process_zero": true,
|
| 11 |
+
"log_history": [
|
| 12 |
+
{
|
| 13 |
+
"epoch": 0.02997601918465228,
|
| 14 |
+
"grad_norm": 0.2532191574573517,
|
| 15 |
+
"learning_rate": 0.0019123505976095618,
|
| 16 |
+
"loss": 0.4625,
|
| 17 |
+
"step": 25
|
| 18 |
+
},
|
| 19 |
+
{
|
| 20 |
+
"epoch": 0.05995203836930456,
|
| 21 |
+
"grad_norm": 5.414860725402832,
|
| 22 |
+
"learning_rate": 0.003904382470119522,
|
| 23 |
+
"loss": 0.478,
|
| 24 |
+
"step": 50
|
| 25 |
+
},
|
| 26 |
+
{
|
| 27 |
+
"epoch": 0.08992805755395683,
|
| 28 |
+
"grad_norm": 0.6449404954910278,
|
| 29 |
+
"learning_rate": 0.005896414342629483,
|
| 30 |
+
"loss": 0.6016,
|
| 31 |
+
"step": 75
|
| 32 |
+
},
|
| 33 |
+
{
|
| 34 |
+
"epoch": 0.11990407673860912,
|
| 35 |
+
"grad_norm": 1.4888554811477661,
|
| 36 |
+
"learning_rate": 0.007888446215139441,
|
| 37 |
+
"loss": 0.4044,
|
| 38 |
+
"step": 100
|
| 39 |
+
},
|
| 40 |
+
{
|
| 41 |
+
"epoch": 0.1498800959232614,
|
| 42 |
+
"grad_norm": 0.23530787229537964,
|
| 43 |
+
"learning_rate": 0.009880478087649403,
|
| 44 |
+
"loss": 0.3748,
|
| 45 |
+
"step": 125
|
| 46 |
+
},
|
| 47 |
+
{
|
| 48 |
+
"epoch": 0.17985611510791366,
|
| 49 |
+
"grad_norm": 0.10670146346092224,
|
| 50 |
+
"learning_rate": 0.011872509960159363,
|
| 51 |
+
"loss": 0.3298,
|
| 52 |
+
"step": 150
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"epoch": 0.20983213429256595,
|
| 56 |
+
"grad_norm": 0.1089276447892189,
|
| 57 |
+
"learning_rate": 0.013864541832669323,
|
| 58 |
+
"loss": 0.3172,
|
| 59 |
+
"step": 175
|
| 60 |
+
},
|
| 61 |
+
{
|
| 62 |
+
"epoch": 0.23980815347721823,
|
| 63 |
+
"grad_norm": 0.09628577530384064,
|
| 64 |
+
"learning_rate": 0.015856573705179282,
|
| 65 |
+
"loss": 0.3173,
|
| 66 |
+
"step": 200
|
| 67 |
+
},
|
| 68 |
+
{
|
| 69 |
+
"epoch": 0.2697841726618705,
|
| 70 |
+
"grad_norm": 0.07994027435779572,
|
| 71 |
+
"learning_rate": 0.017848605577689244,
|
| 72 |
+
"loss": 0.3081,
|
| 73 |
+
"step": 225
|
| 74 |
+
},
|
| 75 |
+
{
|
| 76 |
+
"epoch": 0.2997601918465228,
|
| 77 |
+
"grad_norm": 0.06300719082355499,
|
| 78 |
+
"learning_rate": 0.019840637450199202,
|
| 79 |
+
"loss": 0.3046,
|
| 80 |
+
"step": 250
|
| 81 |
+
},
|
| 82 |
+
{
|
| 83 |
+
"epoch": 0.32973621103117506,
|
| 84 |
+
"grad_norm": 0.06899631768465042,
|
| 85 |
+
"learning_rate": 0.019994848459243056,
|
| 86 |
+
"loss": 0.2986,
|
| 87 |
+
"step": 275
|
| 88 |
+
},
|
| 89 |
+
{
|
| 90 |
+
"epoch": 0.3597122302158273,
|
| 91 |
+
"grad_norm": 0.08354010432958603,
|
| 92 |
+
"learning_rate": 0.019977569507449413,
|
| 93 |
+
"loss": 0.2875,
|
| 94 |
+
"step": 300
|
| 95 |
+
},
|
| 96 |
+
{
|
| 97 |
+
"epoch": 0.38968824940047964,
|
| 98 |
+
"grad_norm": 0.0717054083943367,
|
| 99 |
+
"learning_rate": 0.019948145221293085,
|
| 100 |
+
"loss": 0.2824,
|
| 101 |
+
"step": 325
|
| 102 |
+
},
|
| 103 |
+
{
|
| 104 |
+
"epoch": 0.4196642685851319,
|
| 105 |
+
"grad_norm": 0.04974915832281113,
|
| 106 |
+
"learning_rate": 0.019906611417893006,
|
| 107 |
+
"loss": 0.282,
|
| 108 |
+
"step": 350
|
| 109 |
+
},
|
| 110 |
+
{
|
| 111 |
+
"epoch": 0.44964028776978415,
|
| 112 |
+
"grad_norm": 0.0749383196234703,
|
| 113 |
+
"learning_rate": 0.01985301865484532,
|
| 114 |
+
"loss": 0.2738,
|
| 115 |
+
"step": 375
|
| 116 |
+
},
|
| 117 |
+
{
|
| 118 |
+
"epoch": 0.47961630695443647,
|
| 119 |
+
"grad_norm": 0.04245521500706673,
|
| 120 |
+
"learning_rate": 0.019787432168681444,
|
| 121 |
+
"loss": 0.2727,
|
| 122 |
+
"step": 400
|
| 123 |
+
},
|
| 124 |
+
{
|
| 125 |
+
"epoch": 0.5095923261390888,
|
| 126 |
+
"grad_norm": 0.04584546759724617,
|
| 127 |
+
"learning_rate": 0.019709931795458,
|
| 128 |
+
"loss": 0.2818,
|
| 129 |
+
"step": 425
|
| 130 |
+
},
|
| 131 |
+
{
|
| 132 |
+
"epoch": 0.539568345323741,
|
| 133 |
+
"grad_norm": 0.043414946645498276,
|
| 134 |
+
"learning_rate": 0.019620611873575267,
|
| 135 |
+
"loss": 0.2613,
|
| 136 |
+
"step": 450
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"epoch": 0.5695443645083933,
|
| 140 |
+
"grad_norm": 0.058648571372032166,
|
| 141 |
+
"learning_rate": 0.019519581128942465,
|
| 142 |
+
"loss": 0.2609,
|
| 143 |
+
"step": 475
|
| 144 |
+
},
|
| 145 |
+
{
|
| 146 |
+
"epoch": 0.5995203836930456,
|
| 147 |
+
"grad_norm": 0.04752543196082115,
|
| 148 |
+
"learning_rate": 0.019406962542629646,
|
| 149 |
+
"loss": 0.2677,
|
| 150 |
+
"step": 500
|
| 151 |
+
},
|
| 152 |
+
{
|
| 153 |
+
"epoch": 0.6294964028776978,
|
| 154 |
+
"grad_norm": 0.048361025750637054,
|
| 155 |
+
"learning_rate": 0.019282893201167266,
|
| 156 |
+
"loss": 0.2548,
|
| 157 |
+
"step": 525
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"epoch": 0.6594724220623501,
|
| 161 |
+
"grad_norm": 0.05573540925979614,
|
| 162 |
+
"learning_rate": 0.01914752412967573,
|
| 163 |
+
"loss": 0.2517,
|
| 164 |
+
"step": 550
|
| 165 |
+
},
|
| 166 |
+
{
|
| 167 |
+
"epoch": 0.6894484412470024,
|
| 168 |
+
"grad_norm": 0.03862696886062622,
|
| 169 |
+
"learning_rate": 0.019001020108027976,
|
| 170 |
+
"loss": 0.2583,
|
| 171 |
+
"step": 575
|
| 172 |
+
},
|
| 173 |
+
{
|
| 174 |
+
"epoch": 0.7194244604316546,
|
| 175 |
+
"grad_norm": 0.0389208160340786,
|
| 176 |
+
"learning_rate": 0.01884355947026889,
|
| 177 |
+
"loss": 0.253,
|
| 178 |
+
"step": 600
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"epoch": 0.749400479616307,
|
| 182 |
+
"grad_norm": 0.03260328620672226,
|
| 183 |
+
"learning_rate": 0.018675333887535724,
|
| 184 |
+
"loss": 0.2507,
|
| 185 |
+
"step": 625
|
| 186 |
+
},
|
| 187 |
+
{
|
| 188 |
+
"epoch": 0.7793764988009593,
|
| 189 |
+
"grad_norm": 0.038773685693740845,
|
| 190 |
+
"learning_rate": 0.01849654813474377,
|
| 191 |
+
"loss": 0.2481,
|
| 192 |
+
"step": 650
|
| 193 |
+
},
|
| 194 |
+
{
|
| 195 |
+
"epoch": 0.8093525179856115,
|
| 196 |
+
"grad_norm": 0.03589491918683052,
|
| 197 |
+
"learning_rate": 0.018307419841321244,
|
| 198 |
+
"loss": 0.2558,
|
| 199 |
+
"step": 675
|
| 200 |
+
},
|
| 201 |
+
{
|
| 202 |
+
"epoch": 0.8393285371702638,
|
| 203 |
+
"grad_norm": 0.040207505226135254,
|
| 204 |
+
"learning_rate": 0.018108179226296876,
|
| 205 |
+
"loss": 0.2432,
|
| 206 |
+
"step": 700
|
| 207 |
+
},
|
| 208 |
+
{
|
| 209 |
+
"epoch": 0.8693045563549161,
|
| 210 |
+
"grad_norm": 0.03414730727672577,
|
| 211 |
+
"learning_rate": 0.017899068818062608,
|
| 212 |
+
"loss": 0.2397,
|
| 213 |
+
"step": 725
|
| 214 |
+
},
|
| 215 |
+
{
|
| 216 |
+
"epoch": 0.8992805755395683,
|
| 217 |
+
"grad_norm": 0.0384533517062664,
|
| 218 |
+
"learning_rate": 0.017680343159152546,
|
| 219 |
+
"loss": 0.2469,
|
| 220 |
+
"step": 750
|
| 221 |
+
},
|
| 222 |
+
{
|
| 223 |
+
"epoch": 0.9292565947242206,
|
| 224 |
+
"grad_norm": 0.029624082148075104,
|
| 225 |
+
"learning_rate": 0.017452268496397562,
|
| 226 |
+
"loss": 0.2448,
|
| 227 |
+
"step": 775
|
| 228 |
+
},
|
| 229 |
+
{
|
| 230 |
+
"epoch": 0.9592326139088729,
|
| 231 |
+
"grad_norm": 0.028060954064130783,
|
| 232 |
+
"learning_rate": 0.017215122456832658,
|
| 233 |
+
"loss": 0.243,
|
| 234 |
+
"step": 800
|
| 235 |
+
},
|
| 236 |
+
{
|
| 237 |
+
"epoch": 0.9892086330935251,
|
| 238 |
+
"grad_norm": 0.033197712153196335,
|
| 239 |
+
"learning_rate": 0.016969193709751612,
|
| 240 |
+
"loss": 0.2393,
|
| 241 |
+
"step": 825
|
| 242 |
+
},
|
| 243 |
+
{
|
| 244 |
+
"epoch": 1.0191846522781776,
|
| 245 |
+
"grad_norm": 0.03793744370341301,
|
| 246 |
+
"learning_rate": 0.01671478161532028,
|
| 247 |
+
"loss": 0.2231,
|
| 248 |
+
"step": 850
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"epoch": 1.0491606714628297,
|
| 252 |
+
"grad_norm": 0.03373177349567413,
|
| 253 |
+
"learning_rate": 0.016452195860176322,
|
| 254 |
+
"loss": 0.2136,
|
| 255 |
+
"step": 875
|
| 256 |
+
},
|
| 257 |
+
{
|
| 258 |
+
"epoch": 1.079136690647482,
|
| 259 |
+
"grad_norm": 0.029341645538806915,
|
| 260 |
+
"learning_rate": 0.01618175608045886,
|
| 261 |
+
"loss": 0.2158,
|
| 262 |
+
"step": 900
|
| 263 |
+
},
|
| 264 |
+
{
|
| 265 |
+
"epoch": 1.1091127098321343,
|
| 266 |
+
"grad_norm": 0.03401786461472511,
|
| 267 |
+
"learning_rate": 0.015903791472726955,
|
| 268 |
+
"loss": 0.2223,
|
| 269 |
+
"step": 925
|
| 270 |
+
},
|
| 271 |
+
{
|
| 272 |
+
"epoch": 1.1390887290167866,
|
| 273 |
+
"grad_norm": 0.02708265371620655,
|
| 274 |
+
"learning_rate": 0.015618640393240542,
|
| 275 |
+
"loss": 0.2121,
|
| 276 |
+
"step": 950
|
| 277 |
+
},
|
| 278 |
+
{
|
| 279 |
+
"epoch": 1.169064748201439,
|
| 280 |
+
"grad_norm": 0.029649930074810982,
|
| 281 |
+
"learning_rate": 0.015326649946091635,
|
| 282 |
+
"loss": 0.2168,
|
| 283 |
+
"step": 975
|
| 284 |
+
},
|
| 285 |
+
{
|
| 286 |
+
"epoch": 1.1990407673860912,
|
| 287 |
+
"grad_norm": 0.033625248819589615,
|
| 288 |
+
"learning_rate": 0.01502817556068702,
|
| 289 |
+
"loss": 0.2184,
|
| 290 |
+
"step": 1000
|
| 291 |
+
},
|
| 292 |
+
{
|
| 293 |
+
"epoch": 1.2290167865707433,
|
| 294 |
+
"grad_norm": 0.036567322909832,
|
| 295 |
+
"learning_rate": 0.014723580559096951,
|
| 296 |
+
"loss": 0.2205,
|
| 297 |
+
"step": 1025
|
| 298 |
+
},
|
| 299 |
+
{
|
| 300 |
+
"epoch": 1.2589928057553956,
|
| 301 |
+
"grad_norm": 0.029653819277882576,
|
| 302 |
+
"learning_rate": 0.014413235713796298,
|
| 303 |
+
"loss": 0.2122,
|
| 304 |
+
"step": 1050
|
| 305 |
+
},
|
| 306 |
+
{
|
| 307 |
+
"epoch": 1.288968824940048,
|
| 308 |
+
"grad_norm": 0.0300185214728117,
|
| 309 |
+
"learning_rate": 0.014097518796336648,
|
| 310 |
+
"loss": 0.2058,
|
| 311 |
+
"step": 1075
|
| 312 |
+
},
|
| 313 |
+
{
|
| 314 |
+
"epoch": 1.3189448441247003,
|
| 315 |
+
"grad_norm": 0.027810001745820045,
|
| 316 |
+
"learning_rate": 0.013776814117498662,
|
| 317 |
+
"loss": 0.207,
|
| 318 |
+
"step": 1100
|
| 319 |
+
},
|
| 320 |
+
{
|
| 321 |
+
"epoch": 1.3489208633093526,
|
| 322 |
+
"grad_norm": 0.027427321299910545,
|
| 323 |
+
"learning_rate": 0.013451512059484468,
|
| 324 |
+
"loss": 0.2108,
|
| 325 |
+
"step": 1125
|
| 326 |
+
},
|
| 327 |
+
{
|
| 328 |
+
"epoch": 1.3788968824940047,
|
| 329 |
+
"grad_norm": 0.028375081717967987,
|
| 330 |
+
"learning_rate": 0.013122008600719522,
|
| 331 |
+
"loss": 0.2048,
|
| 332 |
+
"step": 1150
|
| 333 |
+
},
|
| 334 |
+
{
|
| 335 |
+
"epoch": 1.4088729016786572,
|
| 336 |
+
"grad_norm": 0.029488051310181618,
|
| 337 |
+
"learning_rate": 0.012788704833842401,
|
| 338 |
+
"loss": 0.2121,
|
| 339 |
+
"step": 1175
|
| 340 |
+
},
|
| 341 |
+
{
|
| 342 |
+
"epoch": 1.4388489208633093,
|
| 343 |
+
"grad_norm": 0.029031606391072273,
|
| 344 |
+
"learning_rate": 0.012452006477469255,
|
| 345 |
+
"loss": 0.1975,
|
| 346 |
+
"step": 1200
|
| 347 |
+
},
|
| 348 |
+
{
|
| 349 |
+
"epoch": 1.4688249400479616,
|
| 350 |
+
"grad_norm": 0.029272671788930893,
|
| 351 |
+
"learning_rate": 0.012112323382327204,
|
| 352 |
+
"loss": 0.2055,
|
| 353 |
+
"step": 1225
|
| 354 |
+
},
|
| 355 |
+
{
|
| 356 |
+
"epoch": 1.498800959232614,
|
| 357 |
+
"grad_norm": 0.027180878445506096,
|
| 358 |
+
"learning_rate": 0.01177006903235788,
|
| 359 |
+
"loss": 0.2082,
|
| 360 |
+
"step": 1250
|
| 361 |
+
},
|
| 362 |
+
{
|
| 363 |
+
"epoch": 1.5287769784172662,
|
| 364 |
+
"grad_norm": 0.02914930321276188,
|
| 365 |
+
"learning_rate": 0.011425660041398385,
|
| 366 |
+
"loss": 0.2067,
|
| 367 |
+
"step": 1275
|
| 368 |
+
},
|
| 369 |
+
{
|
| 370 |
+
"epoch": 1.5587529976019185,
|
| 371 |
+
"grad_norm": 0.024756262078881264,
|
| 372 |
+
"learning_rate": 0.011079515646052343,
|
| 373 |
+
"loss": 0.1988,
|
| 374 |
+
"step": 1300
|
| 375 |
+
},
|
| 376 |
+
{
|
| 377 |
+
"epoch": 1.5887290167865706,
|
| 378 |
+
"grad_norm": 0.0281531922519207,
|
| 379 |
+
"learning_rate": 0.010732057195368346,
|
| 380 |
+
"loss": 0.2043,
|
| 381 |
+
"step": 1325
|
| 382 |
+
},
|
| 383 |
+
{
|
| 384 |
+
"epoch": 1.6187050359712232,
|
| 385 |
+
"grad_norm": 0.02760651335120201,
|
| 386 |
+
"learning_rate": 0.01038370763794701,
|
| 387 |
+
"loss": 0.2008,
|
| 388 |
+
"step": 1350
|
| 389 |
+
},
|
| 390 |
+
{
|
| 391 |
+
"epoch": 1.6486810551558753,
|
| 392 |
+
"grad_norm": 0.026089100167155266,
|
| 393 |
+
"learning_rate": 0.010034891007100942,
|
| 394 |
+
"loss": 0.2065,
|
| 395 |
+
"step": 1375
|
| 396 |
+
},
|
| 397 |
+
{
|
| 398 |
+
"epoch": 1.6786570743405276,
|
| 399 |
+
"grad_norm": 0.030709104612469673,
|
| 400 |
+
"learning_rate": 0.009686031904694317,
|
| 401 |
+
"loss": 0.2022,
|
| 402 |
+
"step": 1400
|
| 403 |
+
},
|
| 404 |
+
{
|
| 405 |
+
"epoch": 1.70863309352518,
|
| 406 |
+
"grad_norm": 0.028219345957040787,
|
| 407 |
+
"learning_rate": 0.009337554984290423,
|
| 408 |
+
"loss": 0.1998,
|
| 409 |
+
"step": 1425
|
| 410 |
+
},
|
| 411 |
+
{
|
| 412 |
+
"epoch": 1.738609112709832,
|
| 413 |
+
"grad_norm": 0.02828747034072876,
|
| 414 |
+
"learning_rate": 0.00898988443423621,
|
| 415 |
+
"loss": 0.2048,
|
| 416 |
+
"step": 1450
|
| 417 |
+
},
|
| 418 |
+
{
|
| 419 |
+
"epoch": 1.7685851318944845,
|
| 420 |
+
"grad_norm": 0.02299325354397297,
|
| 421 |
+
"learning_rate": 0.00864344346131318,
|
| 422 |
+
"loss": 0.1983,
|
| 423 |
+
"step": 1475
|
| 424 |
+
},
|
| 425 |
+
{
|
| 426 |
+
"epoch": 1.7985611510791366,
|
| 427 |
+
"grad_norm": 0.02619764395058155,
|
| 428 |
+
"learning_rate": 0.008298653775583083,
|
| 429 |
+
"loss": 0.2013,
|
| 430 |
+
"step": 1500
|
| 431 |
+
},
|
| 432 |
+
{
|
| 433 |
+
"epoch": 1.828537170263789,
|
| 434 |
+
"grad_norm": 0.021840449422597885,
|
| 435 |
+
"learning_rate": 0.007955935077055509,
|
| 436 |
+
"loss": 0.1956,
|
| 437 |
+
"step": 1525
|
| 438 |
+
},
|
| 439 |
+
{
|
| 440 |
+
"epoch": 1.8585131894484412,
|
| 441 |
+
"grad_norm": 0.02665482647716999,
|
| 442 |
+
"learning_rate": 0.007615704544802264,
|
| 443 |
+
"loss": 0.2022,
|
| 444 |
+
"step": 1550
|
| 445 |
+
},
|
| 446 |
+
{
|
| 447 |
+
"epoch": 1.8884892086330936,
|
| 448 |
+
"grad_norm": 0.023482663556933403,
|
| 449 |
+
"learning_rate": 0.007278376329140371,
|
| 450 |
+
"loss": 0.2017,
|
| 451 |
+
"step": 1575
|
| 452 |
+
},
|
| 453 |
+
{
|
| 454 |
+
"epoch": 1.9184652278177459,
|
| 455 |
+
"grad_norm": 0.02401842176914215,
|
| 456 |
+
"learning_rate": 0.006944361047501866,
|
| 457 |
+
"loss": 0.2059,
|
| 458 |
+
"step": 1600
|
| 459 |
+
},
|
| 460 |
+
{
|
| 461 |
+
"epoch": 1.948441247002398,
|
| 462 |
+
"grad_norm": 0.022366248071193695,
|
| 463 |
+
"learning_rate": 0.006614065284604081,
|
| 464 |
+
"loss": 0.1954,
|
| 465 |
+
"step": 1625
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"epoch": 1.9784172661870505,
|
| 469 |
+
"grad_norm": 0.023342736065387726,
|
| 470 |
+
"learning_rate": 0.0062878910975287415,
|
| 471 |
+
"loss": 0.1973,
|
| 472 |
+
"step": 1650
|
| 473 |
+
},
|
| 474 |
+
{
|
| 475 |
+
"epoch": 2.0083932853717026,
|
| 476 |
+
"grad_norm": 0.02544998750090599,
|
| 477 |
+
"learning_rate": 0.005966235526312461,
|
| 478 |
+
"loss": 0.1869,
|
| 479 |
+
"step": 1675
|
| 480 |
+
},
|
| 481 |
+
{
|
| 482 |
+
"epoch": 2.038369304556355,
|
| 483 |
+
"grad_norm": 0.027767734602093697,
|
| 484 |
+
"learning_rate": 0.005649490110644255,
|
| 485 |
+
"loss": 0.1598,
|
| 486 |
+
"step": 1700
|
| 487 |
+
},
|
| 488 |
+
{
|
| 489 |
+
"epoch": 2.068345323741007,
|
| 490 |
+
"grad_norm": 0.0325852669775486,
|
| 491 |
+
"learning_rate": 0.005338040413258434,
|
| 492 |
+
"loss": 0.1628,
|
| 493 |
+
"step": 1725
|
| 494 |
+
},
|
| 495 |
+
{
|
| 496 |
+
"epoch": 2.0983213429256593,
|
| 497 |
+
"grad_norm": 0.027936723083257675,
|
| 498 |
+
"learning_rate": 0.005032265550603059,
|
| 499 |
+
"loss": 0.161,
|
| 500 |
+
"step": 1750
|
| 501 |
+
},
|
| 502 |
+
{
|
| 503 |
+
"epoch": 2.128297362110312,
|
| 504 |
+
"grad_norm": 0.027292126789689064,
|
| 505 |
+
"learning_rate": 0.0047325377313551945,
|
| 506 |
+
"loss": 0.1598,
|
| 507 |
+
"step": 1775
|
| 508 |
+
},
|
| 509 |
+
{
|
| 510 |
+
"epoch": 2.158273381294964,
|
| 511 |
+
"grad_norm": 0.030712289735674858,
|
| 512 |
+
"learning_rate": 0.0044392218033447416,
|
| 513 |
+
"loss": 0.1614,
|
| 514 |
+
"step": 1800
|
| 515 |
+
},
|
| 516 |
+
{
|
| 517 |
+
"epoch": 2.1882494004796165,
|
| 518 |
+
"grad_norm": 0.02516656182706356,
|
| 519 |
+
"learning_rate": 0.0041526748094384055,
|
| 520 |
+
"loss": 0.165,
|
| 521 |
+
"step": 1825
|
| 522 |
+
},
|
| 523 |
+
{
|
| 524 |
+
"epoch": 2.2182254196642686,
|
| 525 |
+
"grad_norm": 0.028255818411707878,
|
| 526 |
+
"learning_rate": 0.003873245552924294,
|
| 527 |
+
"loss": 0.1584,
|
| 528 |
+
"step": 1850
|
| 529 |
+
},
|
| 530 |
+
{
|
| 531 |
+
"epoch": 2.2482014388489207,
|
| 532 |
+
"grad_norm": 0.028185885399580002,
|
| 533 |
+
"learning_rate": 0.003601274172926329,
|
| 534 |
+
"loss": 0.1619,
|
| 535 |
+
"step": 1875
|
| 536 |
+
},
|
| 537 |
+
{
|
| 538 |
+
"epoch": 2.278177458033573,
|
| 539 |
+
"grad_norm": 0.029240386560559273,
|
| 540 |
+
"learning_rate": 0.0033370917303651784,
|
| 541 |
+
"loss": 0.1575,
|
| 542 |
+
"step": 1900
|
| 543 |
+
},
|
| 544 |
+
{
|
| 545 |
+
"epoch": 2.3081534772182253,
|
| 546 |
+
"grad_norm": 0.026783913373947144,
|
| 547 |
+
"learning_rate": 0.003081019804969775,
|
| 548 |
+
"loss": 0.1636,
|
| 549 |
+
"step": 1925
|
| 550 |
+
},
|
| 551 |
+
{
|
| 552 |
+
"epoch": 2.338129496402878,
|
| 553 |
+
"grad_norm": 0.026911884546279907,
|
| 554 |
+
"learning_rate": 0.0028333701038299585,
|
| 555 |
+
"loss": 0.1583,
|
| 556 |
+
"step": 1950
|
| 557 |
+
},
|
| 558 |
+
{
|
| 559 |
+
"epoch": 2.36810551558753,
|
| 560 |
+
"grad_norm": 0.02731228433549404,
|
| 561 |
+
"learning_rate": 0.0025944440819667103,
|
| 562 |
+
"loss": 0.1596,
|
| 563 |
+
"step": 1975
|
| 564 |
+
},
|
| 565 |
+
{
|
| 566 |
+
"epoch": 2.3980815347721824,
|
| 567 |
+
"grad_norm": 0.026634665206074715,
|
| 568 |
+
"learning_rate": 0.002364532575381848,
|
| 569 |
+
"loss": 0.1559,
|
| 570 |
+
"step": 2000
|
| 571 |
+
},
|
| 572 |
+
{
|
| 573 |
+
"epoch": 2.4280575539568345,
|
| 574 |
+
"grad_norm": 0.02517741546034813,
|
| 575 |
+
"learning_rate": 0.0021439154470339074,
|
| 576 |
+
"loss": 0.1598,
|
| 577 |
+
"step": 2025
|
| 578 |
+
},
|
| 579 |
+
{
|
| 580 |
+
"epoch": 2.4580335731414866,
|
| 581 |
+
"grad_norm": 0.028662823140621185,
|
| 582 |
+
"learning_rate": 0.0019328612461710682,
|
| 583 |
+
"loss": 0.1592,
|
| 584 |
+
"step": 2050
|
| 585 |
+
},
|
| 586 |
+
{
|
| 587 |
+
"epoch": 2.488009592326139,
|
| 588 |
+
"grad_norm": 0.024708494544029236,
|
| 589 |
+
"learning_rate": 0.0017316268814358837,
|
| 590 |
+
"loss": 0.1558,
|
| 591 |
+
"step": 2075
|
| 592 |
+
},
|
| 593 |
+
{
|
| 594 |
+
"epoch": 2.5179856115107913,
|
| 595 |
+
"grad_norm": 0.023627523332834244,
|
| 596 |
+
"learning_rate": 0.0015404573081396833,
|
| 597 |
+
"loss": 0.1581,
|
| 598 |
+
"step": 2100
|
| 599 |
+
},
|
| 600 |
+
{
|
| 601 |
+
"epoch": 2.547961630695444,
|
| 602 |
+
"grad_norm": 0.02505665458738804,
|
| 603 |
+
"learning_rate": 0.0013595852300873235,
|
| 604 |
+
"loss": 0.1578,
|
| 605 |
+
"step": 2125
|
| 606 |
+
},
|
| 607 |
+
{
|
| 608 |
+
"epoch": 2.577937649880096,
|
| 609 |
+
"grad_norm": 0.02366657927632332,
|
| 610 |
+
"learning_rate": 0.001189230816315282,
|
| 611 |
+
"loss": 0.1536,
|
| 612 |
+
"step": 2150
|
| 613 |
+
},
|
| 614 |
+
{
|
| 615 |
+
"epoch": 2.6079136690647484,
|
| 616 |
+
"grad_norm": 0.028983445838093758,
|
| 617 |
+
"learning_rate": 0.0010296014330878466,
|
| 618 |
+
"loss": 0.1528,
|
| 619 |
+
"step": 2175
|
| 620 |
+
},
|
| 621 |
+
{
|
| 622 |
+
"epoch": 2.6378896882494005,
|
| 623 |
+
"grad_norm": 0.02606895938515663,
|
| 624 |
+
"learning_rate": 0.0008808913914776618,
|
| 625 |
+
"loss": 0.1542,
|
| 626 |
+
"step": 2200
|
| 627 |
+
},
|
| 628 |
+
{
|
| 629 |
+
"epoch": 2.6678657074340526,
|
| 630 |
+
"grad_norm": 0.021730564534664154,
|
| 631 |
+
"learning_rate": 0.0007432817108378987,
|
| 632 |
+
"loss": 0.1558,
|
| 633 |
+
"step": 2225
|
| 634 |
+
},
|
| 635 |
+
{
|
| 636 |
+
"epoch": 2.697841726618705,
|
| 637 |
+
"grad_norm": 0.0258767269551754,
|
| 638 |
+
"learning_rate": 0.000616939898453942,
|
| 639 |
+
"loss": 0.1587,
|
| 640 |
+
"step": 2250
|
| 641 |
+
},
|
| 642 |
+
{
|
| 643 |
+
"epoch": 2.7278177458033572,
|
| 644 |
+
"grad_norm": 0.024898972362279892,
|
| 645 |
+
"learning_rate": 0.0005020197456428266,
|
| 646 |
+
"loss": 0.1613,
|
| 647 |
+
"step": 2275
|
| 648 |
+
},
|
| 649 |
+
{
|
| 650 |
+
"epoch": 2.7577937649880093,
|
| 651 |
+
"grad_norm": 0.026595328003168106,
|
| 652 |
+
"learning_rate": 0.0003986611405486429,
|
| 653 |
+
"loss": 0.1518,
|
| 654 |
+
"step": 2300
|
| 655 |
+
},
|
| 656 |
+
{
|
| 657 |
+
"epoch": 2.787769784172662,
|
| 658 |
+
"grad_norm": 0.021675392985343933,
|
| 659 |
+
"learning_rate": 0.00030698989786175025,
|
| 660 |
+
"loss": 0.1495,
|
| 661 |
+
"step": 2325
|
| 662 |
+
},
|
| 663 |
+
{
|
| 664 |
+
"epoch": 2.8177458033573144,
|
| 665 |
+
"grad_norm": 0.026996396481990814,
|
| 666 |
+
"learning_rate": 0.00022711760566911045,
|
| 667 |
+
"loss": 0.1539,
|
| 668 |
+
"step": 2350
|
| 669 |
+
},
|
| 670 |
+
{
|
| 671 |
+
"epoch": 2.8477218225419665,
|
| 672 |
+
"grad_norm": 0.02844955585896969,
|
| 673 |
+
"learning_rate": 0.00015914148962215102,
|
| 674 |
+
"loss": 0.1501,
|
| 675 |
+
"step": 2375
|
| 676 |
+
},
|
| 677 |
+
{
|
| 678 |
+
"epoch": 2.8776978417266186,
|
| 679 |
+
"grad_norm": 0.035385046154260635,
|
| 680 |
+
"learning_rate": 0.00010314429458748609,
|
| 681 |
+
"loss": 0.1533,
|
| 682 |
+
"step": 2400
|
| 683 |
+
},
|
| 684 |
+
{
|
| 685 |
+
"epoch": 2.907673860911271,
|
| 686 |
+
"grad_norm": 0.0226058941334486,
|
| 687 |
+
"learning_rate": 5.919418392459908e-05,
|
| 688 |
+
"loss": 0.1553,
|
| 689 |
+
"step": 2425
|
| 690 |
+
},
|
| 691 |
+
{
|
| 692 |
+
"epoch": 2.937649880095923,
|
| 693 |
+
"grad_norm": 0.021318677812814713,
|
| 694 |
+
"learning_rate": 2.7344656513036413e-05,
|
| 695 |
+
"loss": 0.1513,
|
| 696 |
+
"step": 2450
|
| 697 |
+
},
|
| 698 |
+
{
|
| 699 |
+
"epoch": 2.9676258992805753,
|
| 700 |
+
"grad_norm": 0.03294537961483002,
|
| 701 |
+
"learning_rate": 7.634481630169888e-06,
|
| 702 |
+
"loss": 0.1507,
|
| 703 |
+
"step": 2475
|
| 704 |
+
},
|
| 705 |
+
{
|
| 706 |
+
"epoch": 2.997601918465228,
|
| 707 |
+
"grad_norm": 0.027222590520977974,
|
| 708 |
+
"learning_rate": 8.765175874891896e-08,
|
| 709 |
+
"loss": 0.1521,
|
| 710 |
+
"step": 2500
|
| 711 |
+
},
|
| 712 |
+
{
|
| 713 |
+
"epoch": 3.0,
|
| 714 |
+
"step": 2502,
|
| 715 |
+
"total_flos": 2.43882352705536e+18,
|
| 716 |
+
"train_loss": 0.22085073801110403,
|
| 717 |
+
"train_runtime": 3314.1845,
|
| 718 |
+
"train_samples_per_second": 36.208,
|
| 719 |
+
"train_steps_per_second": 0.755
|
| 720 |
+
}
|
| 721 |
+
],
|
| 722 |
+
"logging_steps": 25,
|
| 723 |
+
"max_steps": 2502,
|
| 724 |
+
"num_input_tokens_seen": 0,
|
| 725 |
+
"num_train_epochs": 3,
|
| 726 |
+
"save_steps": 0,
|
| 727 |
+
"stateful_callbacks": {
|
| 728 |
+
"TrainerControl": {
|
| 729 |
+
"args": {
|
| 730 |
+
"should_epoch_stop": false,
|
| 731 |
+
"should_evaluate": false,
|
| 732 |
+
"should_log": false,
|
| 733 |
+
"should_save": true,
|
| 734 |
+
"should_training_stop": true
|
| 735 |
+
},
|
| 736 |
+
"attributes": {}
|
| 737 |
+
}
|
| 738 |
+
},
|
| 739 |
+
"total_flos": 2.43882352705536e+18,
|
| 740 |
+
"train_batch_size": 48,
|
| 741 |
+
"trial_name": null,
|
| 742 |
+
"trial_params": null
|
| 743 |
+
}
|