nvan15 commited on
Commit
4e49694
·
verified ·
1 Parent(s): f96f8e0

Batch upload part 14

Browse files
Files changed (50) hide show
  1. nl_tasks/exp395/run_ex01/ft/adapter_config.json +18 -0
  2. nl_tasks/exp395/run_ex01/ft/added_tokens.json +3 -0
  3. nl_tasks/exp395/run_ex01/ft/special_tokens_map.json +30 -0
  4. nl_tasks/exp395/run_ex01/ft/tokenizer.json +0 -0
  5. nl_tasks/exp395/run_ex01/ft/tokenizer.model +3 -0
  6. nl_tasks/exp395/run_ex01/ft/tokenizer_config.json +51 -0
  7. nl_tasks/exp395/run_ex01/ft2/adapter_config.json +18 -0
  8. nl_tasks/exp395/run_ex01/ft2/adapter_model.bin +3 -0
  9. nl_tasks/exp395/run_ex01/trainer_state.json +51 -0
  10. nl_tasks/exp395/run_ex02/ft/adapter_config.json +18 -0
  11. nl_tasks/exp395/run_ex02/ft/added_tokens.json +3 -0
  12. nl_tasks/exp395/run_ex02/ft/special_tokens_map.json +30 -0
  13. nl_tasks/exp395/run_ex02/ft/tokenizer.json +0 -0
  14. nl_tasks/exp395/run_ex02/ft/tokenizer.model +3 -0
  15. nl_tasks/exp395/run_ex02/ft/tokenizer_config.json +51 -0
  16. nl_tasks/exp395/run_ex02/ft/training_args.bin +3 -0
  17. nl_tasks/exp395/run_ex02/ft2/adapter_config.json +18 -0
  18. nl_tasks/exp395/run_ex02/ft2/adapter_model.bin +3 -0
  19. nl_tasks/exp395/run_ex02/trainer_state.json +1096 -0
  20. nl_tasks/exp395/run_ex03/ft/added_tokens.json +3 -0
  21. nl_tasks/exp395/run_ex03/ft/special_tokens_map.json +30 -0
  22. nl_tasks/exp395/run_ex03/ft/tokenizer_config.json +51 -0
  23. nl_tasks/exp395/run_ex03/trainer_state.json +1096 -0
  24. nl_tasks/exprep/run_ex30/ft/adapter_config.json +19 -0
  25. nl_tasks/exprep/run_ex30/ft/special_tokens_map.json +24 -0
  26. nl_tasks/exprep/run_ex30/ft/tokenizer.json +0 -0
  27. nl_tasks/exprep/run_ex30/ft/tokenizer.model +3 -0
  28. nl_tasks/exprep/run_ex30/ft/tokenizer_config.json +43 -0
  29. nl_tasks/exprep/run_ex30/ft2/adapter_config.json +19 -0
  30. nl_tasks/exprep/run_ex30/ft2/adapter_model.bin +3 -0
  31. nl_tasks/exprep/run_ex30/output.txt +4 -0
  32. nl_tasks/exprep/run_ex30/trainer_state.json +743 -0
  33. nl_tasks/exprep/run_ex31/ft/adapter_config.json +19 -0
  34. nl_tasks/exprep/run_ex31/ft/special_tokens_map.json +24 -0
  35. nl_tasks/exprep/run_ex31/ft/tokenizer.json +0 -0
  36. nl_tasks/exprep/run_ex31/ft/tokenizer.model +3 -0
  37. nl_tasks/exprep/run_ex31/ft/tokenizer_config.json +43 -0
  38. nl_tasks/exprep/run_ex31/ft2/adapter_config.json +19 -0
  39. nl_tasks/exprep/run_ex31/ft2/adapter_model.bin +3 -0
  40. nl_tasks/exprep/run_ex31/output.txt +4 -0
  41. nl_tasks/exprep/run_ex31/trainer_state.json +743 -0
  42. nl_tasks/exprep/run_ex32/ft/adapter_config.json +19 -0
  43. nl_tasks/exprep/run_ex32/ft/special_tokens_map.json +24 -0
  44. nl_tasks/exprep/run_ex32/ft/tokenizer.json +0 -0
  45. nl_tasks/exprep/run_ex32/ft/tokenizer.model +3 -0
  46. nl_tasks/exprep/run_ex32/ft/tokenizer_config.json +43 -0
  47. nl_tasks/exprep/run_ex32/ft2/adapter_config.json +19 -0
  48. nl_tasks/exprep/run_ex32/ft2/adapter_model.bin +3 -0
  49. nl_tasks/exprep/run_ex32/output.txt +4 -0
  50. nl_tasks/exprep/run_ex32/trainer_state.json +743 -0
nl_tasks/exp395/run_ex01/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": false,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 4,
9
+ "peft_type": "ROTATION",
10
+ "r": 4,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "v_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exp395/run_ex01/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[PAD]": 32000
3
+ }
nl_tasks/exp395/run_ex01/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
nl_tasks/exp395/run_ex01/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exp395/run_ex01/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exp395/run_ex01/ft/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "</s>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "legacy": false,
44
+ "model_max_length": 512,
45
+ "pad_token": "[PAD]",
46
+ "padding_side": "right",
47
+ "sp_model_kwargs": {},
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "</s>",
50
+ "use_default_system_prompt": false
51
+ }
nl_tasks/exp395/run_ex01/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": true,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "v_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exp395/run_ex01/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83dbd8594d960f636558be950fd24b87006e7f2ff55b025b4f823c7b85c0c0b1
3
+ size 33602659
nl_tasks/exp395/run_ex01/trainer_state.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 0.001705653021442495,
6
+ "eval_steps": 20,
7
+ "global_step": 21,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.0016244314489928524,
14
+ "eval_loss": 0.387570321559906,
15
+ "eval_runtime": 20.2586,
16
+ "eval_samples_per_second": 50.349,
17
+ "eval_steps_per_second": 0.79,
18
+ "step": 20
19
+ },
20
+ {
21
+ "epoch": 0.001705653021442495,
22
+ "step": 21,
23
+ "total_flos": 1.365742020722688e+16,
24
+ "train_loss": 0.48108872913178946,
25
+ "train_runtime": 89.7275,
26
+ "train_samples_per_second": 7.489,
27
+ "train_steps_per_second": 0.234
28
+ }
29
+ ],
30
+ "logging_steps": 200,
31
+ "max_steps": 21,
32
+ "num_input_tokens_seen": 0,
33
+ "num_train_epochs": 1,
34
+ "save_steps": 0,
35
+ "stateful_callbacks": {
36
+ "TrainerControl": {
37
+ "args": {
38
+ "should_epoch_stop": false,
39
+ "should_evaluate": false,
40
+ "should_log": false,
41
+ "should_save": true,
42
+ "should_training_stop": true
43
+ },
44
+ "attributes": {}
45
+ }
46
+ },
47
+ "total_flos": 1.365742020722688e+16,
48
+ "train_batch_size": 32,
49
+ "trial_name": null,
50
+ "trial_params": null
51
+ }
nl_tasks/exp395/run_ex02/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": false,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "v_proj",
14
+ "q_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exp395/run_ex02/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[PAD]": 32000
3
+ }
nl_tasks/exp395/run_ex02/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
nl_tasks/exp395/run_ex02/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exp395/run_ex02/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exp395/run_ex02/ft/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "</s>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "legacy": false,
44
+ "model_max_length": 512,
45
+ "pad_token": "[PAD]",
46
+ "padding_side": "right",
47
+ "sp_model_kwargs": {},
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "</s>",
50
+ "use_default_system_prompt": false
51
+ }
nl_tasks/exp395/run_ex02/ft/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31a6b842306cff1ec6938e7717d95de0854d28753f65619459b0b5a44deaf599
3
+ size 6545
nl_tasks/exp395/run_ex02/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": true,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "v_proj",
14
+ "q_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exp395/run_ex02/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:36af600a41199137954b7e5e610dc8b38d517bf76d32a3634572733e31415150
3
+ size 33602915
nl_tasks/exp395/run_ex02/trainer_state.json ADDED
@@ -0,0 +1,1096 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 1000,
7
+ "global_step": 24624,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016244314489928524,
14
+ "grad_norm": 0.25480222702026367,
15
+ "learning_rate": 0.00402834008097166,
16
+ "loss": 0.4081,
17
+ "step": 200
18
+ },
19
+ {
20
+ "epoch": 0.03248862897985705,
21
+ "grad_norm": 0.10468661785125732,
22
+ "learning_rate": 0.004999520352117078,
23
+ "loss": 0.3127,
24
+ "step": 400
25
+ },
26
+ {
27
+ "epoch": 0.04873294346978557,
28
+ "grad_norm": 0.08727595955133438,
29
+ "learning_rate": 0.004997428064390424,
30
+ "loss": 0.2765,
31
+ "step": 600
32
+ },
33
+ {
34
+ "epoch": 0.0649772579597141,
35
+ "grad_norm": 0.07492764294147491,
36
+ "learning_rate": 0.004993676693462857,
37
+ "loss": 0.2623,
38
+ "step": 800
39
+ },
40
+ {
41
+ "epoch": 0.08122157244964262,
42
+ "grad_norm": 0.06994384527206421,
43
+ "learning_rate": 0.004988268731432779,
44
+ "loss": 0.2543,
45
+ "step": 1000
46
+ },
47
+ {
48
+ "epoch": 0.08122157244964262,
49
+ "eval_loss": 0.24824216961860657,
50
+ "eval_runtime": 41.0298,
51
+ "eval_samples_per_second": 24.86,
52
+ "eval_steps_per_second": 0.39,
53
+ "step": 1000
54
+ },
55
+ {
56
+ "epoch": 0.09746588693957114,
57
+ "grad_norm": 0.057777680456638336,
58
+ "learning_rate": 0.004981207770899744,
59
+ "loss": 0.2449,
60
+ "step": 1200
61
+ },
62
+ {
63
+ "epoch": 0.11371020142949967,
64
+ "grad_norm": 0.055028438568115234,
65
+ "learning_rate": 0.00497249850257784,
66
+ "loss": 0.2379,
67
+ "step": 1400
68
+ },
69
+ {
70
+ "epoch": 0.1299545159194282,
71
+ "grad_norm": 0.04510676488280296,
72
+ "learning_rate": 0.004962146712179566,
73
+ "loss": 0.2396,
74
+ "step": 1600
75
+ },
76
+ {
77
+ "epoch": 0.14619883040935672,
78
+ "grad_norm": 0.04368717595934868,
79
+ "learning_rate": 0.004950159276572285,
80
+ "loss": 0.2315,
81
+ "step": 1800
82
+ },
83
+ {
84
+ "epoch": 0.16244314489928524,
85
+ "grad_norm": 0.036148250102996826,
86
+ "learning_rate": 0.004936544159209808,
87
+ "loss": 0.2306,
88
+ "step": 2000
89
+ },
90
+ {
91
+ "epoch": 0.16244314489928524,
92
+ "eval_loss": 0.22918154299259186,
93
+ "eval_runtime": 40.8654,
94
+ "eval_samples_per_second": 24.96,
95
+ "eval_steps_per_second": 0.392,
96
+ "step": 2000
97
+ },
98
+ {
99
+ "epoch": 0.17868745938921377,
100
+ "grad_norm": 0.040075842291116714,
101
+ "learning_rate": 0.004921310404842139,
102
+ "loss": 0.2265,
103
+ "step": 2200
104
+ },
105
+ {
106
+ "epoch": 0.1949317738791423,
107
+ "grad_norm": 0.04165196418762207,
108
+ "learning_rate": 0.004904468133506895,
109
+ "loss": 0.2265,
110
+ "step": 2400
111
+ },
112
+ {
113
+ "epoch": 0.2111760883690708,
114
+ "grad_norm": 0.035210952162742615,
115
+ "learning_rate": 0.004886028533806398,
116
+ "loss": 0.2226,
117
+ "step": 2600
118
+ },
119
+ {
120
+ "epoch": 0.22742040285899934,
121
+ "grad_norm": 0.03285546973347664,
122
+ "learning_rate": 0.004866003855474896,
123
+ "loss": 0.2225,
124
+ "step": 2800
125
+ },
126
+ {
127
+ "epoch": 0.24366471734892786,
128
+ "grad_norm": 0.030363937839865685,
129
+ "learning_rate": 0.004844407401240862,
130
+ "loss": 0.2183,
131
+ "step": 3000
132
+ },
133
+ {
134
+ "epoch": 0.24366471734892786,
135
+ "eval_loss": 0.220821812748909,
136
+ "eval_runtime": 19.6622,
137
+ "eval_samples_per_second": 51.876,
138
+ "eval_steps_per_second": 0.814,
139
+ "step": 3000
140
+ },
141
+ {
142
+ "epoch": 0.2599090318388564,
143
+ "grad_norm": 0.03195687755942345,
144
+ "learning_rate": 0.004821253517989772,
145
+ "loss": 0.2193,
146
+ "step": 3200
147
+ },
148
+ {
149
+ "epoch": 0.2761533463287849,
150
+ "grad_norm": 0.027379613369703293,
151
+ "learning_rate": 0.004796557587233222,
152
+ "loss": 0.2179,
153
+ "step": 3400
154
+ },
155
+ {
156
+ "epoch": 0.29239766081871343,
157
+ "grad_norm": 0.023950930684804916,
158
+ "learning_rate": 0.004770336014890741,
159
+ "loss": 0.2164,
160
+ "step": 3600
161
+ },
162
+ {
163
+ "epoch": 0.30864197530864196,
164
+ "grad_norm": 0.03380575031042099,
165
+ "learning_rate": 0.004742606220391057,
166
+ "loss": 0.2128,
167
+ "step": 3800
168
+ },
169
+ {
170
+ "epoch": 0.3248862897985705,
171
+ "grad_norm": 0.026672247797250748,
172
+ "learning_rate": 0.004713386625100085,
173
+ "loss": 0.2141,
174
+ "step": 4000
175
+ },
176
+ {
177
+ "epoch": 0.3248862897985705,
178
+ "eval_loss": 0.21518075466156006,
179
+ "eval_runtime": 19.7429,
180
+ "eval_samples_per_second": 51.664,
181
+ "eval_steps_per_second": 0.81,
182
+ "step": 4000
183
+ },
184
+ {
185
+ "epoch": 0.341130604288499,
186
+ "grad_norm": 0.033207934349775314,
187
+ "learning_rate": 0.004682696640083304,
188
+ "loss": 0.2112,
189
+ "step": 4200
190
+ },
191
+ {
192
+ "epoch": 0.35737491877842753,
193
+ "grad_norm": 0.02236669510602951,
194
+ "learning_rate": 0.0046505566532106584,
195
+ "loss": 0.2132,
196
+ "step": 4400
197
+ },
198
+ {
199
+ "epoch": 0.37361923326835605,
200
+ "grad_norm": 0.022743066772818565,
201
+ "learning_rate": 0.004616988015612562,
202
+ "loss": 0.2098,
203
+ "step": 4600
204
+ },
205
+ {
206
+ "epoch": 0.3898635477582846,
207
+ "grad_norm": 0.027080588042736053,
208
+ "learning_rate": 0.0045820130274959806,
209
+ "loss": 0.2078,
210
+ "step": 4800
211
+ },
212
+ {
213
+ "epoch": 0.4061078622482131,
214
+ "grad_norm": 0.02445564605295658,
215
+ "learning_rate": 0.00454565492333003,
216
+ "loss": 0.2091,
217
+ "step": 5000
218
+ },
219
+ {
220
+ "epoch": 0.4061078622482131,
221
+ "eval_loss": 0.21092304587364197,
222
+ "eval_runtime": 19.7299,
223
+ "eval_samples_per_second": 51.698,
224
+ "eval_steps_per_second": 0.811,
225
+ "step": 5000
226
+ },
227
+ {
228
+ "epoch": 0.4223521767381416,
229
+ "grad_norm": 0.021935012191534042,
230
+ "learning_rate": 0.004507937856410937,
231
+ "loss": 0.2089,
232
+ "step": 5200
233
+ },
234
+ {
235
+ "epoch": 0.43859649122807015,
236
+ "grad_norm": 0.02321953885257244,
237
+ "learning_rate": 0.004468886882816593,
238
+ "loss": 0.2027,
239
+ "step": 5400
240
+ },
241
+ {
242
+ "epoch": 0.4548408057179987,
243
+ "grad_norm": 0.02069736272096634,
244
+ "learning_rate": 0.004428527944761389,
245
+ "loss": 0.2045,
246
+ "step": 5600
247
+ },
248
+ {
249
+ "epoch": 0.4710851202079272,
250
+ "grad_norm": 0.02061382122337818,
251
+ "learning_rate": 0.004386887853362363,
252
+ "loss": 0.2023,
253
+ "step": 5800
254
+ },
255
+ {
256
+ "epoch": 0.4873294346978557,
257
+ "grad_norm": 0.02192777395248413,
258
+ "learning_rate": 0.004343994270828135,
259
+ "loss": 0.2026,
260
+ "step": 6000
261
+ },
262
+ {
263
+ "epoch": 0.4873294346978557,
264
+ "eval_loss": 0.20650118589401245,
265
+ "eval_runtime": 19.6917,
266
+ "eval_samples_per_second": 51.798,
267
+ "eval_steps_per_second": 0.813,
268
+ "step": 6000
269
+ },
270
+ {
271
+ "epoch": 0.5035737491877843,
272
+ "grad_norm": 0.020649125799536705,
273
+ "learning_rate": 0.004299875692082432,
274
+ "loss": 0.205,
275
+ "step": 6200
276
+ },
277
+ {
278
+ "epoch": 0.5198180636777128,
279
+ "grad_norm": 0.018906056880950928,
280
+ "learning_rate": 0.004254561425834437,
281
+ "loss": 0.2017,
282
+ "step": 6400
283
+ },
284
+ {
285
+ "epoch": 0.5360623781676414,
286
+ "grad_norm": 0.019335156306624413,
287
+ "learning_rate": 0.004208081575108523,
288
+ "loss": 0.1989,
289
+ "step": 6600
290
+ },
291
+ {
292
+ "epoch": 0.5523066926575698,
293
+ "grad_norm": 0.01864049769937992,
294
+ "learning_rate": 0.004160467017246312,
295
+ "loss": 0.2008,
296
+ "step": 6800
297
+ },
298
+ {
299
+ "epoch": 0.5685510071474984,
300
+ "grad_norm": 0.01772213727235794,
301
+ "learning_rate": 0.004111749383394341,
302
+ "loss": 0.1977,
303
+ "step": 7000
304
+ },
305
+ {
306
+ "epoch": 0.5685510071474984,
307
+ "eval_loss": 0.2017340511083603,
308
+ "eval_runtime": 19.6881,
309
+ "eval_samples_per_second": 51.808,
310
+ "eval_steps_per_second": 0.813,
311
+ "step": 7000
312
+ },
313
+ {
314
+ "epoch": 0.5847953216374269,
315
+ "grad_norm": 0.014698835089802742,
316
+ "learning_rate": 0.00406196103749096,
317
+ "loss": 0.1976,
318
+ "step": 7200
319
+ },
320
+ {
321
+ "epoch": 0.6010396361273554,
322
+ "grad_norm": 0.015146933495998383,
323
+ "learning_rate": 0.004011135054766431,
324
+ "loss": 0.1987,
325
+ "step": 7400
326
+ },
327
+ {
328
+ "epoch": 0.6172839506172839,
329
+ "grad_norm": 0.016058743000030518,
330
+ "learning_rate": 0.003959305199770494,
331
+ "loss": 0.1975,
332
+ "step": 7600
333
+ },
334
+ {
335
+ "epoch": 0.6335282651072125,
336
+ "grad_norm": 0.019068371504545212,
337
+ "learning_rate": 0.003906505903942021,
338
+ "loss": 0.1976,
339
+ "step": 7800
340
+ },
341
+ {
342
+ "epoch": 0.649772579597141,
343
+ "grad_norm": 0.01624867506325245,
344
+ "learning_rate": 0.0038527722427356276,
345
+ "loss": 0.1955,
346
+ "step": 8000
347
+ },
348
+ {
349
+ "epoch": 0.649772579597141,
350
+ "eval_loss": 0.19978894293308258,
351
+ "eval_runtime": 19.6705,
352
+ "eval_samples_per_second": 51.854,
353
+ "eval_steps_per_second": 0.813,
354
+ "step": 8000
355
+ },
356
+ {
357
+ "epoch": 0.6660168940870695,
358
+ "grad_norm": 0.015397582203149796,
359
+ "learning_rate": 0.0037981399123204643,
360
+ "loss": 0.1947,
361
+ "step": 8200
362
+ },
363
+ {
364
+ "epoch": 0.682261208576998,
365
+ "grad_norm": 0.015843288972973824,
366
+ "learning_rate": 0.0037426452058666577,
367
+ "loss": 0.1954,
368
+ "step": 8400
369
+ },
370
+ {
371
+ "epoch": 0.6985055230669266,
372
+ "grad_norm": 0.015419911593198776,
373
+ "learning_rate": 0.003686324989435149,
374
+ "loss": 0.1958,
375
+ "step": 8600
376
+ },
377
+ {
378
+ "epoch": 0.7147498375568551,
379
+ "grad_norm": 0.01585761457681656,
380
+ "learning_rate": 0.003629216677486953,
381
+ "loss": 0.1909,
382
+ "step": 8800
383
+ },
384
+ {
385
+ "epoch": 0.7309941520467836,
386
+ "grad_norm": 0.017794450744986534,
387
+ "learning_rate": 0.0035713582080281066,
388
+ "loss": 0.1919,
389
+ "step": 9000
390
+ },
391
+ {
392
+ "epoch": 0.7309941520467836,
393
+ "eval_loss": 0.19633081555366516,
394
+ "eval_runtime": 19.7217,
395
+ "eval_samples_per_second": 51.72,
396
+ "eval_steps_per_second": 0.811,
397
+ "step": 9000
398
+ },
399
+ {
400
+ "epoch": 0.7472384665367121,
401
+ "grad_norm": 0.016265571117401123,
402
+ "learning_rate": 0.00351278801740682,
403
+ "loss": 0.1922,
404
+ "step": 9200
405
+ },
406
+ {
407
+ "epoch": 0.7634827810266407,
408
+ "grad_norm": 0.015308309346437454,
409
+ "learning_rate": 0.003453545014779565,
410
+ "loss": 0.1926,
411
+ "step": 9400
412
+ },
413
+ {
414
+ "epoch": 0.7797270955165692,
415
+ "grad_norm": 0.016437450423836708,
416
+ "learning_rate": 0.003393668556263073,
417
+ "loss": 0.1889,
418
+ "step": 9600
419
+ },
420
+ {
421
+ "epoch": 0.7959714100064977,
422
+ "grad_norm": 0.01566031202673912,
423
+ "learning_rate": 0.0033331984187894076,
424
+ "loss": 0.1869,
425
+ "step": 9800
426
+ },
427
+ {
428
+ "epoch": 0.8122157244964262,
429
+ "grad_norm": 0.01510651409626007,
430
+ "learning_rate": 0.0032721747736814888,
431
+ "loss": 0.19,
432
+ "step": 10000
433
+ },
434
+ {
435
+ "epoch": 0.8122157244964262,
436
+ "eval_loss": 0.19334229826927185,
437
+ "eval_runtime": 19.6674,
438
+ "eval_samples_per_second": 51.862,
439
+ "eval_steps_per_second": 0.814,
440
+ "step": 10000
441
+ },
442
+ {
443
+ "epoch": 0.8284600389863548,
444
+ "grad_norm": 0.015091422945261002,
445
+ "learning_rate": 0.0032106381599666055,
446
+ "loss": 0.1901,
447
+ "step": 10200
448
+ },
449
+ {
450
+ "epoch": 0.8447043534762833,
451
+ "grad_norm": 0.013704156503081322,
452
+ "learning_rate": 0.0031486294574456664,
453
+ "loss": 0.1917,
454
+ "step": 10400
455
+ },
456
+ {
457
+ "epoch": 0.8609486679662118,
458
+ "grad_norm": 0.014794130809605122,
459
+ "learning_rate": 0.0030861898595360635,
460
+ "loss": 0.1898,
461
+ "step": 10600
462
+ },
463
+ {
464
+ "epoch": 0.8771929824561403,
465
+ "grad_norm": 0.016289081424474716,
466
+ "learning_rate": 0.0030233608459061968,
467
+ "loss": 0.187,
468
+ "step": 10800
469
+ },
470
+ {
471
+ "epoch": 0.8934372969460689,
472
+ "grad_norm": 0.014730295166373253,
473
+ "learning_rate": 0.0029601841549198368,
474
+ "loss": 0.1869,
475
+ "step": 11000
476
+ },
477
+ {
478
+ "epoch": 0.8934372969460689,
479
+ "eval_loss": 0.18983475863933563,
480
+ "eval_runtime": 19.7052,
481
+ "eval_samples_per_second": 51.763,
482
+ "eval_steps_per_second": 0.812,
483
+ "step": 11000
484
+ },
485
+ {
486
+ "epoch": 0.9096816114359974,
487
+ "grad_norm": 0.015831120312213898,
488
+ "learning_rate": 0.0028967017559086332,
489
+ "loss": 0.1846,
490
+ "step": 11200
491
+ },
492
+ {
493
+ "epoch": 0.9259259259259259,
494
+ "grad_norm": 0.013883775100111961,
495
+ "learning_rate": 0.002832955821291186,
496
+ "loss": 0.184,
497
+ "step": 11400
498
+ },
499
+ {
500
+ "epoch": 0.9421702404158544,
501
+ "grad_norm": 0.013128319755196571,
502
+ "learning_rate": 0.0027689886985572012,
503
+ "loss": 0.1875,
504
+ "step": 11600
505
+ },
506
+ {
507
+ "epoch": 0.958414554905783,
508
+ "grad_norm": 0.013796934857964516,
509
+ "learning_rate": 0.0027048428821353477,
510
+ "loss": 0.185,
511
+ "step": 11800
512
+ },
513
+ {
514
+ "epoch": 0.9746588693957114,
515
+ "grad_norm": 0.013713874854147434,
516
+ "learning_rate": 0.0026405609851634987,
517
+ "loss": 0.1838,
518
+ "step": 12000
519
+ },
520
+ {
521
+ "epoch": 0.9746588693957114,
522
+ "eval_loss": 0.18737617135047913,
523
+ "eval_runtime": 19.7078,
524
+ "eval_samples_per_second": 51.756,
525
+ "eval_steps_per_second": 0.812,
526
+ "step": 12000
527
+ },
528
+ {
529
+ "epoch": 0.99090318388564,
530
+ "grad_norm": 0.016229931265115738,
531
+ "learning_rate": 0.00257618571118011,
532
+ "loss": 0.184,
533
+ "step": 12200
534
+ },
535
+ {
536
+ "epoch": 1.0071474983755686,
537
+ "grad_norm": 0.014318772591650486,
538
+ "learning_rate": 0.0025117598257555457,
539
+ "loss": 0.1786,
540
+ "step": 12400
541
+ },
542
+ {
543
+ "epoch": 1.023391812865497,
544
+ "grad_norm": 0.014333798550069332,
545
+ "learning_rate": 0.0024473261280821995,
546
+ "loss": 0.1718,
547
+ "step": 12600
548
+ },
549
+ {
550
+ "epoch": 1.0396361273554255,
551
+ "grad_norm": 0.015595314092934132,
552
+ "learning_rate": 0.0023829274225422685,
553
+ "loss": 0.1708,
554
+ "step": 12800
555
+ },
556
+ {
557
+ "epoch": 1.0558804418453542,
558
+ "grad_norm": 0.01504795253276825,
559
+ "learning_rate": 0.002318606490272094,
560
+ "loss": 0.1694,
561
+ "step": 13000
562
+ },
563
+ {
564
+ "epoch": 1.0558804418453542,
565
+ "eval_loss": 0.18466883897781372,
566
+ "eval_runtime": 19.6551,
567
+ "eval_samples_per_second": 51.895,
568
+ "eval_steps_per_second": 0.814,
569
+ "step": 13000
570
+ },
571
+ {
572
+ "epoch": 1.0721247563352827,
573
+ "grad_norm": 0.013813730329275131,
574
+ "learning_rate": 0.002254406060741932,
575
+ "loss": 0.171,
576
+ "step": 13200
577
+ },
578
+ {
579
+ "epoch": 1.0883690708252112,
580
+ "grad_norm": 0.013750949874520302,
581
+ "learning_rate": 0.0021903687833700566,
582
+ "loss": 0.1729,
583
+ "step": 13400
584
+ },
585
+ {
586
+ "epoch": 1.1046133853151396,
587
+ "grad_norm": 0.012803665362298489,
588
+ "learning_rate": 0.002126537199190034,
589
+ "loss": 0.1696,
590
+ "step": 13600
591
+ },
592
+ {
593
+ "epoch": 1.120857699805068,
594
+ "grad_norm": 0.01384389866143465,
595
+ "learning_rate": 0.002062953712590007,
596
+ "loss": 0.1701,
597
+ "step": 13800
598
+ },
599
+ {
600
+ "epoch": 1.1371020142949968,
601
+ "grad_norm": 0.01575305685400963,
602
+ "learning_rate": 0.0019996605631427515,
603
+ "loss": 0.1712,
604
+ "step": 14000
605
+ },
606
+ {
607
+ "epoch": 1.1371020142949968,
608
+ "eval_loss": 0.18272888660430908,
609
+ "eval_runtime": 19.6972,
610
+ "eval_samples_per_second": 51.784,
611
+ "eval_steps_per_second": 0.812,
612
+ "step": 14000
613
+ },
614
+ {
615
+ "epoch": 1.1533463287849253,
616
+ "grad_norm": 0.016276616603136063,
617
+ "learning_rate": 0.0019366997975452161,
618
+ "loss": 0.1723,
619
+ "step": 14200
620
+ },
621
+ {
622
+ "epoch": 1.1695906432748537,
623
+ "grad_norm": 0.014685546979308128,
624
+ "learning_rate": 0.0018741132416862077,
625
+ "loss": 0.1724,
626
+ "step": 14400
627
+ },
628
+ {
629
+ "epoch": 1.1858349577647824,
630
+ "grad_norm": 0.012309509329497814,
631
+ "learning_rate": 0.0018119424728607464,
632
+ "loss": 0.1673,
633
+ "step": 14600
634
+ },
635
+ {
636
+ "epoch": 1.202079272254711,
637
+ "grad_norm": 0.017257582396268845,
638
+ "learning_rate": 0.0017502287921495807,
639
+ "loss": 0.1691,
640
+ "step": 14800
641
+ },
642
+ {
643
+ "epoch": 1.2183235867446394,
644
+ "grad_norm": 0.014502265490591526,
645
+ "learning_rate": 0.001689013196982182,
646
+ "loss": 0.1671,
647
+ "step": 15000
648
+ },
649
+ {
650
+ "epoch": 1.2183235867446394,
651
+ "eval_loss": 0.18109427392482758,
652
+ "eval_runtime": 19.6726,
653
+ "eval_samples_per_second": 51.849,
654
+ "eval_steps_per_second": 0.813,
655
+ "step": 15000
656
+ },
657
+ {
658
+ "epoch": 1.2345679012345678,
659
+ "grad_norm": 0.014362608082592487,
660
+ "learning_rate": 0.0016283363539014743,
661
+ "loss": 0.1704,
662
+ "step": 15200
663
+ },
664
+ {
665
+ "epoch": 1.2508122157244963,
666
+ "grad_norm": 0.01381103415042162,
667
+ "learning_rate": 0.001568238571548363,
668
+ "loss": 0.1708,
669
+ "step": 15400
670
+ },
671
+ {
672
+ "epoch": 1.267056530214425,
673
+ "grad_norm": 0.014473805204033852,
674
+ "learning_rate": 0.0015087597738840382,
675
+ "loss": 0.1709,
676
+ "step": 15600
677
+ },
678
+ {
679
+ "epoch": 1.2833008447043535,
680
+ "grad_norm": 0.01444827951490879,
681
+ "learning_rate": 0.0014499394736678152,
682
+ "loss": 0.1649,
683
+ "step": 15800
684
+ },
685
+ {
686
+ "epoch": 1.299545159194282,
687
+ "grad_norm": 0.014520540833473206,
688
+ "learning_rate": 0.0013918167462081495,
689
+ "loss": 0.1664,
690
+ "step": 16000
691
+ },
692
+ {
693
+ "epoch": 1.299545159194282,
694
+ "eval_loss": 0.17891472578048706,
695
+ "eval_runtime": 19.6583,
696
+ "eval_samples_per_second": 51.887,
697
+ "eval_steps_per_second": 0.814,
698
+ "step": 16000
699
+ },
700
+ {
701
+ "epoch": 1.3157894736842106,
702
+ "grad_norm": 0.015298468992114067,
703
+ "learning_rate": 0.0013344302034042554,
704
+ "loss": 0.166,
705
+ "step": 16200
706
+ },
707
+ {
708
+ "epoch": 1.332033788174139,
709
+ "grad_norm": 0.01416326779872179,
710
+ "learning_rate": 0.0012778179680955786,
711
+ "loss": 0.1649,
712
+ "step": 16400
713
+ },
714
+ {
715
+ "epoch": 1.3482781026640676,
716
+ "grad_norm": 0.015423670411109924,
717
+ "learning_rate": 0.001222017648736155,
718
+ "loss": 0.1675,
719
+ "step": 16600
720
+ },
721
+ {
722
+ "epoch": 1.364522417153996,
723
+ "grad_norm": 0.018960699439048767,
724
+ "learning_rate": 0.0011670663144106859,
725
+ "loss": 0.1646,
726
+ "step": 16800
727
+ },
728
+ {
729
+ "epoch": 1.3807667316439245,
730
+ "grad_norm": 0.014567695558071136,
731
+ "learning_rate": 0.0011130004702089255,
732
+ "loss": 0.1639,
733
+ "step": 17000
734
+ },
735
+ {
736
+ "epoch": 1.3807667316439245,
737
+ "eval_loss": 0.17684705555438995,
738
+ "eval_runtime": 19.7297,
739
+ "eval_samples_per_second": 51.699,
740
+ "eval_steps_per_second": 0.811,
741
+ "step": 17000
742
+ },
743
+ {
744
+ "epoch": 1.3970110461338532,
745
+ "grad_norm": 0.013393155299127102,
746
+ "learning_rate": 0.001059856032974741,
747
+ "loss": 0.1637,
748
+ "step": 17200
749
+ },
750
+ {
751
+ "epoch": 1.4132553606237817,
752
+ "grad_norm": 0.018768994137644768,
753
+ "learning_rate": 0.0010076683074459539,
754
+ "loss": 0.1654,
755
+ "step": 17400
756
+ },
757
+ {
758
+ "epoch": 1.4294996751137101,
759
+ "grad_norm": 0.015135309658944607,
760
+ "learning_rate": 0.0009564719628008081,
761
+ "loss": 0.164,
762
+ "step": 17600
763
+ },
764
+ {
765
+ "epoch": 1.4457439896036388,
766
+ "grad_norm": 0.015802372246980667,
767
+ "learning_rate": 0.0009063010096266597,
768
+ "loss": 0.1623,
769
+ "step": 17800
770
+ },
771
+ {
772
+ "epoch": 1.4619883040935673,
773
+ "grad_norm": 0.017514606937766075,
774
+ "learning_rate": 0.0008571887773261733,
775
+ "loss": 0.165,
776
+ "step": 18000
777
+ },
778
+ {
779
+ "epoch": 1.4619883040935673,
780
+ "eval_loss": 0.17532217502593994,
781
+ "eval_runtime": 19.6676,
782
+ "eval_samples_per_second": 51.862,
783
+ "eval_steps_per_second": 0.814,
784
+ "step": 18000
785
+ },
786
+ {
787
+ "epoch": 1.4782326185834957,
788
+ "grad_norm": 0.01355868112295866,
789
+ "learning_rate": 0.0008091678919760445,
790
+ "loss": 0.163,
791
+ "step": 18200
792
+ },
793
+ {
794
+ "epoch": 1.4944769330734242,
795
+ "grad_norm": 0.014759089797735214,
796
+ "learning_rate": 0.00076227025465295,
797
+ "loss": 0.164,
798
+ "step": 18400
799
+ },
800
+ {
801
+ "epoch": 1.5107212475633527,
802
+ "grad_norm": 0.01525470893830061,
803
+ "learning_rate": 0.0007165270202411303,
804
+ "loss": 0.1625,
805
+ "step": 18600
806
+ },
807
+ {
808
+ "epoch": 1.5269655620532814,
809
+ "grad_norm": 0.016354292631149292,
810
+ "learning_rate": 0.0006719685767356795,
811
+ "loss": 0.1627,
812
+ "step": 18800
813
+ },
814
+ {
815
+ "epoch": 1.5432098765432098,
816
+ "grad_norm": 0.01645779050886631,
817
+ "learning_rate": 0.0006286245250552911,
818
+ "loss": 0.1616,
819
+ "step": 19000
820
+ },
821
+ {
822
+ "epoch": 1.5432098765432098,
823
+ "eval_loss": 0.17353980243206024,
824
+ "eval_runtime": 19.6879,
825
+ "eval_samples_per_second": 51.808,
826
+ "eval_steps_per_second": 0.813,
827
+ "step": 19000
828
+ },
829
+ {
830
+ "epoch": 1.5594541910331383,
831
+ "grad_norm": 0.017305225133895874,
832
+ "learning_rate": 0.0005865236593778758,
833
+ "loss": 0.1638,
834
+ "step": 19200
835
+ },
836
+ {
837
+ "epoch": 1.575698505523067,
838
+ "grad_norm": 0.013052871450781822,
839
+ "learning_rate": 0.0005456939480121046,
840
+ "loss": 0.1609,
841
+ "step": 19400
842
+ },
843
+ {
844
+ "epoch": 1.5919428200129955,
845
+ "grad_norm": 0.015010879375040531,
846
+ "learning_rate": 0.0005061625148175956,
847
+ "loss": 0.1596,
848
+ "step": 19600
849
+ },
850
+ {
851
+ "epoch": 1.608187134502924,
852
+ "grad_norm": 0.015728702768683434,
853
+ "learning_rate": 0.00046795562118608026,
854
+ "loss": 0.1621,
855
+ "step": 19800
856
+ },
857
+ {
858
+ "epoch": 1.6244314489928526,
859
+ "grad_norm": 0.015889260917901993,
860
+ "learning_rate": 0.0004310986485955232,
861
+ "loss": 0.1595,
862
+ "step": 20000
863
+ },
864
+ {
865
+ "epoch": 1.6244314489928526,
866
+ "eval_loss": 0.17253118753433228,
867
+ "eval_runtime": 19.6544,
868
+ "eval_samples_per_second": 51.897,
869
+ "eval_steps_per_second": 0.814,
870
+ "step": 20000
871
+ },
872
+ {
873
+ "epoch": 1.6406757634827809,
874
+ "grad_norm": 0.016522224992513657,
875
+ "learning_rate": 0.00039561608174878027,
876
+ "loss": 0.161,
877
+ "step": 20200
878
+ },
879
+ {
880
+ "epoch": 1.6569200779727096,
881
+ "grad_norm": 0.013982519507408142,
882
+ "learning_rate": 0.0003615314923080029,
883
+ "loss": 0.1604,
884
+ "step": 20400
885
+ },
886
+ {
887
+ "epoch": 1.673164392462638,
888
+ "grad_norm": 0.013618958182632923,
889
+ "learning_rate": 0.0003288675232355878,
890
+ "loss": 0.1593,
891
+ "step": 20600
892
+ },
893
+ {
894
+ "epoch": 1.6894087069525665,
895
+ "grad_norm": 0.015021858736872673,
896
+ "learning_rate": 0.0002976458737520793,
897
+ "loss": 0.1592,
898
+ "step": 20800
899
+ },
900
+ {
901
+ "epoch": 1.7056530214424952,
902
+ "grad_norm": 0.016849158331751823,
903
+ "learning_rate": 0.0002678872849210154,
904
+ "loss": 0.159,
905
+ "step": 21000
906
+ },
907
+ {
908
+ "epoch": 1.7056530214424952,
909
+ "eval_loss": 0.17133571207523346,
910
+ "eval_runtime": 19.6843,
911
+ "eval_samples_per_second": 51.818,
912
+ "eval_steps_per_second": 0.813,
913
+ "step": 21000
914
+ },
915
+ {
916
+ "epoch": 1.7218973359324237,
917
+ "grad_norm": 0.0130222849547863,
918
+ "learning_rate": 0.00023961152587028995,
919
+ "loss": 0.1584,
920
+ "step": 21200
921
+ },
922
+ {
923
+ "epoch": 1.7381416504223521,
924
+ "grad_norm": 0.015901461243629456,
925
+ "learning_rate": 0.00021283738065919322,
926
+ "loss": 0.1584,
927
+ "step": 21400
928
+ },
929
+ {
930
+ "epoch": 1.7543859649122808,
931
+ "grad_norm": 0.016017524525523186,
932
+ "learning_rate": 0.00018758263579984614,
933
+ "loss": 0.1595,
934
+ "step": 21600
935
+ },
936
+ {
937
+ "epoch": 1.770630279402209,
938
+ "grad_norm": 0.013794663362205029,
939
+ "learning_rate": 0.0001638640684413234,
940
+ "loss": 0.1599,
941
+ "step": 21800
942
+ },
943
+ {
944
+ "epoch": 1.7868745938921378,
945
+ "grad_norm": 0.01649121195077896,
946
+ "learning_rate": 0.0001416974352243128,
947
+ "loss": 0.1575,
948
+ "step": 22000
949
+ },
950
+ {
951
+ "epoch": 1.7868745938921378,
952
+ "eval_loss": 0.17062582075595856,
953
+ "eval_runtime": 19.6651,
954
+ "eval_samples_per_second": 51.869,
955
+ "eval_steps_per_second": 0.814,
956
+ "step": 22000
957
+ },
958
+ {
959
+ "epoch": 1.8031189083820662,
960
+ "grad_norm": 0.015386915765702724,
961
+ "learning_rate": 0.00012109746181371561,
962
+ "loss": 0.1601,
963
+ "step": 22200
964
+ },
965
+ {
966
+ "epoch": 1.8193632228719947,
967
+ "grad_norm": 0.015663689002394676,
968
+ "learning_rate": 0.00010207783311614094,
969
+ "loss": 0.1606,
970
+ "step": 22400
971
+ },
972
+ {
973
+ "epoch": 1.8356075373619234,
974
+ "grad_norm": 0.015325063839554787,
975
+ "learning_rate": 8.465118418879398e-05,
976
+ "loss": 0.1604,
977
+ "step": 22600
978
+ },
979
+ {
980
+ "epoch": 1.8518518518518519,
981
+ "grad_norm": 0.015577482059597969,
982
+ "learning_rate": 6.882909184579706e-05,
983
+ "loss": 0.1564,
984
+ "step": 22800
985
+ },
986
+ {
987
+ "epoch": 1.8680961663417803,
988
+ "grad_norm": 0.015263117849826813,
989
+ "learning_rate": 5.462206696751654e-05,
990
+ "loss": 0.1574,
991
+ "step": 23000
992
+ },
993
+ {
994
+ "epoch": 1.8680961663417803,
995
+ "eval_loss": 0.17017242312431335,
996
+ "eval_runtime": 19.6661,
997
+ "eval_samples_per_second": 51.866,
998
+ "eval_steps_per_second": 0.814,
999
+ "step": 23000
1000
+ },
1001
+ {
1002
+ "epoch": 1.884340480831709,
1003
+ "grad_norm": 0.015429106540977955,
1004
+ "learning_rate": 4.2039547518011034e-05,
1005
+ "loss": 0.1575,
1006
+ "step": 23200
1007
+ },
1008
+ {
1009
+ "epoch": 1.9005847953216373,
1010
+ "grad_norm": 0.01706068590283394,
1011
+ "learning_rate": 3.108989227523368e-05,
1012
+ "loss": 0.1575,
1013
+ "step": 23400
1014
+ },
1015
+ {
1016
+ "epoch": 1.916829109811566,
1017
+ "grad_norm": 0.014579437673091888,
1018
+ "learning_rate": 2.1780375278155907e-05,
1019
+ "loss": 0.1601,
1020
+ "step": 23600
1021
+ },
1022
+ {
1023
+ "epoch": 1.9330734243014944,
1024
+ "grad_norm": 0.01484010647982359,
1025
+ "learning_rate": 1.4117180994502044e-05,
1026
+ "loss": 0.1553,
1027
+ "step": 23800
1028
+ },
1029
+ {
1030
+ "epoch": 1.949317738791423,
1031
+ "grad_norm": 0.015194403007626534,
1032
+ "learning_rate": 8.105400212304326e-06,
1033
+ "loss": 0.1598,
1034
+ "step": 24000
1035
+ },
1036
+ {
1037
+ "epoch": 1.949317738791423,
1038
+ "eval_loss": 0.1700344979763031,
1039
+ "eval_runtime": 19.6869,
1040
+ "eval_samples_per_second": 51.811,
1041
+ "eval_steps_per_second": 0.813,
1042
+ "step": 24000
1043
+ },
1044
+ {
1045
+ "epoch": 1.9655620532813516,
1046
+ "grad_norm": 0.01580938510596752,
1047
+ "learning_rate": 3.7490266580070265e-06,
1048
+ "loss": 0.1583,
1049
+ "step": 24200
1050
+ },
1051
+ {
1052
+ "epoch": 1.98180636777128,
1053
+ "grad_norm": 0.013688357546925545,
1054
+ "learning_rate": 1.0509543433673273e-06,
1055
+ "loss": 0.159,
1056
+ "step": 24400
1057
+ },
1058
+ {
1059
+ "epoch": 1.9980506822612085,
1060
+ "grad_norm": 0.015114161185920238,
1061
+ "learning_rate": 1.2975642914858642e-08,
1062
+ "loss": 0.1598,
1063
+ "step": 24600
1064
+ },
1065
+ {
1066
+ "epoch": 2.0,
1067
+ "step": 24624,
1068
+ "total_flos": 1.6014138134652518e+19,
1069
+ "train_loss": 0.18769870234666666,
1070
+ "train_runtime": 24009.3883,
1071
+ "train_samples_per_second": 32.819,
1072
+ "train_steps_per_second": 1.026
1073
+ }
1074
+ ],
1075
+ "logging_steps": 200,
1076
+ "max_steps": 24624,
1077
+ "num_input_tokens_seen": 0,
1078
+ "num_train_epochs": 2,
1079
+ "save_steps": 0,
1080
+ "stateful_callbacks": {
1081
+ "TrainerControl": {
1082
+ "args": {
1083
+ "should_epoch_stop": false,
1084
+ "should_evaluate": false,
1085
+ "should_log": false,
1086
+ "should_save": true,
1087
+ "should_training_stop": true
1088
+ },
1089
+ "attributes": {}
1090
+ }
1091
+ },
1092
+ "total_flos": 1.6014138134652518e+19,
1093
+ "train_batch_size": 32,
1094
+ "trial_name": null,
1095
+ "trial_params": null
1096
+ }
nl_tasks/exp395/run_ex03/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[PAD]": 32000
3
+ }
nl_tasks/exp395/run_ex03/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
nl_tasks/exp395/run_ex03/ft/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "</s>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "legacy": false,
44
+ "model_max_length": 512,
45
+ "pad_token": "[PAD]",
46
+ "padding_side": "right",
47
+ "sp_model_kwargs": {},
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "</s>",
50
+ "use_default_system_prompt": false
51
+ }
nl_tasks/exp395/run_ex03/trainer_state.json ADDED
@@ -0,0 +1,1096 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 1000,
7
+ "global_step": 24624,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.016244314489928524,
14
+ "grad_norm": 0.2691752314567566,
15
+ "learning_rate": 0.00016113360323886643,
16
+ "loss": 0.4431,
17
+ "step": 200
18
+ },
19
+ {
20
+ "epoch": 0.03248862897985705,
21
+ "grad_norm": 0.1912466436624527,
22
+ "learning_rate": 0.00019998081408468315,
23
+ "loss": 0.3117,
24
+ "step": 400
25
+ },
26
+ {
27
+ "epoch": 0.04873294346978557,
28
+ "grad_norm": 0.18917685747146606,
29
+ "learning_rate": 0.000199897122575617,
30
+ "loss": 0.29,
31
+ "step": 600
32
+ },
33
+ {
34
+ "epoch": 0.0649772579597141,
35
+ "grad_norm": 0.1740494817495346,
36
+ "learning_rate": 0.0001997470677385143,
37
+ "loss": 0.2765,
38
+ "step": 800
39
+ },
40
+ {
41
+ "epoch": 0.08122157244964262,
42
+ "grad_norm": 0.20022322237491608,
43
+ "learning_rate": 0.00019953074925731115,
44
+ "loss": 0.2693,
45
+ "step": 1000
46
+ },
47
+ {
48
+ "epoch": 0.08122157244964262,
49
+ "eval_loss": 0.2633211612701416,
50
+ "eval_runtime": 19.8939,
51
+ "eval_samples_per_second": 51.272,
52
+ "eval_steps_per_second": 0.804,
53
+ "step": 1000
54
+ },
55
+ {
56
+ "epoch": 0.09746588693957114,
57
+ "grad_norm": 0.16619279980659485,
58
+ "learning_rate": 0.00019924831083598975,
59
+ "loss": 0.259,
60
+ "step": 1200
61
+ },
62
+ {
63
+ "epoch": 0.11371020142949967,
64
+ "grad_norm": 0.18779638409614563,
65
+ "learning_rate": 0.00019889994010311362,
66
+ "loss": 0.2509,
67
+ "step": 1400
68
+ },
69
+ {
70
+ "epoch": 0.1299545159194282,
71
+ "grad_norm": 0.17294612526893616,
72
+ "learning_rate": 0.00019848586848718265,
73
+ "loss": 0.2517,
74
+ "step": 1600
75
+ },
76
+ {
77
+ "epoch": 0.14619883040935672,
78
+ "grad_norm": 0.17169740796089172,
79
+ "learning_rate": 0.0001980063710628914,
80
+ "loss": 0.2427,
81
+ "step": 1800
82
+ },
83
+ {
84
+ "epoch": 0.16244314489928524,
85
+ "grad_norm": 0.1682817041873932,
86
+ "learning_rate": 0.00019746176636839234,
87
+ "loss": 0.2408,
88
+ "step": 2000
89
+ },
90
+ {
91
+ "epoch": 0.16244314489928524,
92
+ "eval_loss": 0.23899686336517334,
93
+ "eval_runtime": 19.7109,
94
+ "eval_samples_per_second": 51.748,
95
+ "eval_steps_per_second": 0.812,
96
+ "step": 2000
97
+ },
98
+ {
99
+ "epoch": 0.17868745938921377,
100
+ "grad_norm": 0.19323141872882843,
101
+ "learning_rate": 0.00019685241619368557,
102
+ "loss": 0.2361,
103
+ "step": 2200
104
+ },
105
+ {
106
+ "epoch": 0.1949317738791423,
107
+ "grad_norm": 0.2126629501581192,
108
+ "learning_rate": 0.0001961787253402758,
109
+ "loss": 0.2347,
110
+ "step": 2400
111
+ },
112
+ {
113
+ "epoch": 0.2111760883690708,
114
+ "grad_norm": 0.2029845267534256,
115
+ "learning_rate": 0.00019544114135225594,
116
+ "loss": 0.2295,
117
+ "step": 2600
118
+ },
119
+ {
120
+ "epoch": 0.22742040285899934,
121
+ "grad_norm": 0.2121860682964325,
122
+ "learning_rate": 0.00019464015421899583,
123
+ "loss": 0.2286,
124
+ "step": 2800
125
+ },
126
+ {
127
+ "epoch": 0.24366471734892786,
128
+ "grad_norm": 0.20849257707595825,
129
+ "learning_rate": 0.0001937762960496345,
130
+ "loss": 0.2243,
131
+ "step": 3000
132
+ },
133
+ {
134
+ "epoch": 0.24366471734892786,
135
+ "eval_loss": 0.2256067991256714,
136
+ "eval_runtime": 19.7215,
137
+ "eval_samples_per_second": 51.72,
138
+ "eval_steps_per_second": 0.811,
139
+ "step": 3000
140
+ },
141
+ {
142
+ "epoch": 0.2599090318388564,
143
+ "grad_norm": 0.20677925646305084,
144
+ "learning_rate": 0.0001928501407195909,
145
+ "loss": 0.2246,
146
+ "step": 3200
147
+ },
148
+ {
149
+ "epoch": 0.2761533463287849,
150
+ "grad_norm": 0.1879427134990692,
151
+ "learning_rate": 0.0001918623034893289,
152
+ "loss": 0.222,
153
+ "step": 3400
154
+ },
155
+ {
156
+ "epoch": 0.29239766081871343,
157
+ "grad_norm": 0.16858752071857452,
158
+ "learning_rate": 0.00019081344059562963,
159
+ "loss": 0.22,
160
+ "step": 3600
161
+ },
162
+ {
163
+ "epoch": 0.30864197530864196,
164
+ "grad_norm": 0.21294647455215454,
165
+ "learning_rate": 0.00018970424881564228,
166
+ "loss": 0.2159,
167
+ "step": 3800
168
+ },
169
+ {
170
+ "epoch": 0.3248862897985705,
171
+ "grad_norm": 0.20722423493862152,
172
+ "learning_rate": 0.00018853546500400344,
173
+ "loss": 0.2163,
174
+ "step": 4000
175
+ },
176
+ {
177
+ "epoch": 0.3248862897985705,
178
+ "eval_loss": 0.2166534662246704,
179
+ "eval_runtime": 19.6837,
180
+ "eval_samples_per_second": 51.82,
181
+ "eval_steps_per_second": 0.813,
182
+ "step": 4000
183
+ },
184
+ {
185
+ "epoch": 0.341130604288499,
186
+ "grad_norm": 0.22913330793380737,
187
+ "learning_rate": 0.00018730786560333215,
188
+ "loss": 0.2134,
189
+ "step": 4200
190
+ },
191
+ {
192
+ "epoch": 0.35737491877842753,
193
+ "grad_norm": 0.19247737526893616,
194
+ "learning_rate": 0.00018602226612842636,
195
+ "loss": 0.215,
196
+ "step": 4400
197
+ },
198
+ {
199
+ "epoch": 0.37361923326835605,
200
+ "grad_norm": 0.20577874779701233,
201
+ "learning_rate": 0.00018467952062450248,
202
+ "loss": 0.2108,
203
+ "step": 4600
204
+ },
205
+ {
206
+ "epoch": 0.3898635477582846,
207
+ "grad_norm": 0.23740005493164062,
208
+ "learning_rate": 0.00018328052109983922,
209
+ "loss": 0.209,
210
+ "step": 4800
211
+ },
212
+ {
213
+ "epoch": 0.4061078622482131,
214
+ "grad_norm": 0.20311188697814941,
215
+ "learning_rate": 0.00018182619693320122,
216
+ "loss": 0.2096,
217
+ "step": 5000
218
+ },
219
+ {
220
+ "epoch": 0.4061078622482131,
221
+ "eval_loss": 0.2107837200164795,
222
+ "eval_runtime": 19.7072,
223
+ "eval_samples_per_second": 51.758,
224
+ "eval_steps_per_second": 0.812,
225
+ "step": 5000
226
+ },
227
+ {
228
+ "epoch": 0.4223521767381416,
229
+ "grad_norm": 0.196655735373497,
230
+ "learning_rate": 0.00018031751425643747,
231
+ "loss": 0.209,
232
+ "step": 5200
233
+ },
234
+ {
235
+ "epoch": 0.43859649122807015,
236
+ "grad_norm": 0.20977455377578735,
237
+ "learning_rate": 0.00017875547531266372,
238
+ "loss": 0.2028,
239
+ "step": 5400
240
+ },
241
+ {
242
+ "epoch": 0.4548408057179987,
243
+ "grad_norm": 0.20537307858467102,
244
+ "learning_rate": 0.00017714111779045555,
245
+ "loss": 0.2045,
246
+ "step": 5600
247
+ },
248
+ {
249
+ "epoch": 0.4710851202079272,
250
+ "grad_norm": 0.21848838031291962,
251
+ "learning_rate": 0.00017547551413449455,
252
+ "loss": 0.2018,
253
+ "step": 5800
254
+ },
255
+ {
256
+ "epoch": 0.4873294346978557,
257
+ "grad_norm": 0.2160533219575882,
258
+ "learning_rate": 0.0001737597708331254,
259
+ "loss": 0.202,
260
+ "step": 6000
261
+ },
262
+ {
263
+ "epoch": 0.4873294346978557,
264
+ "eval_loss": 0.20422479510307312,
265
+ "eval_runtime": 19.7114,
266
+ "eval_samples_per_second": 51.747,
267
+ "eval_steps_per_second": 0.812,
268
+ "step": 6000
269
+ },
270
+ {
271
+ "epoch": 0.5035737491877843,
272
+ "grad_norm": 0.20974092185497284,
273
+ "learning_rate": 0.0001719950276832973,
274
+ "loss": 0.2041,
275
+ "step": 6200
276
+ },
277
+ {
278
+ "epoch": 0.5198180636777128,
279
+ "grad_norm": 0.21118378639221191,
280
+ "learning_rate": 0.00017018245703337748,
281
+ "loss": 0.2005,
282
+ "step": 6400
283
+ },
284
+ {
285
+ "epoch": 0.5360623781676414,
286
+ "grad_norm": 0.19949886202812195,
287
+ "learning_rate": 0.00016832326300434094,
288
+ "loss": 0.1973,
289
+ "step": 6600
290
+ },
291
+ {
292
+ "epoch": 0.5523066926575698,
293
+ "grad_norm": 0.21284544467926025,
294
+ "learning_rate": 0.00016641868068985248,
295
+ "loss": 0.1996,
296
+ "step": 6800
297
+ },
298
+ {
299
+ "epoch": 0.5685510071474984,
300
+ "grad_norm": 0.2226913422346115,
301
+ "learning_rate": 0.00016446997533577364,
302
+ "loss": 0.1962,
303
+ "step": 7000
304
+ },
305
+ {
306
+ "epoch": 0.5685510071474984,
307
+ "eval_loss": 0.20037123560905457,
308
+ "eval_runtime": 19.7134,
309
+ "eval_samples_per_second": 51.741,
310
+ "eval_steps_per_second": 0.812,
311
+ "step": 7000
312
+ },
313
+ {
314
+ "epoch": 0.5847953216374269,
315
+ "grad_norm": 0.17608977854251862,
316
+ "learning_rate": 0.0001624784414996384,
317
+ "loss": 0.1961,
318
+ "step": 7200
319
+ },
320
+ {
321
+ "epoch": 0.6010396361273554,
322
+ "grad_norm": 0.17466193437576294,
323
+ "learning_rate": 0.00016044540219065722,
324
+ "loss": 0.1967,
325
+ "step": 7400
326
+ },
327
+ {
328
+ "epoch": 0.6172839506172839,
329
+ "grad_norm": 0.19703347980976105,
330
+ "learning_rate": 0.00015837220799081977,
331
+ "loss": 0.1957,
332
+ "step": 7600
333
+ },
334
+ {
335
+ "epoch": 0.6335282651072125,
336
+ "grad_norm": 0.23092713952064514,
337
+ "learning_rate": 0.00015626023615768087,
338
+ "loss": 0.1955,
339
+ "step": 7800
340
+ },
341
+ {
342
+ "epoch": 0.649772579597141,
343
+ "grad_norm": 0.19906634092330933,
344
+ "learning_rate": 0.0001541108897094251,
345
+ "loss": 0.1936,
346
+ "step": 8000
347
+ },
348
+ {
349
+ "epoch": 0.649772579597141,
350
+ "eval_loss": 0.1971496194601059,
351
+ "eval_runtime": 19.707,
352
+ "eval_samples_per_second": 51.758,
353
+ "eval_steps_per_second": 0.812,
354
+ "step": 8000
355
+ },
356
+ {
357
+ "epoch": 0.6660168940870695,
358
+ "grad_norm": 0.18658144772052765,
359
+ "learning_rate": 0.00015192559649281856,
360
+ "loss": 0.1927,
361
+ "step": 8200
362
+ },
363
+ {
364
+ "epoch": 0.682261208576998,
365
+ "grad_norm": 0.1791582852602005,
366
+ "learning_rate": 0.00014970580823466632,
367
+ "loss": 0.1932,
368
+ "step": 8400
369
+ },
370
+ {
371
+ "epoch": 0.6985055230669266,
372
+ "grad_norm": 0.2054402083158493,
373
+ "learning_rate": 0.00014745299957740596,
374
+ "loss": 0.1938,
375
+ "step": 8600
376
+ },
377
+ {
378
+ "epoch": 0.7147498375568551,
379
+ "grad_norm": 0.20000861585140228,
380
+ "learning_rate": 0.0001451686670994781,
381
+ "loss": 0.189,
382
+ "step": 8800
383
+ },
384
+ {
385
+ "epoch": 0.7309941520467836,
386
+ "grad_norm": 0.2405168116092682,
387
+ "learning_rate": 0.00014285432832112426,
388
+ "loss": 0.19,
389
+ "step": 9000
390
+ },
391
+ {
392
+ "epoch": 0.7309941520467836,
393
+ "eval_loss": 0.19398534297943115,
394
+ "eval_runtime": 19.7311,
395
+ "eval_samples_per_second": 51.695,
396
+ "eval_steps_per_second": 0.811,
397
+ "step": 9000
398
+ },
399
+ {
400
+ "epoch": 0.7472384665367121,
401
+ "grad_norm": 0.2128114253282547,
402
+ "learning_rate": 0.0001405115206962728,
403
+ "loss": 0.1899,
404
+ "step": 9200
405
+ },
406
+ {
407
+ "epoch": 0.7634827810266407,
408
+ "grad_norm": 0.2048308551311493,
409
+ "learning_rate": 0.0001381418005911826,
410
+ "loss": 0.191,
411
+ "step": 9400
412
+ },
413
+ {
414
+ "epoch": 0.7797270955165692,
415
+ "grad_norm": 0.23634588718414307,
416
+ "learning_rate": 0.0001357467422505229,
417
+ "loss": 0.187,
418
+ "step": 9600
419
+ },
420
+ {
421
+ "epoch": 0.7959714100064977,
422
+ "grad_norm": 0.2037070393562317,
423
+ "learning_rate": 0.0001333279367515763,
424
+ "loss": 0.1851,
425
+ "step": 9800
426
+ },
427
+ {
428
+ "epoch": 0.8122157244964262,
429
+ "grad_norm": 0.18611127138137817,
430
+ "learning_rate": 0.00013088699094725956,
431
+ "loss": 0.1881,
432
+ "step": 10000
433
+ },
434
+ {
435
+ "epoch": 0.8122157244964262,
436
+ "eval_loss": 0.19100497663021088,
437
+ "eval_runtime": 19.6987,
438
+ "eval_samples_per_second": 51.78,
439
+ "eval_steps_per_second": 0.812,
440
+ "step": 10000
441
+ },
442
+ {
443
+ "epoch": 0.8284600389863548,
444
+ "grad_norm": 0.2117115557193756,
445
+ "learning_rate": 0.00012842552639866423,
446
+ "loss": 0.1882,
447
+ "step": 10200
448
+ },
449
+ {
450
+ "epoch": 0.8447043534762833,
451
+ "grad_norm": 0.1892351508140564,
452
+ "learning_rate": 0.00012594517829782667,
453
+ "loss": 0.1894,
454
+ "step": 10400
455
+ },
456
+ {
457
+ "epoch": 0.8609486679662118,
458
+ "grad_norm": 0.20003050565719604,
459
+ "learning_rate": 0.00012344759438144253,
460
+ "loss": 0.1878,
461
+ "step": 10600
462
+ },
463
+ {
464
+ "epoch": 0.8771929824561403,
465
+ "grad_norm": 0.21592330932617188,
466
+ "learning_rate": 0.00012093443383624787,
467
+ "loss": 0.1855,
468
+ "step": 10800
469
+ },
470
+ {
471
+ "epoch": 0.8934372969460689,
472
+ "grad_norm": 0.19843119382858276,
473
+ "learning_rate": 0.00011840736619679346,
474
+ "loss": 0.1855,
475
+ "step": 11000
476
+ },
477
+ {
478
+ "epoch": 0.8934372969460689,
479
+ "eval_loss": 0.1880546659231186,
480
+ "eval_runtime": 19.7276,
481
+ "eval_samples_per_second": 51.704,
482
+ "eval_steps_per_second": 0.811,
483
+ "step": 11000
484
+ },
485
+ {
486
+ "epoch": 0.9096816114359974,
487
+ "grad_norm": 0.20809300243854523,
488
+ "learning_rate": 0.00011586807023634534,
489
+ "loss": 0.1832,
490
+ "step": 11200
491
+ },
492
+ {
493
+ "epoch": 0.9259259259259259,
494
+ "grad_norm": 0.19825714826583862,
495
+ "learning_rate": 0.00011331823285164744,
496
+ "loss": 0.1824,
497
+ "step": 11400
498
+ },
499
+ {
500
+ "epoch": 0.9421702404158544,
501
+ "grad_norm": 0.20077264308929443,
502
+ "learning_rate": 0.00011075954794228805,
503
+ "loss": 0.1864,
504
+ "step": 11600
505
+ },
506
+ {
507
+ "epoch": 0.958414554905783,
508
+ "grad_norm": 0.19551986455917358,
509
+ "learning_rate": 0.0001081937152854139,
510
+ "loss": 0.1833,
511
+ "step": 11800
512
+ },
513
+ {
514
+ "epoch": 0.9746588693957114,
515
+ "grad_norm": 0.20106661319732666,
516
+ "learning_rate": 0.00010562243940653995,
517
+ "loss": 0.1827,
518
+ "step": 12000
519
+ },
520
+ {
521
+ "epoch": 0.9746588693957114,
522
+ "eval_loss": 0.1855010986328125,
523
+ "eval_runtime": 19.7306,
524
+ "eval_samples_per_second": 51.696,
525
+ "eval_steps_per_second": 0.811,
526
+ "step": 12000
527
+ },
528
+ {
529
+ "epoch": 0.99090318388564,
530
+ "grad_norm": 0.22406485676765442,
531
+ "learning_rate": 0.00010304742844720441,
532
+ "loss": 0.1827,
533
+ "step": 12200
534
+ },
535
+ {
536
+ "epoch": 1.0071474983755686,
537
+ "grad_norm": 0.20113199949264526,
538
+ "learning_rate": 0.00010047039303022183,
539
+ "loss": 0.1779,
540
+ "step": 12400
541
+ },
542
+ {
543
+ "epoch": 1.023391812865497,
544
+ "grad_norm": 0.19607332348823547,
545
+ "learning_rate": 9.789304512328798e-05,
546
+ "loss": 0.1712,
547
+ "step": 12600
548
+ },
549
+ {
550
+ "epoch": 1.0396361273554255,
551
+ "grad_norm": 0.2127198576927185,
552
+ "learning_rate": 9.531709690169074e-05,
553
+ "loss": 0.1702,
554
+ "step": 12800
555
+ },
556
+ {
557
+ "epoch": 1.0558804418453542,
558
+ "grad_norm": 0.21598124504089355,
559
+ "learning_rate": 9.274425961088378e-05,
560
+ "loss": 0.1684,
561
+ "step": 13000
562
+ },
563
+ {
564
+ "epoch": 1.0558804418453542,
565
+ "eval_loss": 0.18371237814426422,
566
+ "eval_runtime": 19.6853,
567
+ "eval_samples_per_second": 51.815,
568
+ "eval_steps_per_second": 0.813,
569
+ "step": 13000
570
+ },
571
+ {
572
+ "epoch": 1.0721247563352827,
573
+ "grad_norm": 0.2000730037689209,
574
+ "learning_rate": 9.017624242967728e-05,
575
+ "loss": 0.1701,
576
+ "step": 13200
577
+ },
578
+ {
579
+ "epoch": 1.0883690708252112,
580
+ "grad_norm": 0.18844091892242432,
581
+ "learning_rate": 8.761475133480227e-05,
582
+ "loss": 0.1715,
583
+ "step": 13400
584
+ },
585
+ {
586
+ "epoch": 1.1046133853151396,
587
+ "grad_norm": 0.18997368216514587,
588
+ "learning_rate": 8.506148796760136e-05,
589
+ "loss": 0.1688,
590
+ "step": 13600
591
+ },
592
+ {
593
+ "epoch": 1.120857699805068,
594
+ "grad_norm": 0.20101472735404968,
595
+ "learning_rate": 8.251814850360028e-05,
596
+ "loss": 0.1692,
597
+ "step": 13800
598
+ },
599
+ {
600
+ "epoch": 1.1371020142949968,
601
+ "grad_norm": 0.23084598779678345,
602
+ "learning_rate": 7.998642252571007e-05,
603
+ "loss": 0.1702,
604
+ "step": 14000
605
+ },
606
+ {
607
+ "epoch": 1.1371020142949968,
608
+ "eval_loss": 0.1824684888124466,
609
+ "eval_runtime": 19.694,
610
+ "eval_samples_per_second": 51.792,
611
+ "eval_steps_per_second": 0.812,
612
+ "step": 14000
613
+ },
614
+ {
615
+ "epoch": 1.1533463287849253,
616
+ "grad_norm": 0.23068080842494965,
617
+ "learning_rate": 7.746799190180865e-05,
618
+ "loss": 0.1718,
619
+ "step": 14200
620
+ },
621
+ {
622
+ "epoch": 1.1695906432748537,
623
+ "grad_norm": 0.23622921109199524,
624
+ "learning_rate": 7.496452966744831e-05,
625
+ "loss": 0.1716,
626
+ "step": 14400
627
+ },
628
+ {
629
+ "epoch": 1.1858349577647824,
630
+ "grad_norm": 0.19296401739120483,
631
+ "learning_rate": 7.247769891442986e-05,
632
+ "loss": 0.1669,
633
+ "step": 14600
634
+ },
635
+ {
636
+ "epoch": 1.202079272254711,
637
+ "grad_norm": 0.23187531530857086,
638
+ "learning_rate": 7.000915168598323e-05,
639
+ "loss": 0.1685,
640
+ "step": 14800
641
+ },
642
+ {
643
+ "epoch": 1.2183235867446394,
644
+ "grad_norm": 0.1888766884803772,
645
+ "learning_rate": 6.756052787928729e-05,
646
+ "loss": 0.1666,
647
+ "step": 15000
648
+ },
649
+ {
650
+ "epoch": 1.2183235867446394,
651
+ "eval_loss": 0.18065877258777618,
652
+ "eval_runtime": 19.7279,
653
+ "eval_samples_per_second": 51.703,
654
+ "eval_steps_per_second": 0.811,
655
+ "step": 15000
656
+ },
657
+ {
658
+ "epoch": 1.2345679012345678,
659
+ "grad_norm": 0.21059545874595642,
660
+ "learning_rate": 6.513345415605897e-05,
661
+ "loss": 0.1698,
662
+ "step": 15200
663
+ },
664
+ {
665
+ "epoch": 1.2508122157244963,
666
+ "grad_norm": 0.19575056433677673,
667
+ "learning_rate": 6.272954286193452e-05,
668
+ "loss": 0.1707,
669
+ "step": 15400
670
+ },
671
+ {
672
+ "epoch": 1.267056530214425,
673
+ "grad_norm": 0.20981614291667938,
674
+ "learning_rate": 6.0350390955361526e-05,
675
+ "loss": 0.1709,
676
+ "step": 15600
677
+ },
678
+ {
679
+ "epoch": 1.2833008447043535,
680
+ "grad_norm": 0.2034655064344406,
681
+ "learning_rate": 5.7997578946712606e-05,
682
+ "loss": 0.1653,
683
+ "step": 15800
684
+ },
685
+ {
686
+ "epoch": 1.299545159194282,
687
+ "grad_norm": 0.20033232867717743,
688
+ "learning_rate": 5.5672669848325975e-05,
689
+ "loss": 0.1666,
690
+ "step": 16000
691
+ },
692
+ {
693
+ "epoch": 1.299545159194282,
694
+ "eval_loss": 0.17909197509288788,
695
+ "eval_runtime": 19.7164,
696
+ "eval_samples_per_second": 51.734,
697
+ "eval_steps_per_second": 0.812,
698
+ "step": 16000
699
+ },
700
+ {
701
+ "epoch": 1.3157894736842106,
702
+ "grad_norm": 0.21548283100128174,
703
+ "learning_rate": 5.337720813617022e-05,
704
+ "loss": 0.1665,
705
+ "step": 16200
706
+ },
707
+ {
708
+ "epoch": 1.332033788174139,
709
+ "grad_norm": 0.21256336569786072,
710
+ "learning_rate": 5.111271872382315e-05,
711
+ "loss": 0.1653,
712
+ "step": 16400
713
+ },
714
+ {
715
+ "epoch": 1.3482781026640676,
716
+ "grad_norm": 0.21538503468036652,
717
+ "learning_rate": 4.8880705949446205e-05,
718
+ "loss": 0.1679,
719
+ "step": 16600
720
+ },
721
+ {
722
+ "epoch": 1.364522417153996,
723
+ "grad_norm": 0.2733672857284546,
724
+ "learning_rate": 4.668265257642743e-05,
725
+ "loss": 0.165,
726
+ "step": 16800
727
+ },
728
+ {
729
+ "epoch": 1.3807667316439245,
730
+ "grad_norm": 0.205805242061615,
731
+ "learning_rate": 4.452001880835702e-05,
732
+ "loss": 0.1645,
733
+ "step": 17000
734
+ },
735
+ {
736
+ "epoch": 1.3807667316439245,
737
+ "eval_loss": 0.17794859409332275,
738
+ "eval_runtime": 19.728,
739
+ "eval_samples_per_second": 51.703,
740
+ "eval_steps_per_second": 0.811,
741
+ "step": 17000
742
+ },
743
+ {
744
+ "epoch": 1.3970110461338532,
745
+ "grad_norm": 0.21134072542190552,
746
+ "learning_rate": 4.239424131898965e-05,
747
+ "loss": 0.1648,
748
+ "step": 17200
749
+ },
750
+ {
751
+ "epoch": 1.4132553606237817,
752
+ "grad_norm": 0.2676503360271454,
753
+ "learning_rate": 4.0306732297838156e-05,
754
+ "loss": 0.1657,
755
+ "step": 17400
756
+ },
757
+ {
758
+ "epoch": 1.4294996751137101,
759
+ "grad_norm": 0.21757201850414276,
760
+ "learning_rate": 3.8258878512032325e-05,
761
+ "loss": 0.1652,
762
+ "step": 17600
763
+ },
764
+ {
765
+ "epoch": 1.4457439896036388,
766
+ "grad_norm": 0.2398853451013565,
767
+ "learning_rate": 3.625204038506639e-05,
768
+ "loss": 0.1636,
769
+ "step": 17800
770
+ },
771
+ {
772
+ "epoch": 1.4619883040935673,
773
+ "grad_norm": 0.25057992339134216,
774
+ "learning_rate": 3.4287551093046935e-05,
775
+ "loss": 0.1659,
776
+ "step": 18000
777
+ },
778
+ {
779
+ "epoch": 1.4619883040935673,
780
+ "eval_loss": 0.1766623556613922,
781
+ "eval_runtime": 19.714,
782
+ "eval_samples_per_second": 51.74,
783
+ "eval_steps_per_second": 0.812,
784
+ "step": 18000
785
+ },
786
+ {
787
+ "epoch": 1.4782326185834957,
788
+ "grad_norm": 0.20068474113941193,
789
+ "learning_rate": 3.236671567904178e-05,
790
+ "loss": 0.1645,
791
+ "step": 18200
792
+ },
793
+ {
794
+ "epoch": 1.4944769330734242,
795
+ "grad_norm": 0.20740877091884613,
796
+ "learning_rate": 3.0490810186118e-05,
797
+ "loss": 0.1654,
798
+ "step": 18400
799
+ },
800
+ {
801
+ "epoch": 1.5107212475633527,
802
+ "grad_norm": 0.2191922813653946,
803
+ "learning_rate": 2.8661080809645212e-05,
804
+ "loss": 0.164,
805
+ "step": 18600
806
+ },
807
+ {
808
+ "epoch": 1.5269655620532814,
809
+ "grad_norm": 0.2751601040363312,
810
+ "learning_rate": 2.6878743069427183e-05,
811
+ "loss": 0.1644,
812
+ "step": 18800
813
+ },
814
+ {
815
+ "epoch": 1.5432098765432098,
816
+ "grad_norm": 0.2415914088487625,
817
+ "learning_rate": 2.5144981002211642e-05,
818
+ "loss": 0.1636,
819
+ "step": 19000
820
+ },
821
+ {
822
+ "epoch": 1.5432098765432098,
823
+ "eval_loss": 0.17572885751724243,
824
+ "eval_runtime": 19.7271,
825
+ "eval_samples_per_second": 51.705,
826
+ "eval_steps_per_second": 0.811,
827
+ "step": 19000
828
+ },
829
+ {
830
+ "epoch": 1.5594541910331383,
831
+ "grad_norm": 0.26504069566726685,
832
+ "learning_rate": 2.3460946375115032e-05,
833
+ "loss": 0.1659,
834
+ "step": 19200
835
+ },
836
+ {
837
+ "epoch": 1.575698505523067,
838
+ "grad_norm": 0.18608888983726501,
839
+ "learning_rate": 2.1827757920484182e-05,
840
+ "loss": 0.1628,
841
+ "step": 19400
842
+ },
843
+ {
844
+ "epoch": 1.5919428200129955,
845
+ "grad_norm": 0.22266308963298798,
846
+ "learning_rate": 2.024650059270382e-05,
847
+ "loss": 0.1618,
848
+ "step": 19600
849
+ },
850
+ {
851
+ "epoch": 1.608187134502924,
852
+ "grad_norm": 0.23721040785312653,
853
+ "learning_rate": 1.871822484744321e-05,
854
+ "loss": 0.1644,
855
+ "step": 19800
856
+ },
857
+ {
858
+ "epoch": 1.6244314489928526,
859
+ "grad_norm": 0.2157219648361206,
860
+ "learning_rate": 1.7243945943820928e-05,
861
+ "loss": 0.162,
862
+ "step": 20000
863
+ },
864
+ {
865
+ "epoch": 1.6244314489928526,
866
+ "eval_loss": 0.17499883472919464,
867
+ "eval_runtime": 19.6987,
868
+ "eval_samples_per_second": 51.78,
869
+ "eval_steps_per_second": 0.812,
870
+ "step": 20000
871
+ },
872
+ {
873
+ "epoch": 1.6406757634827809,
874
+ "grad_norm": 0.2527523934841156,
875
+ "learning_rate": 1.582464326995121e-05,
876
+ "loss": 0.1637,
877
+ "step": 20200
878
+ },
879
+ {
880
+ "epoch": 1.6569200779727096,
881
+ "grad_norm": 0.191372349858284,
882
+ "learning_rate": 1.4461259692320117e-05,
883
+ "loss": 0.1632,
884
+ "step": 20400
885
+ },
886
+ {
887
+ "epoch": 1.673164392462638,
888
+ "grad_norm": 0.19453167915344238,
889
+ "learning_rate": 1.3154700929423513e-05,
890
+ "loss": 0.162,
891
+ "step": 20600
892
+ },
893
+ {
894
+ "epoch": 1.6894087069525665,
895
+ "grad_norm": 0.21305137872695923,
896
+ "learning_rate": 1.1905834950083172e-05,
897
+ "loss": 0.1619,
898
+ "step": 20800
899
+ },
900
+ {
901
+ "epoch": 1.7056530214424952,
902
+ "grad_norm": 0.23795385658740997,
903
+ "learning_rate": 1.0715491396840616e-05,
904
+ "loss": 0.1616,
905
+ "step": 21000
906
+ },
907
+ {
908
+ "epoch": 1.7056530214424952,
909
+ "eval_loss": 0.17426368594169617,
910
+ "eval_runtime": 19.7022,
911
+ "eval_samples_per_second": 51.771,
912
+ "eval_steps_per_second": 0.812,
913
+ "step": 21000
914
+ },
915
+ {
916
+ "epoch": 1.7218973359324237,
917
+ "grad_norm": 0.18130673468112946,
918
+ "learning_rate": 9.584461034811598e-06,
919
+ "loss": 0.1612,
920
+ "step": 21200
921
+ },
922
+ {
923
+ "epoch": 1.7381416504223521,
924
+ "grad_norm": 0.22604148089885712,
925
+ "learning_rate": 8.51349522636773e-06,
926
+ "loss": 0.1611,
927
+ "step": 21400
928
+ },
929
+ {
930
+ "epoch": 1.7543859649122808,
931
+ "grad_norm": 0.21537244319915771,
932
+ "learning_rate": 7.5033054319938455e-06,
933
+ "loss": 0.1627,
934
+ "step": 21600
935
+ },
936
+ {
937
+ "epoch": 1.770630279402209,
938
+ "grad_norm": 0.20729561150074005,
939
+ "learning_rate": 6.554562737652936e-06,
940
+ "loss": 0.1629,
941
+ "step": 21800
942
+ },
943
+ {
944
+ "epoch": 1.7868745938921378,
945
+ "grad_norm": 0.22702686488628387,
946
+ "learning_rate": 5.667897408972511e-06,
947
+ "loss": 0.1604,
948
+ "step": 22000
949
+ },
950
+ {
951
+ "epoch": 1.7868745938921378,
952
+ "eval_loss": 0.17392824590206146,
953
+ "eval_runtime": 19.7328,
954
+ "eval_samples_per_second": 51.69,
955
+ "eval_steps_per_second": 0.811,
956
+ "step": 22000
957
+ },
958
+ {
959
+ "epoch": 1.8031189083820662,
960
+ "grad_norm": 0.2154800146818161,
961
+ "learning_rate": 4.843898472548625e-06,
962
+ "loss": 0.1633,
963
+ "step": 22200
964
+ },
965
+ {
966
+ "epoch": 1.8193632228719947,
967
+ "grad_norm": 0.2301916629076004,
968
+ "learning_rate": 4.083113324645638e-06,
969
+ "loss": 0.1639,
970
+ "step": 22400
971
+ },
972
+ {
973
+ "epoch": 1.8356075373619234,
974
+ "grad_norm": 0.21402500569820404,
975
+ "learning_rate": 3.386047367551759e-06,
976
+ "loss": 0.1634,
977
+ "step": 22600
978
+ },
979
+ {
980
+ "epoch": 1.8518518518518519,
981
+ "grad_norm": 0.208083838224411,
982
+ "learning_rate": 2.753163673831882e-06,
983
+ "loss": 0.1597,
984
+ "step": 22800
985
+ },
986
+ {
987
+ "epoch": 1.8680961663417803,
988
+ "grad_norm": 0.2186596691608429,
989
+ "learning_rate": 2.1848826787006615e-06,
990
+ "loss": 0.1607,
991
+ "step": 23000
992
+ },
993
+ {
994
+ "epoch": 1.8680961663417803,
995
+ "eval_loss": 0.17368420958518982,
996
+ "eval_runtime": 19.6988,
997
+ "eval_samples_per_second": 51.78,
998
+ "eval_steps_per_second": 0.812,
999
+ "step": 23000
1000
+ },
1001
+ {
1002
+ "epoch": 1.884340480831709,
1003
+ "grad_norm": 0.21974900364875793,
1004
+ "learning_rate": 1.6815819007204414e-06,
1005
+ "loss": 0.1608,
1006
+ "step": 23200
1007
+ },
1008
+ {
1009
+ "epoch": 1.9005847953216373,
1010
+ "grad_norm": 0.23063206672668457,
1011
+ "learning_rate": 1.2435956910093472e-06,
1012
+ "loss": 0.1608,
1013
+ "step": 23400
1014
+ },
1015
+ {
1016
+ "epoch": 1.916829109811566,
1017
+ "grad_norm": 0.1983364224433899,
1018
+ "learning_rate": 8.712150111262362e-07,
1019
+ "loss": 0.1633,
1020
+ "step": 23600
1021
+ },
1022
+ {
1023
+ "epoch": 1.9330734243014944,
1024
+ "grad_norm": 0.20724935829639435,
1025
+ "learning_rate": 5.646872397800817e-07,
1026
+ "loss": 0.1587,
1027
+ "step": 23800
1028
+ },
1029
+ {
1030
+ "epoch": 1.949317738791423,
1031
+ "grad_norm": 0.21330738067626953,
1032
+ "learning_rate": 3.2421600849217305e-07,
1033
+ "loss": 0.1631,
1034
+ "step": 24000
1035
+ },
1036
+ {
1037
+ "epoch": 1.949317738791423,
1038
+ "eval_loss": 0.17362003028392792,
1039
+ "eval_runtime": 19.7231,
1040
+ "eval_samples_per_second": 51.716,
1041
+ "eval_steps_per_second": 0.811,
1042
+ "step": 24000
1043
+ },
1044
+ {
1045
+ "epoch": 1.9655620532813516,
1046
+ "grad_norm": 0.23539352416992188,
1047
+ "learning_rate": 1.4996106632028105e-07,
1048
+ "loss": 0.1617,
1049
+ "step": 24200
1050
+ },
1051
+ {
1052
+ "epoch": 1.98180636777128,
1053
+ "grad_norm": 0.19572819769382477,
1054
+ "learning_rate": 4.20381737346931e-08,
1055
+ "loss": 0.1624,
1056
+ "step": 24400
1057
+ },
1058
+ {
1059
+ "epoch": 1.9980506822612085,
1060
+ "grad_norm": 0.2120121866464615,
1061
+ "learning_rate": 5.190257165943457e-10,
1062
+ "loss": 0.1628,
1063
+ "step": 24600
1064
+ },
1065
+ {
1066
+ "epoch": 2.0,
1067
+ "step": 24624,
1068
+ "total_flos": 1.6014138134652518e+19,
1069
+ "train_loss": 0.18958150440024585,
1070
+ "train_runtime": 22595.591,
1071
+ "train_samples_per_second": 34.872,
1072
+ "train_steps_per_second": 1.09
1073
+ }
1074
+ ],
1075
+ "logging_steps": 200,
1076
+ "max_steps": 24624,
1077
+ "num_input_tokens_seen": 0,
1078
+ "num_train_epochs": 2,
1079
+ "save_steps": 0,
1080
+ "stateful_callbacks": {
1081
+ "TrainerControl": {
1082
+ "args": {
1083
+ "should_epoch_stop": false,
1084
+ "should_evaluate": false,
1085
+ "should_log": false,
1086
+ "should_save": true,
1087
+ "should_training_stop": true
1088
+ },
1089
+ "attributes": {}
1090
+ }
1091
+ },
1092
+ "total_flos": 1.6014138134652518e+19,
1093
+ "train_batch_size": 32,
1094
+ "trial_name": null,
1095
+ "trial_params": null
1096
+ }
nl_tasks/exprep/run_ex30/ft/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "drop_out": 0.0,
6
+ "inference_mode": false,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_rotations": 1,
10
+ "peft_type": "ROTATION",
11
+ "r": 16,
12
+ "revision": null,
13
+ "target_modules": [
14
+ "v_proj",
15
+ "q_proj"
16
+ ],
17
+ "target_modules_to_skip": null,
18
+ "task_type": "CAUSAL_LM"
19
+ }
nl_tasks/exprep/run_ex30/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
nl_tasks/exprep/run_ex30/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exprep/run_ex30/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exprep/run_ex30/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 512,
37
+ "pad_token": "<unk>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
nl_tasks/exprep/run_ex30/ft2/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "drop_out": 0.0,
6
+ "inference_mode": true,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_rotations": 1,
10
+ "peft_type": "ROTATION",
11
+ "r": 16,
12
+ "revision": null,
13
+ "target_modules": [
14
+ "v_proj",
15
+ "q_proj"
16
+ ],
17
+ "target_modules_to_skip": null,
18
+ "task_type": "CAUSAL_LM"
19
+ }
nl_tasks/exprep/run_ex30/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:93c119ca2562922eb094806c55580b346b76da571a6afb000de0de2804526457
3
+ size 33602915
nl_tasks/exprep/run_ex30/output.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+ MATH math MAX TOKEN = 1408, length==== 5000, math acc %====, 7.7
3
+
4
+ gsm8k MAX TOKEN = 1024, length==== 1319, gsm8k acc %====, 47.83927217589083
nl_tasks/exprep/run_ex30/trainer_state.json ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 100,
7
+ "global_step": 2500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02,
14
+ "grad_norm": 0.2035895735025406,
15
+ "learning_rate": 0.0007998180972402738,
16
+ "loss": 0.4427,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.04,
21
+ "grad_norm": 0.18317027390003204,
22
+ "learning_rate": 0.0007992419381164945,
23
+ "loss": 0.3411,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.06,
28
+ "grad_norm": 0.1971741020679474,
29
+ "learning_rate": 0.0007982717754008577,
30
+ "loss": 0.3349,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 0.08,
35
+ "grad_norm": 0.19311119616031647,
36
+ "learning_rate": 0.0007969085665268344,
37
+ "loss": 0.3144,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 0.1,
42
+ "grad_norm": 0.20192188024520874,
43
+ "learning_rate": 0.0007951536568170009,
44
+ "loss": 0.3043,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 0.12,
49
+ "grad_norm": 0.20001259446144104,
50
+ "learning_rate": 0.0007930087781553683,
51
+ "loss": 0.2996,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 0.14,
56
+ "grad_norm": 0.18023760616779327,
57
+ "learning_rate": 0.0007904760472782212,
58
+ "loss": 0.2956,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 0.16,
63
+ "grad_norm": 0.22623836994171143,
64
+ "learning_rate": 0.0007875579636851548,
65
+ "loss": 0.2975,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 0.18,
70
+ "grad_norm": 0.19716477394104004,
71
+ "learning_rate": 0.0007842574071723712,
72
+ "loss": 0.2929,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 0.2,
77
+ "grad_norm": 0.20640788972377777,
78
+ "learning_rate": 0.0007805776349906676,
79
+ "loss": 0.2872,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 0.22,
84
+ "grad_norm": 0.20449821650981903,
85
+ "learning_rate": 0.0007765222786309228,
86
+ "loss": 0.2867,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 0.24,
91
+ "grad_norm": 0.2329019457101822,
92
+ "learning_rate": 0.0007720953402402549,
93
+ "loss": 0.2964,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 0.26,
98
+ "grad_norm": 0.18425531685352325,
99
+ "learning_rate": 0.0007673011886723848,
100
+ "loss": 0.2887,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 0.28,
105
+ "grad_norm": 0.26162323355674744,
106
+ "learning_rate": 0.0007621445551761047,
107
+ "loss": 0.2889,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 0.3,
112
+ "grad_norm": 0.2894303798675537,
113
+ "learning_rate": 0.0007566305287261081,
114
+ "loss": 0.2827,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 0.32,
119
+ "grad_norm": 0.25710630416870117,
120
+ "learning_rate": 0.0007507645510007842,
121
+ "loss": 0.2834,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 0.34,
126
+ "grad_norm": 0.18078552186489105,
127
+ "learning_rate": 0.00074455241101194,
128
+ "loss": 0.272,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 0.36,
133
+ "grad_norm": 0.2192271500825882,
134
+ "learning_rate": 0.0007380002393917437,
135
+ "loss": 0.2734,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 0.38,
140
+ "grad_norm": 0.21021409332752228,
141
+ "learning_rate": 0.0007311145023425311,
142
+ "loss": 0.27,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 0.4,
147
+ "grad_norm": 0.2234291136264801,
148
+ "learning_rate": 0.000723901995255445,
149
+ "loss": 0.2681,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 0.42,
154
+ "grad_norm": 0.25978487730026245,
155
+ "learning_rate": 0.0007163698360042034,
156
+ "loss": 0.2704,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 0.44,
161
+ "grad_norm": 0.21893468499183655,
162
+ "learning_rate": 0.0007085254579206188,
163
+ "loss": 0.2586,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 0.46,
168
+ "grad_norm": 0.2398931384086609,
169
+ "learning_rate": 0.0007003766024587967,
170
+ "loss": 0.2755,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 0.48,
175
+ "grad_norm": 0.20067504048347473,
176
+ "learning_rate": 0.0006919313115552542,
177
+ "loss": 0.2605,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 0.5,
182
+ "grad_norm": 0.2185235172510147,
183
+ "learning_rate": 0.0006831979196925012,
184
+ "loss": 0.2745,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 0.52,
189
+ "grad_norm": 0.18560826778411865,
190
+ "learning_rate": 0.0006741850456739107,
191
+ "loss": 0.2645,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 0.54,
196
+ "grad_norm": 0.16883058845996857,
197
+ "learning_rate": 0.0006649015841180021,
198
+ "loss": 0.2541,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 0.56,
203
+ "grad_norm": 0.21809938549995422,
204
+ "learning_rate": 0.0006553566966805246,
205
+ "loss": 0.2569,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 0.58,
210
+ "grad_norm": 0.22009630501270294,
211
+ "learning_rate": 0.0006455598030130111,
212
+ "loss": 0.2627,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 0.6,
217
+ "grad_norm": 0.18782204389572144,
218
+ "learning_rate": 0.000635520571466718,
219
+ "loss": 0.259,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 0.62,
224
+ "grad_norm": 0.19043266773223877,
225
+ "learning_rate": 0.0006252489095511306,
226
+ "loss": 0.2472,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 0.64,
231
+ "grad_norm": 0.21763938665390015,
232
+ "learning_rate": 0.0006147549541564466,
233
+ "loss": 0.2519,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 0.66,
238
+ "grad_norm": 0.2114492952823639,
239
+ "learning_rate": 0.0006040490615496899,
240
+ "loss": 0.2459,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 0.68,
245
+ "grad_norm": 0.20547744631767273,
246
+ "learning_rate": 0.0005931417971543255,
247
+ "loss": 0.2476,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 0.7,
252
+ "grad_norm": 0.21233369410037994,
253
+ "learning_rate": 0.0005820439251234615,
254
+ "loss": 0.2551,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 0.72,
259
+ "grad_norm": 0.1882777214050293,
260
+ "learning_rate": 0.0005707663977169301,
261
+ "loss": 0.2503,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 0.74,
266
+ "grad_norm": 0.21111281216144562,
267
+ "learning_rate": 0.0005593203444927291,
268
+ "loss": 0.2461,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 0.76,
273
+ "grad_norm": 0.18701361119747162,
274
+ "learning_rate": 0.0005477170613234922,
275
+ "loss": 0.2436,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 0.78,
280
+ "grad_norm": 0.1812943071126938,
281
+ "learning_rate": 0.0005359679992488259,
282
+ "loss": 0.2399,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 0.8,
287
+ "grad_norm": 0.2267715334892273,
288
+ "learning_rate": 0.0005240847531745164,
289
+ "loss": 0.2459,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 0.82,
294
+ "grad_norm": 0.2205505669116974,
295
+ "learning_rate": 0.0005120790504297575,
296
+ "loss": 0.2456,
297
+ "step": 1025
298
+ },
299
+ {
300
+ "epoch": 0.84,
301
+ "grad_norm": 0.19082708656787872,
302
+ "learning_rate": 0.0004999627391936922,
303
+ "loss": 0.236,
304
+ "step": 1050
305
+ },
306
+ {
307
+ "epoch": 0.86,
308
+ "grad_norm": 0.22549352049827576,
309
+ "learning_rate": 0.00048774777680269044,
310
+ "loss": 0.2329,
311
+ "step": 1075
312
+ },
313
+ {
314
+ "epoch": 0.88,
315
+ "grad_norm": 0.183608740568161,
316
+ "learning_rate": 0.00047544621794990197,
317
+ "loss": 0.2326,
318
+ "step": 1100
319
+ },
320
+ {
321
+ "epoch": 0.9,
322
+ "grad_norm": 0.19716492295265198,
323
+ "learning_rate": 0.0004630702027887291,
324
+ "loss": 0.2423,
325
+ "step": 1125
326
+ },
327
+ {
328
+ "epoch": 0.92,
329
+ "grad_norm": 0.18563736975193024,
330
+ "learning_rate": 0.0004506319449519617,
331
+ "loss": 0.2348,
332
+ "step": 1150
333
+ },
334
+ {
335
+ "epoch": 0.94,
336
+ "grad_norm": 0.19331008195877075,
337
+ "learning_rate": 0.00043814371949839543,
338
+ "loss": 0.2375,
339
+ "step": 1175
340
+ },
341
+ {
342
+ "epoch": 0.96,
343
+ "grad_norm": 0.1977284848690033,
344
+ "learning_rate": 0.0004256178507988314,
345
+ "loss": 0.2375,
346
+ "step": 1200
347
+ },
348
+ {
349
+ "epoch": 0.98,
350
+ "grad_norm": 0.20143352448940277,
351
+ "learning_rate": 0.00041306670037340957,
352
+ "loss": 0.2385,
353
+ "step": 1225
354
+ },
355
+ {
356
+ "epoch": 1.0,
357
+ "grad_norm": 0.1983175277709961,
358
+ "learning_rate": 0.00040050265469228103,
359
+ "loss": 0.2283,
360
+ "step": 1250
361
+ },
362
+ {
363
+ "epoch": 1.02,
364
+ "grad_norm": 0.18391765654087067,
365
+ "learning_rate": 0.00038793811295165725,
366
+ "loss": 0.197,
367
+ "step": 1275
368
+ },
369
+ {
370
+ "epoch": 1.04,
371
+ "grad_norm": 0.16245220601558685,
372
+ "learning_rate": 0.0003753854748373012,
373
+ "loss": 0.204,
374
+ "step": 1300
375
+ },
376
+ {
377
+ "epoch": 1.06,
378
+ "grad_norm": 0.20023547112941742,
379
+ "learning_rate": 0.00036285712828753295,
380
+ "loss": 0.199,
381
+ "step": 1325
382
+ },
383
+ {
384
+ "epoch": 1.08,
385
+ "grad_norm": 0.19259585440158844,
386
+ "learning_rate": 0.0003503654372678317,
387
+ "loss": 0.2051,
388
+ "step": 1350
389
+ },
390
+ {
391
+ "epoch": 1.1,
392
+ "grad_norm": 0.1709466576576233,
393
+ "learning_rate": 0.0003379227295690936,
394
+ "loss": 0.2091,
395
+ "step": 1375
396
+ },
397
+ {
398
+ "epoch": 1.12,
399
+ "grad_norm": 0.1860114187002182,
400
+ "learning_rate": 0.0003255412846415912,
401
+ "loss": 0.2073,
402
+ "step": 1400
403
+ },
404
+ {
405
+ "epoch": 1.1400000000000001,
406
+ "grad_norm": 0.18231141567230225,
407
+ "learning_rate": 0.0003132333214766379,
408
+ "loss": 0.1989,
409
+ "step": 1425
410
+ },
411
+ {
412
+ "epoch": 1.16,
413
+ "grad_norm": 0.18689118325710297,
414
+ "learning_rate": 0.0003010109865479191,
415
+ "loss": 0.2046,
416
+ "step": 1450
417
+ },
418
+ {
419
+ "epoch": 1.18,
420
+ "grad_norm": 0.17233537137508392,
421
+ "learning_rate": 0.0002888863418243891,
422
+ "loss": 0.2024,
423
+ "step": 1475
424
+ },
425
+ {
426
+ "epoch": 1.2,
427
+ "grad_norm": 0.20785579085350037,
428
+ "learning_rate": 0.0002768713528665639,
429
+ "loss": 0.2026,
430
+ "step": 1500
431
+ },
432
+ {
433
+ "epoch": 1.22,
434
+ "grad_norm": 0.1873149424791336,
435
+ "learning_rate": 0.0002649778770179578,
436
+ "loss": 0.2056,
437
+ "step": 1525
438
+ },
439
+ {
440
+ "epoch": 1.24,
441
+ "grad_norm": 0.16736917197704315,
442
+ "learning_rate": 0.00025321765170331634,
443
+ "loss": 0.1975,
444
+ "step": 1550
445
+ },
446
+ {
447
+ "epoch": 1.26,
448
+ "grad_norm": 0.17514999210834503,
449
+ "learning_rate": 0.00024160228284519604,
450
+ "loss": 0.1966,
451
+ "step": 1575
452
+ },
453
+ {
454
+ "epoch": 1.28,
455
+ "grad_norm": 0.20123517513275146,
456
+ "learning_rate": 0.0002301432334103195,
457
+ "loss": 0.1913,
458
+ "step": 1600
459
+ },
460
+ {
461
+ "epoch": 1.3,
462
+ "grad_norm": 0.1817624270915985,
463
+ "learning_rate": 0.00021885181209701105,
464
+ "loss": 0.1928,
465
+ "step": 1625
466
+ },
467
+ {
468
+ "epoch": 1.32,
469
+ "grad_norm": 0.2657231390476227,
470
+ "learning_rate": 0.0002077391621748769,
471
+ "loss": 0.1871,
472
+ "step": 1650
473
+ },
474
+ {
475
+ "epoch": 1.34,
476
+ "grad_norm": 0.1619703769683838,
477
+ "learning_rate": 0.0001968162504877441,
478
+ "loss": 0.1914,
479
+ "step": 1675
480
+ },
481
+ {
482
+ "epoch": 1.3599999999999999,
483
+ "grad_norm": 0.17354673147201538,
484
+ "learning_rate": 0.0001860938566307083,
485
+ "loss": 0.1895,
486
+ "step": 1700
487
+ },
488
+ {
489
+ "epoch": 1.38,
490
+ "grad_norm": 0.1780649721622467,
491
+ "learning_rate": 0.00017558256231197655,
492
+ "loss": 0.1911,
493
+ "step": 1725
494
+ },
495
+ {
496
+ "epoch": 1.4,
497
+ "grad_norm": 0.19166697561740875,
498
+ "learning_rate": 0.00016529274090999708,
499
+ "loss": 0.1905,
500
+ "step": 1750
501
+ },
502
+ {
503
+ "epoch": 1.42,
504
+ "grad_norm": 0.17973805963993073,
505
+ "learning_rate": 0.00015523454723618882,
506
+ "loss": 0.1922,
507
+ "step": 1775
508
+ },
509
+ {
510
+ "epoch": 1.44,
511
+ "grad_norm": 0.19029191136360168,
512
+ "learning_rate": 0.0001454179075133671,
513
+ "loss": 0.1801,
514
+ "step": 1800
515
+ },
516
+ {
517
+ "epoch": 1.46,
518
+ "grad_norm": 0.1597386747598648,
519
+ "learning_rate": 0.00013585250957976128,
520
+ "loss": 0.1873,
521
+ "step": 1825
522
+ },
523
+ {
524
+ "epoch": 1.48,
525
+ "grad_norm": 0.21397747099399567,
526
+ "learning_rate": 0.00012654779332828725,
527
+ "loss": 0.1891,
528
+ "step": 1850
529
+ },
530
+ {
531
+ "epoch": 1.5,
532
+ "grad_norm": 0.18154075741767883,
533
+ "learning_rate": 0.00011751294139051308,
534
+ "loss": 0.1898,
535
+ "step": 1875
536
+ },
537
+ {
538
+ "epoch": 1.52,
539
+ "grad_norm": 0.19641812145709991,
540
+ "learning_rate": 0.00010875687007451065,
541
+ "loss": 0.1883,
542
+ "step": 1900
543
+ },
544
+ {
545
+ "epoch": 1.54,
546
+ "grad_norm": 0.1968475729227066,
547
+ "learning_rate": 0.00010028822056553551,
548
+ "loss": 0.187,
549
+ "step": 1925
550
+ },
551
+ {
552
+ "epoch": 1.56,
553
+ "grad_norm": 0.1694275140762329,
554
+ "learning_rate": 9.211535039822043e-05,
555
+ "loss": 0.1776,
556
+ "step": 1950
557
+ },
558
+ {
559
+ "epoch": 1.58,
560
+ "grad_norm": 0.21530286967754364,
561
+ "learning_rate": 8.424632520869823e-05,
562
+ "loss": 0.1902,
563
+ "step": 1975
564
+ },
565
+ {
566
+ "epoch": 1.6,
567
+ "grad_norm": 0.1810137927532196,
568
+ "learning_rate": 7.668891077479186e-05,
569
+ "loss": 0.1885,
570
+ "step": 2000
571
+ },
572
+ {
573
+ "epoch": 1.62,
574
+ "grad_norm": 0.1794368177652359,
575
+ "learning_rate": 6.945056535212984e-05,
576
+ "loss": 0.1765,
577
+ "step": 2025
578
+ },
579
+ {
580
+ "epoch": 1.6400000000000001,
581
+ "grad_norm": 0.1721193790435791,
582
+ "learning_rate": 6.253843231374847e-05,
583
+ "loss": 0.1851,
584
+ "step": 2050
585
+ },
586
+ {
587
+ "epoch": 1.6600000000000001,
588
+ "grad_norm": 0.16067421436309814,
589
+ "learning_rate": 5.5959333100444563e-05,
590
+ "loss": 0.188,
591
+ "step": 2075
592
+ },
593
+ {
594
+ "epoch": 1.6800000000000002,
595
+ "grad_norm": 0.189836785197258,
596
+ "learning_rate": 4.971976048883749e-05,
597
+ "loss": 0.1857,
598
+ "step": 2100
599
+ },
600
+ {
601
+ "epoch": 1.7,
602
+ "grad_norm": 0.1649639755487442,
603
+ "learning_rate": 4.3825872183782046e-05,
604
+ "loss": 0.1844,
605
+ "step": 2125
606
+ },
607
+ {
608
+ "epoch": 1.72,
609
+ "grad_norm": 0.16502904891967773,
610
+ "learning_rate": 3.8283484741457754e-05,
611
+ "loss": 0.1776,
612
+ "step": 2150
613
+ },
614
+ {
615
+ "epoch": 1.74,
616
+ "grad_norm": 0.16162428259849548,
617
+ "learning_rate": 3.3098067829129405e-05,
618
+ "loss": 0.1927,
619
+ "step": 2175
620
+ },
621
+ {
622
+ "epoch": 1.76,
623
+ "grad_norm": 0.17041239142417908,
624
+ "learning_rate": 2.827473882724667e-05,
625
+ "loss": 0.1844,
626
+ "step": 2200
627
+ },
628
+ {
629
+ "epoch": 1.78,
630
+ "grad_norm": 0.17938221991062164,
631
+ "learning_rate": 2.381825777920681e-05,
632
+ "loss": 0.1779,
633
+ "step": 2225
634
+ },
635
+ {
636
+ "epoch": 1.8,
637
+ "grad_norm": 0.1670071929693222,
638
+ "learning_rate": 1.9733022693766732e-05,
639
+ "loss": 0.1864,
640
+ "step": 2250
641
+ },
642
+ {
643
+ "epoch": 1.8199999999999998,
644
+ "grad_norm": 0.1783543825149536,
645
+ "learning_rate": 1.602306520473933e-05,
646
+ "loss": 0.1786,
647
+ "step": 2275
648
+ },
649
+ {
650
+ "epoch": 1.8399999999999999,
651
+ "grad_norm": 0.29920652508735657,
652
+ "learning_rate": 1.2692046592257711e-05,
653
+ "loss": 0.1826,
654
+ "step": 2300
655
+ },
656
+ {
657
+ "epoch": 1.8599999999999999,
658
+ "grad_norm": 0.18573811650276184,
659
+ "learning_rate": 9.743254169533612e-06,
660
+ "loss": 0.1853,
661
+ "step": 2325
662
+ },
663
+ {
664
+ "epoch": 1.88,
665
+ "grad_norm": 0.18126627802848816,
666
+ "learning_rate": 7.1795980386763166e-06,
667
+ "loss": 0.177,
668
+ "step": 2350
669
+ },
670
+ {
671
+ "epoch": 1.9,
672
+ "grad_norm": 0.20497091114521027,
673
+ "learning_rate": 5.003608218772949e-06,
674
+ "loss": 0.2003,
675
+ "step": 2375
676
+ },
677
+ {
678
+ "epoch": 1.92,
679
+ "grad_norm": 0.20054228603839874,
680
+ "learning_rate": 3.2174321490654023e-06,
681
+ "loss": 0.1859,
682
+ "step": 2400
683
+ },
684
+ {
685
+ "epoch": 1.94,
686
+ "grad_norm": 0.2017177790403366,
687
+ "learning_rate": 1.8228325696867388e-06,
688
+ "loss": 0.1799,
689
+ "step": 2425
690
+ },
691
+ {
692
+ "epoch": 1.96,
693
+ "grad_norm": 0.1703929454088211,
694
+ "learning_rate": 8.211857820497936e-07,
695
+ "loss": 0.1817,
696
+ "step": 2450
697
+ },
698
+ {
699
+ "epoch": 1.98,
700
+ "grad_norm": 0.17276810109615326,
701
+ "learning_rate": 2.134802906036626e-07,
702
+ "loss": 0.1864,
703
+ "step": 2475
704
+ },
705
+ {
706
+ "epoch": 2.0,
707
+ "grad_norm": 0.1665695160627365,
708
+ "learning_rate": 3.158272992909161e-10,
709
+ "loss": 0.1824,
710
+ "step": 2500
711
+ },
712
+ {
713
+ "epoch": 2.0,
714
+ "step": 2500,
715
+ "total_flos": 1.62588235137024e+18,
716
+ "train_loss": 0.23009043865203857,
717
+ "train_runtime": 2201.1437,
718
+ "train_samples_per_second": 36.345,
719
+ "train_steps_per_second": 1.136
720
+ }
721
+ ],
722
+ "logging_steps": 25,
723
+ "max_steps": 2500,
724
+ "num_input_tokens_seen": 0,
725
+ "num_train_epochs": 2,
726
+ "save_steps": 0,
727
+ "stateful_callbacks": {
728
+ "TrainerControl": {
729
+ "args": {
730
+ "should_epoch_stop": false,
731
+ "should_evaluate": false,
732
+ "should_log": false,
733
+ "should_save": false,
734
+ "should_training_stop": false
735
+ },
736
+ "attributes": {}
737
+ }
738
+ },
739
+ "total_flos": 1.62588235137024e+18,
740
+ "train_batch_size": 32,
741
+ "trial_name": null,
742
+ "trial_params": null
743
+ }
nl_tasks/exprep/run_ex31/ft/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "drop_out": 0.0,
6
+ "inference_mode": false,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_rotations": 1,
10
+ "peft_type": "ROTATION",
11
+ "r": 16,
12
+ "revision": null,
13
+ "target_modules": [
14
+ "q_proj",
15
+ "v_proj"
16
+ ],
17
+ "target_modules_to_skip": null,
18
+ "task_type": "CAUSAL_LM"
19
+ }
nl_tasks/exprep/run_ex31/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
nl_tasks/exprep/run_ex31/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exprep/run_ex31/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exprep/run_ex31/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 512,
37
+ "pad_token": "<unk>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
nl_tasks/exprep/run_ex31/ft2/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "drop_out": 0.0,
6
+ "inference_mode": true,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_rotations": 1,
10
+ "peft_type": "ROTATION",
11
+ "r": 16,
12
+ "revision": null,
13
+ "target_modules": [
14
+ "q_proj",
15
+ "v_proj"
16
+ ],
17
+ "target_modules_to_skip": null,
18
+ "task_type": "CAUSAL_LM"
19
+ }
nl_tasks/exprep/run_ex31/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3a80f922e9595d6db3071d39c2d423c36f10395f12d866ebc7c05f1e234375fc
3
+ size 33602915
nl_tasks/exprep/run_ex31/output.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+ MATH math MAX TOKEN = 1408, length==== 5000, math acc %====, 7.86
3
+
4
+ gsm8k MAX TOKEN = 1024, length==== 1319, gsm8k acc %====, 49.73464746019712
nl_tasks/exprep/run_ex31/trainer_state.json ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 100,
7
+ "global_step": 2500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02,
14
+ "grad_norm": 0.6388801336288452,
15
+ "learning_rate": 0.0007998180972402738,
16
+ "loss": 0.4521,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.04,
21
+ "grad_norm": 0.1956762820482254,
22
+ "learning_rate": 0.0007992419381164945,
23
+ "loss": 0.3443,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.06,
28
+ "grad_norm": 0.20113986730575562,
29
+ "learning_rate": 0.0007982717754008577,
30
+ "loss": 0.3332,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 0.08,
35
+ "grad_norm": 0.19363462924957275,
36
+ "learning_rate": 0.0007969085665268344,
37
+ "loss": 0.3138,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 0.1,
42
+ "grad_norm": 0.19487079977989197,
43
+ "learning_rate": 0.0007951536568170009,
44
+ "loss": 0.3055,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 0.12,
49
+ "grad_norm": 0.19735132157802582,
50
+ "learning_rate": 0.0007930087781553683,
51
+ "loss": 0.2989,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 0.14,
56
+ "grad_norm": 0.18417081236839294,
57
+ "learning_rate": 0.0007904760472782212,
58
+ "loss": 0.2963,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 0.16,
63
+ "grad_norm": 0.22813156247138977,
64
+ "learning_rate": 0.0007875579636851548,
65
+ "loss": 0.2967,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 0.18,
70
+ "grad_norm": 0.20101189613342285,
71
+ "learning_rate": 0.0007842574071723712,
72
+ "loss": 0.2923,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 0.2,
77
+ "grad_norm": 0.2177446186542511,
78
+ "learning_rate": 0.0007805776349906676,
79
+ "loss": 0.2868,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 0.22,
84
+ "grad_norm": 0.1931687593460083,
85
+ "learning_rate": 0.0007765222786309228,
86
+ "loss": 0.2866,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 0.24,
91
+ "grad_norm": 0.20569033920764923,
92
+ "learning_rate": 0.0007720953402402549,
93
+ "loss": 0.2971,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 0.26,
98
+ "grad_norm": 0.19738474488258362,
99
+ "learning_rate": 0.0007673011886723848,
100
+ "loss": 0.2892,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 0.28,
105
+ "grad_norm": 0.2222980111837387,
106
+ "learning_rate": 0.0007621445551761047,
107
+ "loss": 0.2899,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 0.3,
112
+ "grad_norm": 0.20760799944400787,
113
+ "learning_rate": 0.0007566305287261081,
114
+ "loss": 0.2817,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 0.32,
119
+ "grad_norm": 0.22088174521923065,
120
+ "learning_rate": 0.0007507645510007842,
121
+ "loss": 0.2826,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 0.34,
126
+ "grad_norm": 0.19174650311470032,
127
+ "learning_rate": 0.00074455241101194,
128
+ "loss": 0.2719,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 0.36,
133
+ "grad_norm": 0.23881922662258148,
134
+ "learning_rate": 0.0007380002393917437,
135
+ "loss": 0.2733,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 0.38,
140
+ "grad_norm": 0.21565735340118408,
141
+ "learning_rate": 0.0007311145023425311,
142
+ "loss": 0.2703,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 0.4,
147
+ "grad_norm": 0.23798316717147827,
148
+ "learning_rate": 0.000723901995255445,
149
+ "loss": 0.2687,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 0.42,
154
+ "grad_norm": 0.19707560539245605,
155
+ "learning_rate": 0.0007163698360042034,
156
+ "loss": 0.2697,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 0.44,
161
+ "grad_norm": 0.2223428189754486,
162
+ "learning_rate": 0.0007085254579206188,
163
+ "loss": 0.2585,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 0.46,
168
+ "grad_norm": 0.21256311237812042,
169
+ "learning_rate": 0.0007003766024587967,
170
+ "loss": 0.2765,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 0.48,
175
+ "grad_norm": 0.198720782995224,
176
+ "learning_rate": 0.0006919313115552542,
177
+ "loss": 0.2606,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 0.5,
182
+ "grad_norm": 0.2130541056394577,
183
+ "learning_rate": 0.0006831979196925012,
184
+ "loss": 0.2756,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 0.52,
189
+ "grad_norm": 0.18196836113929749,
190
+ "learning_rate": 0.0006741850456739107,
191
+ "loss": 0.2645,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 0.54,
196
+ "grad_norm": 0.17608144879341125,
197
+ "learning_rate": 0.0006649015841180021,
198
+ "loss": 0.2543,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 0.56,
203
+ "grad_norm": 0.21213066577911377,
204
+ "learning_rate": 0.0006553566966805246,
205
+ "loss": 0.2566,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 0.58,
210
+ "grad_norm": 0.2531004548072815,
211
+ "learning_rate": 0.0006455598030130111,
212
+ "loss": 0.2627,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 0.6,
217
+ "grad_norm": 0.2586396336555481,
218
+ "learning_rate": 0.000635520571466718,
219
+ "loss": 0.2585,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 0.62,
224
+ "grad_norm": 0.20283308625221252,
225
+ "learning_rate": 0.0006252489095511306,
226
+ "loss": 0.2476,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 0.64,
231
+ "grad_norm": 0.23439405858516693,
232
+ "learning_rate": 0.0006147549541564466,
233
+ "loss": 0.2523,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 0.66,
238
+ "grad_norm": 0.2257372885942459,
239
+ "learning_rate": 0.0006040490615496899,
240
+ "loss": 0.2468,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 0.68,
245
+ "grad_norm": 0.22245371341705322,
246
+ "learning_rate": 0.0005931417971543255,
247
+ "loss": 0.2476,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 0.7,
252
+ "grad_norm": 0.1757330298423767,
253
+ "learning_rate": 0.0005820439251234615,
254
+ "loss": 0.2548,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 0.72,
259
+ "grad_norm": 0.19649666547775269,
260
+ "learning_rate": 0.0005707663977169301,
261
+ "loss": 0.2493,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 0.74,
266
+ "grad_norm": 0.20930099487304688,
267
+ "learning_rate": 0.0005593203444927291,
268
+ "loss": 0.2475,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 0.76,
273
+ "grad_norm": 0.19596275687217712,
274
+ "learning_rate": 0.0005477170613234922,
275
+ "loss": 0.2435,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 0.78,
280
+ "grad_norm": 0.18458303809165955,
281
+ "learning_rate": 0.0005359679992488259,
282
+ "loss": 0.2401,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 0.8,
287
+ "grad_norm": 0.2254686802625656,
288
+ "learning_rate": 0.0005240847531745164,
289
+ "loss": 0.2467,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 0.82,
294
+ "grad_norm": 0.22228467464447021,
295
+ "learning_rate": 0.0005120790504297575,
296
+ "loss": 0.246,
297
+ "step": 1025
298
+ },
299
+ {
300
+ "epoch": 0.84,
301
+ "grad_norm": 0.19225908815860748,
302
+ "learning_rate": 0.0004999627391936922,
303
+ "loss": 0.2365,
304
+ "step": 1050
305
+ },
306
+ {
307
+ "epoch": 0.86,
308
+ "grad_norm": 0.2088271975517273,
309
+ "learning_rate": 0.00048774777680269044,
310
+ "loss": 0.2327,
311
+ "step": 1075
312
+ },
313
+ {
314
+ "epoch": 0.88,
315
+ "grad_norm": 0.1883237510919571,
316
+ "learning_rate": 0.00047544621794990197,
317
+ "loss": 0.2334,
318
+ "step": 1100
319
+ },
320
+ {
321
+ "epoch": 0.9,
322
+ "grad_norm": 0.18923914432525635,
323
+ "learning_rate": 0.0004630702027887291,
324
+ "loss": 0.2421,
325
+ "step": 1125
326
+ },
327
+ {
328
+ "epoch": 0.92,
329
+ "grad_norm": 0.1844753921031952,
330
+ "learning_rate": 0.0004506319449519617,
331
+ "loss": 0.2356,
332
+ "step": 1150
333
+ },
334
+ {
335
+ "epoch": 0.94,
336
+ "grad_norm": 0.20344331860542297,
337
+ "learning_rate": 0.00043814371949839543,
338
+ "loss": 0.2384,
339
+ "step": 1175
340
+ },
341
+ {
342
+ "epoch": 0.96,
343
+ "grad_norm": 0.1936650276184082,
344
+ "learning_rate": 0.0004256178507988314,
345
+ "loss": 0.2373,
346
+ "step": 1200
347
+ },
348
+ {
349
+ "epoch": 0.98,
350
+ "grad_norm": 0.21648016571998596,
351
+ "learning_rate": 0.00041306670037340957,
352
+ "loss": 0.2381,
353
+ "step": 1225
354
+ },
355
+ {
356
+ "epoch": 1.0,
357
+ "grad_norm": 0.20282672345638275,
358
+ "learning_rate": 0.00040050265469228103,
359
+ "loss": 0.229,
360
+ "step": 1250
361
+ },
362
+ {
363
+ "epoch": 1.02,
364
+ "grad_norm": 0.1919662058353424,
365
+ "learning_rate": 0.00038793811295165725,
366
+ "loss": 0.1969,
367
+ "step": 1275
368
+ },
369
+ {
370
+ "epoch": 1.04,
371
+ "grad_norm": 0.15275239944458008,
372
+ "learning_rate": 0.0003753854748373012,
373
+ "loss": 0.2049,
374
+ "step": 1300
375
+ },
376
+ {
377
+ "epoch": 1.06,
378
+ "grad_norm": 0.18093539774417877,
379
+ "learning_rate": 0.00036285712828753295,
380
+ "loss": 0.2007,
381
+ "step": 1325
382
+ },
383
+ {
384
+ "epoch": 1.08,
385
+ "grad_norm": 0.18538235127925873,
386
+ "learning_rate": 0.0003503654372678317,
387
+ "loss": 0.2044,
388
+ "step": 1350
389
+ },
390
+ {
391
+ "epoch": 1.1,
392
+ "grad_norm": 0.16673167049884796,
393
+ "learning_rate": 0.0003379227295690936,
394
+ "loss": 0.2087,
395
+ "step": 1375
396
+ },
397
+ {
398
+ "epoch": 1.12,
399
+ "grad_norm": 0.18630419671535492,
400
+ "learning_rate": 0.0003255412846415912,
401
+ "loss": 0.2063,
402
+ "step": 1400
403
+ },
404
+ {
405
+ "epoch": 1.1400000000000001,
406
+ "grad_norm": 0.18694503605365753,
407
+ "learning_rate": 0.0003132333214766379,
408
+ "loss": 0.1984,
409
+ "step": 1425
410
+ },
411
+ {
412
+ "epoch": 1.16,
413
+ "grad_norm": 0.20606280863285065,
414
+ "learning_rate": 0.0003010109865479191,
415
+ "loss": 0.204,
416
+ "step": 1450
417
+ },
418
+ {
419
+ "epoch": 1.18,
420
+ "grad_norm": 0.16723816096782684,
421
+ "learning_rate": 0.0002888863418243891,
422
+ "loss": 0.2022,
423
+ "step": 1475
424
+ },
425
+ {
426
+ "epoch": 1.2,
427
+ "grad_norm": 0.2071099728345871,
428
+ "learning_rate": 0.0002768713528665639,
429
+ "loss": 0.2024,
430
+ "step": 1500
431
+ },
432
+ {
433
+ "epoch": 1.22,
434
+ "grad_norm": 0.20365071296691895,
435
+ "learning_rate": 0.0002649778770179578,
436
+ "loss": 0.2057,
437
+ "step": 1525
438
+ },
439
+ {
440
+ "epoch": 1.24,
441
+ "grad_norm": 0.16362306475639343,
442
+ "learning_rate": 0.00025321765170331634,
443
+ "loss": 0.1974,
444
+ "step": 1550
445
+ },
446
+ {
447
+ "epoch": 1.26,
448
+ "grad_norm": 0.1981133073568344,
449
+ "learning_rate": 0.00024160228284519604,
450
+ "loss": 0.1961,
451
+ "step": 1575
452
+ },
453
+ {
454
+ "epoch": 1.28,
455
+ "grad_norm": 0.16871990263462067,
456
+ "learning_rate": 0.0002301432334103195,
457
+ "loss": 0.192,
458
+ "step": 1600
459
+ },
460
+ {
461
+ "epoch": 1.3,
462
+ "grad_norm": 0.18189150094985962,
463
+ "learning_rate": 0.00021885181209701105,
464
+ "loss": 0.1929,
465
+ "step": 1625
466
+ },
467
+ {
468
+ "epoch": 1.32,
469
+ "grad_norm": 0.19691045582294464,
470
+ "learning_rate": 0.0002077391621748769,
471
+ "loss": 0.1877,
472
+ "step": 1650
473
+ },
474
+ {
475
+ "epoch": 1.34,
476
+ "grad_norm": 0.15086142718791962,
477
+ "learning_rate": 0.0001968162504877441,
478
+ "loss": 0.1907,
479
+ "step": 1675
480
+ },
481
+ {
482
+ "epoch": 1.3599999999999999,
483
+ "grad_norm": 0.178297221660614,
484
+ "learning_rate": 0.0001860938566307083,
485
+ "loss": 0.1893,
486
+ "step": 1700
487
+ },
488
+ {
489
+ "epoch": 1.38,
490
+ "grad_norm": 0.1767084002494812,
491
+ "learning_rate": 0.00017558256231197655,
492
+ "loss": 0.1909,
493
+ "step": 1725
494
+ },
495
+ {
496
+ "epoch": 1.4,
497
+ "grad_norm": 0.19547300040721893,
498
+ "learning_rate": 0.00016529274090999708,
499
+ "loss": 0.19,
500
+ "step": 1750
501
+ },
502
+ {
503
+ "epoch": 1.42,
504
+ "grad_norm": 0.17399638891220093,
505
+ "learning_rate": 0.00015523454723618882,
506
+ "loss": 0.1921,
507
+ "step": 1775
508
+ },
509
+ {
510
+ "epoch": 1.44,
511
+ "grad_norm": 0.18990087509155273,
512
+ "learning_rate": 0.0001454179075133671,
513
+ "loss": 0.1796,
514
+ "step": 1800
515
+ },
516
+ {
517
+ "epoch": 1.46,
518
+ "grad_norm": 0.16542664170265198,
519
+ "learning_rate": 0.00013585250957976128,
520
+ "loss": 0.1865,
521
+ "step": 1825
522
+ },
523
+ {
524
+ "epoch": 1.48,
525
+ "grad_norm": 0.204952672123909,
526
+ "learning_rate": 0.00012654779332828725,
527
+ "loss": 0.1899,
528
+ "step": 1850
529
+ },
530
+ {
531
+ "epoch": 1.5,
532
+ "grad_norm": 0.18542811274528503,
533
+ "learning_rate": 0.00011751294139051308,
534
+ "loss": 0.1895,
535
+ "step": 1875
536
+ },
537
+ {
538
+ "epoch": 1.52,
539
+ "grad_norm": 0.18334172666072845,
540
+ "learning_rate": 0.00010875687007451065,
541
+ "loss": 0.1876,
542
+ "step": 1900
543
+ },
544
+ {
545
+ "epoch": 1.54,
546
+ "grad_norm": 0.19455808401107788,
547
+ "learning_rate": 0.00010028822056553551,
548
+ "loss": 0.1875,
549
+ "step": 1925
550
+ },
551
+ {
552
+ "epoch": 1.56,
553
+ "grad_norm": 0.1701984405517578,
554
+ "learning_rate": 9.211535039822043e-05,
555
+ "loss": 0.1779,
556
+ "step": 1950
557
+ },
558
+ {
559
+ "epoch": 1.58,
560
+ "grad_norm": 0.20316354930400848,
561
+ "learning_rate": 8.424632520869823e-05,
562
+ "loss": 0.19,
563
+ "step": 1975
564
+ },
565
+ {
566
+ "epoch": 1.6,
567
+ "grad_norm": 0.1809937059879303,
568
+ "learning_rate": 7.668891077479186e-05,
569
+ "loss": 0.1887,
570
+ "step": 2000
571
+ },
572
+ {
573
+ "epoch": 1.62,
574
+ "grad_norm": 0.16416801512241364,
575
+ "learning_rate": 6.945056535212984e-05,
576
+ "loss": 0.1762,
577
+ "step": 2025
578
+ },
579
+ {
580
+ "epoch": 1.6400000000000001,
581
+ "grad_norm": 0.17844462394714355,
582
+ "learning_rate": 6.253843231374847e-05,
583
+ "loss": 0.1853,
584
+ "step": 2050
585
+ },
586
+ {
587
+ "epoch": 1.6600000000000001,
588
+ "grad_norm": 0.16785769164562225,
589
+ "learning_rate": 5.5959333100444563e-05,
590
+ "loss": 0.1883,
591
+ "step": 2075
592
+ },
593
+ {
594
+ "epoch": 1.6800000000000002,
595
+ "grad_norm": 0.19007396697998047,
596
+ "learning_rate": 4.971976048883749e-05,
597
+ "loss": 0.1859,
598
+ "step": 2100
599
+ },
600
+ {
601
+ "epoch": 1.7,
602
+ "grad_norm": 0.17088015377521515,
603
+ "learning_rate": 4.3825872183782046e-05,
604
+ "loss": 0.1839,
605
+ "step": 2125
606
+ },
607
+ {
608
+ "epoch": 1.72,
609
+ "grad_norm": 0.1742444932460785,
610
+ "learning_rate": 3.8283484741457754e-05,
611
+ "loss": 0.1777,
612
+ "step": 2150
613
+ },
614
+ {
615
+ "epoch": 1.74,
616
+ "grad_norm": 0.17171719670295715,
617
+ "learning_rate": 3.3098067829129405e-05,
618
+ "loss": 0.1935,
619
+ "step": 2175
620
+ },
621
+ {
622
+ "epoch": 1.76,
623
+ "grad_norm": 0.17821335792541504,
624
+ "learning_rate": 2.827473882724667e-05,
625
+ "loss": 0.1836,
626
+ "step": 2200
627
+ },
628
+ {
629
+ "epoch": 1.78,
630
+ "grad_norm": 0.18346573412418365,
631
+ "learning_rate": 2.381825777920681e-05,
632
+ "loss": 0.1788,
633
+ "step": 2225
634
+ },
635
+ {
636
+ "epoch": 1.8,
637
+ "grad_norm": 0.17552965879440308,
638
+ "learning_rate": 1.9733022693766732e-05,
639
+ "loss": 0.1864,
640
+ "step": 2250
641
+ },
642
+ {
643
+ "epoch": 1.8199999999999998,
644
+ "grad_norm": 0.1744184046983719,
645
+ "learning_rate": 1.602306520473933e-05,
646
+ "loss": 0.1788,
647
+ "step": 2275
648
+ },
649
+ {
650
+ "epoch": 1.8399999999999999,
651
+ "grad_norm": 0.18287444114685059,
652
+ "learning_rate": 1.2692046592257711e-05,
653
+ "loss": 0.1828,
654
+ "step": 2300
655
+ },
656
+ {
657
+ "epoch": 1.8599999999999999,
658
+ "grad_norm": 0.1811319887638092,
659
+ "learning_rate": 9.743254169533612e-06,
660
+ "loss": 0.186,
661
+ "step": 2325
662
+ },
663
+ {
664
+ "epoch": 1.88,
665
+ "grad_norm": 0.1763424277305603,
666
+ "learning_rate": 7.1795980386763166e-06,
667
+ "loss": 0.1775,
668
+ "step": 2350
669
+ },
670
+ {
671
+ "epoch": 1.9,
672
+ "grad_norm": 0.1989917904138565,
673
+ "learning_rate": 5.003608218772949e-06,
674
+ "loss": 0.1995,
675
+ "step": 2375
676
+ },
677
+ {
678
+ "epoch": 1.92,
679
+ "grad_norm": 0.18446530401706696,
680
+ "learning_rate": 3.2174321490654023e-06,
681
+ "loss": 0.1866,
682
+ "step": 2400
683
+ },
684
+ {
685
+ "epoch": 1.94,
686
+ "grad_norm": 0.2140699326992035,
687
+ "learning_rate": 1.8228325696867388e-06,
688
+ "loss": 0.1796,
689
+ "step": 2425
690
+ },
691
+ {
692
+ "epoch": 1.96,
693
+ "grad_norm": 0.17141887545585632,
694
+ "learning_rate": 8.211857820497936e-07,
695
+ "loss": 0.1815,
696
+ "step": 2450
697
+ },
698
+ {
699
+ "epoch": 1.98,
700
+ "grad_norm": 0.18069347739219666,
701
+ "learning_rate": 2.134802906036626e-07,
702
+ "loss": 0.1862,
703
+ "step": 2475
704
+ },
705
+ {
706
+ "epoch": 2.0,
707
+ "grad_norm": 0.16899904608726501,
708
+ "learning_rate": 3.158272992909161e-10,
709
+ "loss": 0.1825,
710
+ "step": 2500
711
+ },
712
+ {
713
+ "epoch": 2.0,
714
+ "step": 2500,
715
+ "total_flos": 1.62588235137024e+18,
716
+ "train_loss": 0.23025748538970947,
717
+ "train_runtime": 2197.7586,
718
+ "train_samples_per_second": 36.401,
719
+ "train_steps_per_second": 1.138
720
+ }
721
+ ],
722
+ "logging_steps": 25,
723
+ "max_steps": 2500,
724
+ "num_input_tokens_seen": 0,
725
+ "num_train_epochs": 2,
726
+ "save_steps": 0,
727
+ "stateful_callbacks": {
728
+ "TrainerControl": {
729
+ "args": {
730
+ "should_epoch_stop": false,
731
+ "should_evaluate": false,
732
+ "should_log": false,
733
+ "should_save": false,
734
+ "should_training_stop": false
735
+ },
736
+ "attributes": {}
737
+ }
738
+ },
739
+ "total_flos": 1.62588235137024e+18,
740
+ "train_batch_size": 32,
741
+ "trial_name": null,
742
+ "trial_params": null
743
+ }
nl_tasks/exprep/run_ex32/ft/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "drop_out": 0.0,
6
+ "inference_mode": false,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_rotations": 1,
10
+ "peft_type": "ROTATION",
11
+ "r": 16,
12
+ "revision": null,
13
+ "target_modules": [
14
+ "q_proj",
15
+ "v_proj"
16
+ ],
17
+ "target_modules_to_skip": null,
18
+ "task_type": "CAUSAL_LM"
19
+ }
nl_tasks/exprep/run_ex32/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
nl_tasks/exprep/run_ex32/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exprep/run_ex32/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exprep/run_ex32/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 512,
37
+ "pad_token": "<unk>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
nl_tasks/exprep/run_ex32/ft2/adapter_config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "drop_out": 0.0,
6
+ "inference_mode": true,
7
+ "layers_to_transform": null,
8
+ "modules_to_save": null,
9
+ "num_rotations": 1,
10
+ "peft_type": "ROTATION",
11
+ "r": 16,
12
+ "revision": null,
13
+ "target_modules": [
14
+ "q_proj",
15
+ "v_proj"
16
+ ],
17
+ "target_modules_to_skip": null,
18
+ "task_type": "CAUSAL_LM"
19
+ }
nl_tasks/exprep/run_ex32/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:635c88299c2034ba5e2a622ae7d875726a71563bbc3f16921f31680c42fe6fb6
3
+ size 33602915
nl_tasks/exprep/run_ex32/output.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+
2
+ MATH math MAX TOKEN = 1408, length==== 5000, math acc %====, 7.4399999999999995
3
+
4
+ gsm8k MAX TOKEN = 1024, length==== 1319, gsm8k acc %====, 50.265352539802876
nl_tasks/exprep/run_ex32/trainer_state.json ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 100,
7
+ "global_step": 2500,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02,
14
+ "grad_norm": 2.4573163986206055,
15
+ "learning_rate": 0.0007998180972402738,
16
+ "loss": 0.4445,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.04,
21
+ "grad_norm": 0.18224593997001648,
22
+ "learning_rate": 0.0007992419381164945,
23
+ "loss": 0.3412,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.06,
28
+ "grad_norm": 0.20321156084537506,
29
+ "learning_rate": 0.0007982717754008577,
30
+ "loss": 0.3329,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 0.08,
35
+ "grad_norm": 0.17889830470085144,
36
+ "learning_rate": 0.0007969085665268344,
37
+ "loss": 0.3141,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 0.1,
42
+ "grad_norm": 0.19375747442245483,
43
+ "learning_rate": 0.0007951536568170009,
44
+ "loss": 0.3047,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 0.12,
49
+ "grad_norm": 0.23067152500152588,
50
+ "learning_rate": 0.0007930087781553683,
51
+ "loss": 0.2988,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 0.14,
56
+ "grad_norm": 0.18743064999580383,
57
+ "learning_rate": 0.0007904760472782212,
58
+ "loss": 0.2956,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 0.16,
63
+ "grad_norm": 0.24874503910541534,
64
+ "learning_rate": 0.0007875579636851548,
65
+ "loss": 0.296,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 0.18,
70
+ "grad_norm": 0.21282872557640076,
71
+ "learning_rate": 0.0007842574071723712,
72
+ "loss": 0.2923,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 0.2,
77
+ "grad_norm": 0.2218734323978424,
78
+ "learning_rate": 0.0007805776349906676,
79
+ "loss": 0.2889,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 0.22,
84
+ "grad_norm": 0.22166799008846283,
85
+ "learning_rate": 0.0007765222786309228,
86
+ "loss": 0.2882,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 0.24,
91
+ "grad_norm": 0.2501453459262848,
92
+ "learning_rate": 0.0007720953402402549,
93
+ "loss": 0.2956,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 0.26,
98
+ "grad_norm": 0.1966608613729477,
99
+ "learning_rate": 0.0007673011886723848,
100
+ "loss": 0.2881,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 0.28,
105
+ "grad_norm": 0.2205989956855774,
106
+ "learning_rate": 0.0007621445551761047,
107
+ "loss": 0.2888,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 0.3,
112
+ "grad_norm": 0.22034476697444916,
113
+ "learning_rate": 0.0007566305287261081,
114
+ "loss": 0.2811,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 0.32,
119
+ "grad_norm": 0.2284107804298401,
120
+ "learning_rate": 0.0007507645510007842,
121
+ "loss": 0.2823,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 0.34,
126
+ "grad_norm": 0.18093684315681458,
127
+ "learning_rate": 0.00074455241101194,
128
+ "loss": 0.2711,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 0.36,
133
+ "grad_norm": 0.20792332291603088,
134
+ "learning_rate": 0.0007380002393917437,
135
+ "loss": 0.2727,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 0.38,
140
+ "grad_norm": 0.21218571066856384,
141
+ "learning_rate": 0.0007311145023425311,
142
+ "loss": 0.2698,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 0.4,
147
+ "grad_norm": 0.20940104126930237,
148
+ "learning_rate": 0.000723901995255445,
149
+ "loss": 0.2688,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 0.42,
154
+ "grad_norm": 0.19124098122119904,
155
+ "learning_rate": 0.0007163698360042034,
156
+ "loss": 0.2698,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 0.44,
161
+ "grad_norm": 0.23809689283370972,
162
+ "learning_rate": 0.0007085254579206188,
163
+ "loss": 0.2583,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 0.46,
168
+ "grad_norm": 0.22469568252563477,
169
+ "learning_rate": 0.0007003766024587967,
170
+ "loss": 0.2754,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 0.48,
175
+ "grad_norm": 0.22428779304027557,
176
+ "learning_rate": 0.0006919313115552542,
177
+ "loss": 0.2613,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 0.5,
182
+ "grad_norm": 0.20287081599235535,
183
+ "learning_rate": 0.0006831979196925012,
184
+ "loss": 0.2747,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 0.52,
189
+ "grad_norm": 0.18675899505615234,
190
+ "learning_rate": 0.0006741850456739107,
191
+ "loss": 0.2642,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 0.54,
196
+ "grad_norm": 0.18064279854297638,
197
+ "learning_rate": 0.0006649015841180021,
198
+ "loss": 0.2552,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 0.56,
203
+ "grad_norm": 0.19596098363399506,
204
+ "learning_rate": 0.0006553566966805246,
205
+ "loss": 0.2562,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 0.58,
210
+ "grad_norm": 0.21535809338092804,
211
+ "learning_rate": 0.0006455598030130111,
212
+ "loss": 0.2628,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 0.6,
217
+ "grad_norm": 0.18167726695537567,
218
+ "learning_rate": 0.000635520571466718,
219
+ "loss": 0.2576,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 0.62,
224
+ "grad_norm": 0.21124687790870667,
225
+ "learning_rate": 0.0006252489095511306,
226
+ "loss": 0.2456,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 0.64,
231
+ "grad_norm": 0.22079136967658997,
232
+ "learning_rate": 0.0006147549541564466,
233
+ "loss": 0.2517,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 0.66,
238
+ "grad_norm": 0.20711511373519897,
239
+ "learning_rate": 0.0006040490615496899,
240
+ "loss": 0.2447,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 0.68,
245
+ "grad_norm": 0.22470833361148834,
246
+ "learning_rate": 0.0005931417971543255,
247
+ "loss": 0.2465,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 0.7,
252
+ "grad_norm": 0.18045905232429504,
253
+ "learning_rate": 0.0005820439251234615,
254
+ "loss": 0.2536,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 0.72,
259
+ "grad_norm": 0.19067174196243286,
260
+ "learning_rate": 0.0005707663977169301,
261
+ "loss": 0.2495,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 0.74,
266
+ "grad_norm": 0.2075628638267517,
267
+ "learning_rate": 0.0005593203444927291,
268
+ "loss": 0.2451,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 0.76,
273
+ "grad_norm": 0.1913316696882248,
274
+ "learning_rate": 0.0005477170613234922,
275
+ "loss": 0.2432,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 0.78,
280
+ "grad_norm": 0.17860470712184906,
281
+ "learning_rate": 0.0005359679992488259,
282
+ "loss": 0.2394,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 0.8,
287
+ "grad_norm": 0.22497022151947021,
288
+ "learning_rate": 0.0005240847531745164,
289
+ "loss": 0.2464,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 0.82,
294
+ "grad_norm": 0.2618238031864166,
295
+ "learning_rate": 0.0005120790504297575,
296
+ "loss": 0.2462,
297
+ "step": 1025
298
+ },
299
+ {
300
+ "epoch": 0.84,
301
+ "grad_norm": 0.1910582333803177,
302
+ "learning_rate": 0.0004999627391936922,
303
+ "loss": 0.2352,
304
+ "step": 1050
305
+ },
306
+ {
307
+ "epoch": 0.86,
308
+ "grad_norm": 0.19254150986671448,
309
+ "learning_rate": 0.00048774777680269044,
310
+ "loss": 0.2323,
311
+ "step": 1075
312
+ },
313
+ {
314
+ "epoch": 0.88,
315
+ "grad_norm": 0.1851769983768463,
316
+ "learning_rate": 0.00047544621794990197,
317
+ "loss": 0.2342,
318
+ "step": 1100
319
+ },
320
+ {
321
+ "epoch": 0.9,
322
+ "grad_norm": 0.18372896313667297,
323
+ "learning_rate": 0.0004630702027887291,
324
+ "loss": 0.2424,
325
+ "step": 1125
326
+ },
327
+ {
328
+ "epoch": 0.92,
329
+ "grad_norm": 0.1747942566871643,
330
+ "learning_rate": 0.0004506319449519617,
331
+ "loss": 0.2348,
332
+ "step": 1150
333
+ },
334
+ {
335
+ "epoch": 0.94,
336
+ "grad_norm": 0.19792233407497406,
337
+ "learning_rate": 0.00043814371949839543,
338
+ "loss": 0.2373,
339
+ "step": 1175
340
+ },
341
+ {
342
+ "epoch": 0.96,
343
+ "grad_norm": 0.17501504719257355,
344
+ "learning_rate": 0.0004256178507988314,
345
+ "loss": 0.2363,
346
+ "step": 1200
347
+ },
348
+ {
349
+ "epoch": 0.98,
350
+ "grad_norm": 0.1910821944475174,
351
+ "learning_rate": 0.00041306670037340957,
352
+ "loss": 0.2384,
353
+ "step": 1225
354
+ },
355
+ {
356
+ "epoch": 1.0,
357
+ "grad_norm": 0.20117861032485962,
358
+ "learning_rate": 0.00040050265469228103,
359
+ "loss": 0.2273,
360
+ "step": 1250
361
+ },
362
+ {
363
+ "epoch": 1.02,
364
+ "grad_norm": 0.1948338747024536,
365
+ "learning_rate": 0.00038793811295165725,
366
+ "loss": 0.1971,
367
+ "step": 1275
368
+ },
369
+ {
370
+ "epoch": 1.04,
371
+ "grad_norm": 0.1658090204000473,
372
+ "learning_rate": 0.0003753854748373012,
373
+ "loss": 0.2041,
374
+ "step": 1300
375
+ },
376
+ {
377
+ "epoch": 1.06,
378
+ "grad_norm": 0.19527383148670197,
379
+ "learning_rate": 0.00036285712828753295,
380
+ "loss": 0.1997,
381
+ "step": 1325
382
+ },
383
+ {
384
+ "epoch": 1.08,
385
+ "grad_norm": 0.19869408011436462,
386
+ "learning_rate": 0.0003503654372678317,
387
+ "loss": 0.2049,
388
+ "step": 1350
389
+ },
390
+ {
391
+ "epoch": 1.1,
392
+ "grad_norm": 0.1618577539920807,
393
+ "learning_rate": 0.0003379227295690936,
394
+ "loss": 0.2079,
395
+ "step": 1375
396
+ },
397
+ {
398
+ "epoch": 1.12,
399
+ "grad_norm": 0.17939826846122742,
400
+ "learning_rate": 0.0003255412846415912,
401
+ "loss": 0.2053,
402
+ "step": 1400
403
+ },
404
+ {
405
+ "epoch": 1.1400000000000001,
406
+ "grad_norm": 0.17189370095729828,
407
+ "learning_rate": 0.0003132333214766379,
408
+ "loss": 0.1969,
409
+ "step": 1425
410
+ },
411
+ {
412
+ "epoch": 1.16,
413
+ "grad_norm": 0.17572352290153503,
414
+ "learning_rate": 0.0003010109865479191,
415
+ "loss": 0.2043,
416
+ "step": 1450
417
+ },
418
+ {
419
+ "epoch": 1.18,
420
+ "grad_norm": 0.16858956217765808,
421
+ "learning_rate": 0.0002888863418243891,
422
+ "loss": 0.2015,
423
+ "step": 1475
424
+ },
425
+ {
426
+ "epoch": 1.2,
427
+ "grad_norm": 0.20846350491046906,
428
+ "learning_rate": 0.0002768713528665639,
429
+ "loss": 0.2025,
430
+ "step": 1500
431
+ },
432
+ {
433
+ "epoch": 1.22,
434
+ "grad_norm": 0.1764461249113083,
435
+ "learning_rate": 0.0002649778770179578,
436
+ "loss": 0.2042,
437
+ "step": 1525
438
+ },
439
+ {
440
+ "epoch": 1.24,
441
+ "grad_norm": 0.16379545629024506,
442
+ "learning_rate": 0.00025321765170331634,
443
+ "loss": 0.1971,
444
+ "step": 1550
445
+ },
446
+ {
447
+ "epoch": 1.26,
448
+ "grad_norm": 0.17492444813251495,
449
+ "learning_rate": 0.00024160228284519604,
450
+ "loss": 0.1954,
451
+ "step": 1575
452
+ },
453
+ {
454
+ "epoch": 1.28,
455
+ "grad_norm": 0.17563839256763458,
456
+ "learning_rate": 0.0002301432334103195,
457
+ "loss": 0.1914,
458
+ "step": 1600
459
+ },
460
+ {
461
+ "epoch": 1.3,
462
+ "grad_norm": 0.17704229056835175,
463
+ "learning_rate": 0.00021885181209701105,
464
+ "loss": 0.1922,
465
+ "step": 1625
466
+ },
467
+ {
468
+ "epoch": 1.32,
469
+ "grad_norm": 0.18759162724018097,
470
+ "learning_rate": 0.0002077391621748769,
471
+ "loss": 0.1869,
472
+ "step": 1650
473
+ },
474
+ {
475
+ "epoch": 1.34,
476
+ "grad_norm": 0.14872781932353973,
477
+ "learning_rate": 0.0001968162504877441,
478
+ "loss": 0.1909,
479
+ "step": 1675
480
+ },
481
+ {
482
+ "epoch": 1.3599999999999999,
483
+ "grad_norm": 0.1769675761461258,
484
+ "learning_rate": 0.0001860938566307083,
485
+ "loss": 0.1889,
486
+ "step": 1700
487
+ },
488
+ {
489
+ "epoch": 1.38,
490
+ "grad_norm": 0.1767898052930832,
491
+ "learning_rate": 0.00017558256231197655,
492
+ "loss": 0.1904,
493
+ "step": 1725
494
+ },
495
+ {
496
+ "epoch": 1.4,
497
+ "grad_norm": 0.19452469050884247,
498
+ "learning_rate": 0.00016529274090999708,
499
+ "loss": 0.1898,
500
+ "step": 1750
501
+ },
502
+ {
503
+ "epoch": 1.42,
504
+ "grad_norm": 0.17662617564201355,
505
+ "learning_rate": 0.00015523454723618882,
506
+ "loss": 0.1918,
507
+ "step": 1775
508
+ },
509
+ {
510
+ "epoch": 1.44,
511
+ "grad_norm": 0.19268639385700226,
512
+ "learning_rate": 0.0001454179075133671,
513
+ "loss": 0.1796,
514
+ "step": 1800
515
+ },
516
+ {
517
+ "epoch": 1.46,
518
+ "grad_norm": 0.16467879712581635,
519
+ "learning_rate": 0.00013585250957976128,
520
+ "loss": 0.1851,
521
+ "step": 1825
522
+ },
523
+ {
524
+ "epoch": 1.48,
525
+ "grad_norm": 0.21923720836639404,
526
+ "learning_rate": 0.00012654779332828725,
527
+ "loss": 0.1891,
528
+ "step": 1850
529
+ },
530
+ {
531
+ "epoch": 1.5,
532
+ "grad_norm": 0.1770046353340149,
533
+ "learning_rate": 0.00011751294139051308,
534
+ "loss": 0.1892,
535
+ "step": 1875
536
+ },
537
+ {
538
+ "epoch": 1.52,
539
+ "grad_norm": 0.21088558435440063,
540
+ "learning_rate": 0.00010875687007451065,
541
+ "loss": 0.1878,
542
+ "step": 1900
543
+ },
544
+ {
545
+ "epoch": 1.54,
546
+ "grad_norm": 0.19151797890663147,
547
+ "learning_rate": 0.00010028822056553551,
548
+ "loss": 0.1865,
549
+ "step": 1925
550
+ },
551
+ {
552
+ "epoch": 1.56,
553
+ "grad_norm": 0.15753497183322906,
554
+ "learning_rate": 9.211535039822043e-05,
555
+ "loss": 0.1774,
556
+ "step": 1950
557
+ },
558
+ {
559
+ "epoch": 1.58,
560
+ "grad_norm": 0.19507814943790436,
561
+ "learning_rate": 8.424632520869823e-05,
562
+ "loss": 0.1896,
563
+ "step": 1975
564
+ },
565
+ {
566
+ "epoch": 1.6,
567
+ "grad_norm": 0.17970746755599976,
568
+ "learning_rate": 7.668891077479186e-05,
569
+ "loss": 0.1883,
570
+ "step": 2000
571
+ },
572
+ {
573
+ "epoch": 1.62,
574
+ "grad_norm": 0.16696929931640625,
575
+ "learning_rate": 6.945056535212984e-05,
576
+ "loss": 0.176,
577
+ "step": 2025
578
+ },
579
+ {
580
+ "epoch": 1.6400000000000001,
581
+ "grad_norm": 0.18560630083084106,
582
+ "learning_rate": 6.253843231374847e-05,
583
+ "loss": 0.1846,
584
+ "step": 2050
585
+ },
586
+ {
587
+ "epoch": 1.6600000000000001,
588
+ "grad_norm": 0.16515544056892395,
589
+ "learning_rate": 5.5959333100444563e-05,
590
+ "loss": 0.1879,
591
+ "step": 2075
592
+ },
593
+ {
594
+ "epoch": 1.6800000000000002,
595
+ "grad_norm": 0.1885310411453247,
596
+ "learning_rate": 4.971976048883749e-05,
597
+ "loss": 0.1856,
598
+ "step": 2100
599
+ },
600
+ {
601
+ "epoch": 1.7,
602
+ "grad_norm": 0.16086508333683014,
603
+ "learning_rate": 4.3825872183782046e-05,
604
+ "loss": 0.1829,
605
+ "step": 2125
606
+ },
607
+ {
608
+ "epoch": 1.72,
609
+ "grad_norm": 0.17448872327804565,
610
+ "learning_rate": 3.8283484741457754e-05,
611
+ "loss": 0.1763,
612
+ "step": 2150
613
+ },
614
+ {
615
+ "epoch": 1.74,
616
+ "grad_norm": 0.165937602519989,
617
+ "learning_rate": 3.3098067829129405e-05,
618
+ "loss": 0.1934,
619
+ "step": 2175
620
+ },
621
+ {
622
+ "epoch": 1.76,
623
+ "grad_norm": 0.20669575035572052,
624
+ "learning_rate": 2.827473882724667e-05,
625
+ "loss": 0.1834,
626
+ "step": 2200
627
+ },
628
+ {
629
+ "epoch": 1.78,
630
+ "grad_norm": 0.18644139170646667,
631
+ "learning_rate": 2.381825777920681e-05,
632
+ "loss": 0.1776,
633
+ "step": 2225
634
+ },
635
+ {
636
+ "epoch": 1.8,
637
+ "grad_norm": 0.1659824401140213,
638
+ "learning_rate": 1.9733022693766732e-05,
639
+ "loss": 0.1858,
640
+ "step": 2250
641
+ },
642
+ {
643
+ "epoch": 1.8199999999999998,
644
+ "grad_norm": 0.1649240255355835,
645
+ "learning_rate": 1.602306520473933e-05,
646
+ "loss": 0.1781,
647
+ "step": 2275
648
+ },
649
+ {
650
+ "epoch": 1.8399999999999999,
651
+ "grad_norm": 0.1927357017993927,
652
+ "learning_rate": 1.2692046592257711e-05,
653
+ "loss": 0.1819,
654
+ "step": 2300
655
+ },
656
+ {
657
+ "epoch": 1.8599999999999999,
658
+ "grad_norm": 0.18707315623760223,
659
+ "learning_rate": 9.743254169533612e-06,
660
+ "loss": 0.1856,
661
+ "step": 2325
662
+ },
663
+ {
664
+ "epoch": 1.88,
665
+ "grad_norm": 0.17311322689056396,
666
+ "learning_rate": 7.1795980386763166e-06,
667
+ "loss": 0.1762,
668
+ "step": 2350
669
+ },
670
+ {
671
+ "epoch": 1.9,
672
+ "grad_norm": 0.19358113408088684,
673
+ "learning_rate": 5.003608218772949e-06,
674
+ "loss": 0.1991,
675
+ "step": 2375
676
+ },
677
+ {
678
+ "epoch": 1.92,
679
+ "grad_norm": 0.1988118588924408,
680
+ "learning_rate": 3.2174321490654023e-06,
681
+ "loss": 0.1853,
682
+ "step": 2400
683
+ },
684
+ {
685
+ "epoch": 1.94,
686
+ "grad_norm": 0.20467230677604675,
687
+ "learning_rate": 1.8228325696867388e-06,
688
+ "loss": 0.1797,
689
+ "step": 2425
690
+ },
691
+ {
692
+ "epoch": 1.96,
693
+ "grad_norm": 0.21009773015975952,
694
+ "learning_rate": 8.211857820497936e-07,
695
+ "loss": 0.1806,
696
+ "step": 2450
697
+ },
698
+ {
699
+ "epoch": 1.98,
700
+ "grad_norm": 0.17240630090236664,
701
+ "learning_rate": 2.134802906036626e-07,
702
+ "loss": 0.1858,
703
+ "step": 2475
704
+ },
705
+ {
706
+ "epoch": 2.0,
707
+ "grad_norm": 0.17474809288978577,
708
+ "learning_rate": 3.158272992909161e-10,
709
+ "loss": 0.1828,
710
+ "step": 2500
711
+ },
712
+ {
713
+ "epoch": 2.0,
714
+ "step": 2500,
715
+ "total_flos": 1.62588235137024e+18,
716
+ "train_loss": 0.22965620460510253,
717
+ "train_runtime": 2199.3812,
718
+ "train_samples_per_second": 36.374,
719
+ "train_steps_per_second": 1.137
720
+ }
721
+ ],
722
+ "logging_steps": 25,
723
+ "max_steps": 2500,
724
+ "num_input_tokens_seen": 0,
725
+ "num_train_epochs": 2,
726
+ "save_steps": 0,
727
+ "stateful_callbacks": {
728
+ "TrainerControl": {
729
+ "args": {
730
+ "should_epoch_stop": false,
731
+ "should_evaluate": false,
732
+ "should_log": false,
733
+ "should_save": false,
734
+ "should_training_stop": false
735
+ },
736
+ "attributes": {}
737
+ }
738
+ },
739
+ "total_flos": 1.62588235137024e+18,
740
+ "train_batch_size": 32,
741
+ "trial_name": null,
742
+ "trial_params": null
743
+ }