nvan15 committed
Commit 1af4760 · verified · 1 Parent(s): ade215c

Batch upload part 9

Files changed (50)
  1. nl_tasks/exps/run_ex34/ft/adapter_config.json +18 -0
  2. nl_tasks/exps/run_ex34/ft/special_tokens_map.json +24 -0
  3. nl_tasks/exps/run_ex34/ft/tokenizer.json +0 -0
  4. nl_tasks/exps/run_ex34/ft/tokenizer.model +3 -0
  5. nl_tasks/exps/run_ex34/ft/tokenizer_config.json +43 -0
  6. nl_tasks/exps/run_ex34/ft2/adapter_config.json +18 -0
  7. nl_tasks/exps/run_ex34/ft2/adapter_model.bin +3 -0
  8. nl_tasks/run_all/exnr14/ft/adapter_config.json +18 -0
  9. nl_tasks/run_all/exnr14/ft/added_tokens.json +3 -0
  10. nl_tasks/run_all/exnr14/ft/special_tokens_map.json +30 -0
  11. nl_tasks/run_all/exnr14/ft/tokenizer.json +0 -0
  12. nl_tasks/run_all/exnr14/ft/tokenizer.model +3 -0
  13. nl_tasks/run_all/exnr14/ft/tokenizer_config.json +51 -0
  14. nl_tasks/run_all/exnr14/ft/training_args.bin +3 -0
  15. nl_tasks/run_all/exnr14/ft2/adapter_config.json +18 -0
  16. nl_tasks/run_all/exnr14/ft2/adapter_model.bin +3 -0
  17. nl_tasks/run_all/exnr14/trainer_state.json +1106 -0
  18. nl_tasks/run_all/exnr15/ft/adapter_config.json +18 -0
  19. nl_tasks/run_all/exnr15/ft/added_tokens.json +3 -0
  20. nl_tasks/run_all/exnr15/ft/special_tokens_map.json +30 -0
  21. nl_tasks/run_all/exnr15/ft/tokenizer.json +0 -0
  22. nl_tasks/run_all/exnr15/ft/tokenizer.model +3 -0
  23. nl_tasks/run_all/exnr15/ft/tokenizer_config.json +51 -0
  24. nl_tasks/run_all/exnr15/ft/training_args.bin +3 -0
  25. nl_tasks/run_all/exnr15/ft2/adapter_config.json +18 -0
  26. nl_tasks/run_all/exnr15/ft2/adapter_model.bin +3 -0
  27. nl_tasks/run_all/exnr15/trainer_state.json +135 -0
  28. nl_tasks/run_all/run_exnr10/ft/adapter_config.json +18 -0
  29. nl_tasks/run_all/run_exnr10/ft/added_tokens.json +3 -0
  30. nl_tasks/run_all/run_exnr10/ft/special_tokens_map.json +30 -0
  31. nl_tasks/run_all/run_exnr10/ft/tokenizer.json +0 -0
  32. nl_tasks/run_all/run_exnr10/ft/tokenizer.model +3 -0
  33. nl_tasks/run_all/run_exnr10/ft/tokenizer_config.json +51 -0
  34. nl_tasks/run_all/run_exnr10/ft/training_args.bin +3 -0
  35. nl_tasks/run_all/run_exnr10/ft2/adapter_config.json +18 -0
  36. nl_tasks/run_all/run_exnr10/ft2/adapter_model.bin +3 -0
  37. nl_tasks/run_all/run_exnr10/trainer_state.json +1106 -0
  38. nl_tasks/run_all/run_exnr11/ft/adapter_config.json +18 -0
  39. nl_tasks/run_all/run_exnr11/ft/added_tokens.json +3 -0
  40. nl_tasks/run_all/run_exnr11/ft/special_tokens_map.json +30 -0
  41. nl_tasks/run_all/run_exnr11/ft/tokenizer.json +0 -0
  42. nl_tasks/run_all/run_exnr11/ft/tokenizer.model +3 -0
  43. nl_tasks/run_all/run_exnr11/ft/tokenizer_config.json +51 -0
  44. nl_tasks/run_all/run_exnr11/ft/training_args.bin +3 -0
  45. nl_tasks/run_all/run_exnr11/ft2/adapter_config.json +18 -0
  46. nl_tasks/run_all/run_exnr11/ft2/adapter_model.bin +3 -0
  47. nl_tasks/run_all/run_exnr11/trainer_state.json +1106 -0
  48. nl_tasks/run_all/run_exnr12/ft/special_tokens_map.json +30 -0
  49. nl_tasks/run_all/run_exnr12/ft/tokenizer_config.json +51 -0
  50. nl_tasks/run_all/run_exnr12/trainer_state.json +1106 -0
nl_tasks/exps/run_ex34/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": false,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 1,
+ "peft_type": "ROTATION",
+ "r": 16,
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
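
The adapter_config.json files in this upload describe a rotation-style PEFT adapter: peft_type "ROTATION", the rank r, num_rotations, and the attention projections (q_proj/v_proj) it targets on the meta-llama/Llama-2-7b-hf base model. As a minimal sketch (not the repo's own tooling; the path assumes a local checkout of this repo), the fields that vary between runs can be inspected with the standard library alone:

import json
from pathlib import Path

# Path taken from this diff; adjust to wherever the repo is cloned.
cfg = json.loads(Path("nl_tasks/exps/run_ex34/ft/adapter_config.json").read_text())

# The fields that distinguish one run from another.
print(cfg["base_model_name_or_path"])                    # meta-llama/Llama-2-7b-hf
print(cfg["peft_type"], cfg["r"], cfg["num_rotations"])  # ROTATION 16 1
print(cfg["target_modules"])                             # ['v_proj', 'q_proj']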
nl_tasks/exps/run_ex34/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<unk>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
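
Together with tokenizer.json, tokenizer.model and tokenizer_config.json below, this special_tokens_map.json makes ft/ a complete saved tokenizer directory, so it can be reloaded directly. A minimal sketch, assuming the transformers library is installed and the repo is checked out locally:

from transformers import AutoTokenizer

# Load the tokenizer exactly as it was saved for this run.
tok = AutoTokenizer.from_pretrained("nl_tasks/exps/run_ex34/ft")
print(tok.bos_token, tok.eos_token, tok.pad_token)  # <s> </s> <unk>
print(tok("Hello world").input_ids)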
nl_tasks/exps/run_ex34/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exps/run_ex34/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
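
The binary files in this commit (tokenizer.model, training_args.bin, adapter_model.bin) are stored as Git LFS pointers: a spec version line, the sha256 oid of the real blob, and its size in bytes. After fetching the actual file (for example with git lfs pull), it can be checked against the pointer; a minimal sketch using only the standard library, with the oid and size copied from the pointer above:

import hashlib
from pathlib import Path

data = Path("nl_tasks/exps/run_ex34/ft/tokenizer.model").read_bytes()

# Compare against the size and oid recorded in the LFS pointer.
assert len(data) == 499723, "size mismatch"
expected = "9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347"
assert hashlib.sha256(data).hexdigest() == expected, "oid mismatch"
print("tokenizer.model matches its LFS pointer")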
nl_tasks/exps/run_ex34/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 512,
+ "pad_token": "<unk>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
nl_tasks/exps/run_ex34/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": true,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 1,
+ "peft_type": "ROTATION",
+ "r": 16,
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/exps/run_ex34/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0d263a69315b36f2b766405381d1a2c89d10e2377e9d5f50e9ff5f74bfa0c189
+ size 33602915
nl_tasks/run_all/exnr14/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": false,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 1,
+ "peft_type": "ROTATION",
+ "r": 16,
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/run_all/exnr14/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "[PAD]": 32000
+ }
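
Unlike run_ex34, the exnr* runs add a dedicated [PAD] token at id 32000, one slot past Llama-2's 32000-entry vocabulary, so the base model's embedding matrix has to be resized to match the tokenizer before the adapter weights are applied. A minimal sketch with standard transformers calls (illustrative only, not the repo's own loading code):

from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("nl_tasks/run_all/exnr14/ft")
model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")

# Grow the embeddings (and tied LM head) to cover the new [PAD] id.
model.resize_token_embeddings(len(tok))
print(len(tok))  # 32001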
nl_tasks/run_all/exnr14/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "bos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
nl_tasks/run_all/exnr14/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/run_all/exnr14/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
nl_tasks/run_all/exnr14/ft/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "</s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 512,
+ "pad_token": "[PAD]",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "</s>",
+ "use_default_system_prompt": false
+ }
nl_tasks/run_all/exnr14/ft/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ba427f2d53ed75e43f98a703940afe9017468efadf9a60725d88bde4c1c2c303
+ size 6545
nl_tasks/run_all/exnr14/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": true,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 1,
+ "peft_type": "ROTATION",
+ "r": 16,
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/run_all/exnr14/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:29b2d1c56e46e1472d61486ef0f7c4570c1678d2f3ea6b0c9652a5e9a979117e
+ size 33602915
nl_tasks/run_all/exnr14/trainer_state.json ADDED
@@ -0,0 +1,1106 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 50,
7
+ "global_step": 2438,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.020508613617719443,
14
+ "grad_norm": 0.16105684638023376,
15
+ "learning_rate": 0.00019672131147540983,
16
+ "loss": 0.6249,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.04101722723543889,
21
+ "grad_norm": 0.19086705148220062,
22
+ "learning_rate": 0.00040163934426229507,
23
+ "loss": 0.3842,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.04101722723543889,
28
+ "eval_loss": 0.3523162603378296,
29
+ "eval_runtime": 19.5722,
30
+ "eval_samples_per_second": 51.093,
31
+ "eval_steps_per_second": 0.817,
32
+ "step": 50
33
+ },
34
+ {
35
+ "epoch": 0.06152584085315833,
36
+ "grad_norm": 0.40827786922454834,
37
+ "learning_rate": 0.0006065573770491804,
38
+ "loss": 0.3562,
39
+ "step": 75
40
+ },
41
+ {
42
+ "epoch": 0.08203445447087777,
43
+ "grad_norm": 0.2028273344039917,
44
+ "learning_rate": 0.0008114754098360656,
45
+ "loss": 0.3428,
46
+ "step": 100
47
+ },
48
+ {
49
+ "epoch": 0.08203445447087777,
50
+ "eval_loss": 0.32449883222579956,
51
+ "eval_runtime": 19.5267,
52
+ "eval_samples_per_second": 51.212,
53
+ "eval_steps_per_second": 0.819,
54
+ "step": 100
55
+ },
56
+ {
57
+ "epoch": 0.10254306808859721,
58
+ "grad_norm": 0.2950042188167572,
59
+ "learning_rate": 0.0010163934426229509,
60
+ "loss": 0.3332,
61
+ "step": 125
62
+ },
63
+ {
64
+ "epoch": 0.12305168170631665,
65
+ "grad_norm": 2.916337013244629,
66
+ "learning_rate": 0.001221311475409836,
67
+ "loss": 0.5147,
68
+ "step": 150
69
+ },
70
+ {
71
+ "epoch": 0.12305168170631665,
72
+ "eval_loss": 0.410254567861557,
73
+ "eval_runtime": 19.5194,
74
+ "eval_samples_per_second": 51.231,
75
+ "eval_steps_per_second": 0.82,
76
+ "step": 150
77
+ },
78
+ {
79
+ "epoch": 0.1435602953240361,
80
+ "grad_norm": 0.40234318375587463,
81
+ "learning_rate": 0.0014262295081967215,
82
+ "loss": 0.3616,
83
+ "step": 175
84
+ },
85
+ {
86
+ "epoch": 0.16406890894175555,
87
+ "grad_norm": 0.5344768762588501,
88
+ "learning_rate": 0.0016311475409836065,
89
+ "loss": 0.3605,
90
+ "step": 200
91
+ },
92
+ {
93
+ "epoch": 0.16406890894175555,
94
+ "eval_loss": 0.3446679413318634,
95
+ "eval_runtime": 19.5174,
96
+ "eval_samples_per_second": 51.236,
97
+ "eval_steps_per_second": 0.82,
98
+ "step": 200
99
+ },
100
+ {
101
+ "epoch": 0.184577522559475,
102
+ "grad_norm": 392.2628173828125,
103
+ "learning_rate": 0.0018360655737704918,
104
+ "loss": 0.4885,
105
+ "step": 225
106
+ },
107
+ {
108
+ "epoch": 0.20508613617719443,
109
+ "grad_norm": 0.4538051187992096,
110
+ "learning_rate": 0.0019999743708232127,
111
+ "loss": 0.4975,
112
+ "step": 250
113
+ },
114
+ {
115
+ "epoch": 0.20508613617719443,
116
+ "eval_loss": 0.3520921468734741,
117
+ "eval_runtime": 19.5139,
118
+ "eval_samples_per_second": 51.245,
119
+ "eval_steps_per_second": 0.82,
120
+ "step": 250
121
+ },
122
+ {
123
+ "epoch": 0.22559474979491387,
124
+ "grad_norm": 0.6222809553146362,
125
+ "learning_rate": 0.0019990774875676054,
126
+ "loss": 0.3541,
127
+ "step": 275
128
+ },
129
+ {
130
+ "epoch": 0.2461033634126333,
131
+ "grad_norm": 0.44931286573410034,
132
+ "learning_rate": 0.001996900458879386,
133
+ "loss": 0.3521,
134
+ "step": 300
135
+ },
136
+ {
137
+ "epoch": 0.2461033634126333,
138
+ "eval_loss": 0.3311863839626312,
139
+ "eval_runtime": 19.5345,
140
+ "eval_samples_per_second": 51.192,
141
+ "eval_steps_per_second": 0.819,
142
+ "step": 300
143
+ },
144
+ {
145
+ "epoch": 0.2666119770303528,
146
+ "grad_norm": 0.23350577056407928,
147
+ "learning_rate": 0.001993446074245224,
148
+ "loss": 0.3325,
149
+ "step": 325
150
+ },
151
+ {
152
+ "epoch": 0.2871205906480722,
153
+ "grad_norm": 0.21402505040168762,
154
+ "learning_rate": 0.0019887187598630527,
155
+ "loss": 0.3221,
156
+ "step": 350
157
+ },
158
+ {
159
+ "epoch": 0.2871205906480722,
160
+ "eval_loss": 0.2994612455368042,
161
+ "eval_runtime": 19.5154,
162
+ "eval_samples_per_second": 51.242,
163
+ "eval_steps_per_second": 0.82,
164
+ "step": 350
165
+ },
166
+ {
167
+ "epoch": 0.30762920426579166,
168
+ "grad_norm": 0.17019234597682953,
169
+ "learning_rate": 0.0019827245729706648,
170
+ "loss": 0.3031,
171
+ "step": 375
172
+ },
173
+ {
174
+ "epoch": 0.3281378178835111,
175
+ "grad_norm": 0.17441657185554504,
176
+ "learning_rate": 0.0019754711940844047,
177
+ "loss": 0.3047,
178
+ "step": 400
179
+ },
180
+ {
181
+ "epoch": 0.3281378178835111,
182
+ "eval_loss": 0.28233733773231506,
183
+ "eval_runtime": 19.5238,
184
+ "eval_samples_per_second": 51.22,
185
+ "eval_steps_per_second": 0.82,
186
+ "step": 400
187
+ },
188
+ {
189
+ "epoch": 0.34864643150123054,
190
+ "grad_norm": 0.12981465458869934,
191
+ "learning_rate": 0.0019669679171579117,
192
+ "loss": 0.2964,
193
+ "step": 425
194
+ },
195
+ {
196
+ "epoch": 0.36915504511895,
197
+ "grad_norm": 0.14443928003311157,
198
+ "learning_rate": 0.001957225637673524,
199
+ "loss": 0.2879,
200
+ "step": 450
201
+ },
202
+ {
203
+ "epoch": 0.36915504511895,
204
+ "eval_loss": 0.27039000391960144,
205
+ "eval_runtime": 19.544,
206
+ "eval_samples_per_second": 51.167,
207
+ "eval_steps_per_second": 0.819,
208
+ "step": 450
209
+ },
210
+ {
211
+ "epoch": 0.3896636587366694,
212
+ "grad_norm": 0.1316242665052414,
213
+ "learning_rate": 0.0019462568386815961,
214
+ "loss": 0.2839,
215
+ "step": 475
216
+ },
217
+ {
218
+ "epoch": 0.41017227235438886,
219
+ "grad_norm": 0.12701553106307983,
220
+ "learning_rate": 0.0019340755748056234,
221
+ "loss": 0.2759,
222
+ "step": 500
223
+ },
224
+ {
225
+ "epoch": 0.41017227235438886,
226
+ "eval_loss": 0.2636236250400543,
227
+ "eval_runtime": 19.5187,
228
+ "eval_samples_per_second": 51.233,
229
+ "eval_steps_per_second": 0.82,
230
+ "step": 500
231
+ },
232
+ {
233
+ "epoch": 0.4306808859721083,
234
+ "grad_norm": 0.1544770896434784,
235
+ "learning_rate": 0.0019206974542336672,
236
+ "loss": 0.2795,
237
+ "step": 525
238
+ },
239
+ {
240
+ "epoch": 0.45118949958982774,
241
+ "grad_norm": 0.11463268101215363,
242
+ "learning_rate": 0.0019061396187191563,
243
+ "loss": 0.2735,
244
+ "step": 550
245
+ },
246
+ {
247
+ "epoch": 0.45118949958982774,
248
+ "eval_loss": 0.2576003074645996,
249
+ "eval_runtime": 19.5029,
250
+ "eval_samples_per_second": 51.274,
251
+ "eval_steps_per_second": 0.82,
252
+ "step": 550
253
+ },
254
+ {
255
+ "epoch": 0.4716981132075472,
256
+ "grad_norm": 0.12292572110891342,
257
+ "learning_rate": 0.0018904207216166836,
258
+ "loss": 0.2611,
259
+ "step": 575
260
+ },
261
+ {
262
+ "epoch": 0.4922067268252666,
263
+ "grad_norm": 0.11286451667547226,
264
+ "learning_rate": 0.001873560903980955,
265
+ "loss": 0.2699,
266
+ "step": 600
267
+ },
268
+ {
269
+ "epoch": 0.4922067268252666,
270
+ "eval_loss": 0.2554730474948883,
271
+ "eval_runtime": 19.529,
272
+ "eval_samples_per_second": 51.206,
273
+ "eval_steps_per_second": 0.819,
274
+ "step": 600
275
+ },
276
+ {
277
+ "epoch": 0.5127153404429861,
278
+ "grad_norm": 0.09815208613872528,
279
+ "learning_rate": 0.0018555817687594984,
280
+ "loss": 0.2573,
281
+ "step": 625
282
+ },
283
+ {
284
+ "epoch": 0.5332239540607056,
285
+ "grad_norm": 0.10865656286478043,
286
+ "learning_rate": 0.0018365063531122169,
287
+ "loss": 0.2564,
288
+ "step": 650
289
+ },
290
+ {
291
+ "epoch": 0.5332239540607056,
292
+ "eval_loss": 0.25100600719451904,
293
+ "eval_runtime": 19.5294,
294
+ "eval_samples_per_second": 51.205,
295
+ "eval_steps_per_second": 0.819,
296
+ "step": 650
297
+ },
298
+ {
299
+ "epoch": 0.5537325676784249,
300
+ "grad_norm": 0.11892469227313995,
301
+ "learning_rate": 0.0018163590988932402,
302
+ "loss": 0.2556,
303
+ "step": 675
304
+ },
305
+ {
306
+ "epoch": 0.5742411812961444,
307
+ "grad_norm": 0.08931335061788559,
308
+ "learning_rate": 0.0017951658213329078,
309
+ "loss": 0.2507,
310
+ "step": 700
311
+ },
312
+ {
313
+ "epoch": 0.5742411812961444,
314
+ "eval_loss": 0.2487325370311737,
315
+ "eval_runtime": 19.5503,
316
+ "eval_samples_per_second": 51.15,
317
+ "eval_steps_per_second": 0.818,
318
+ "step": 700
319
+ },
320
+ {
321
+ "epoch": 0.5947497949138638,
322
+ "grad_norm": 0.09335417300462723,
323
+ "learning_rate": 0.0017729536759600033,
324
+ "loss": 0.2479,
325
+ "step": 725
326
+ },
327
+ {
328
+ "epoch": 0.6152584085315833,
329
+ "grad_norm": 0.08527883887290955,
330
+ "learning_rate": 0.0017497511238066307,
331
+ "loss": 0.2571,
332
+ "step": 750
333
+ },
334
+ {
335
+ "epoch": 0.6152584085315833,
336
+ "eval_loss": 0.24877095222473145,
337
+ "eval_runtime": 19.5122,
338
+ "eval_samples_per_second": 51.25,
339
+ "eval_steps_per_second": 0.82,
340
+ "step": 750
341
+ },
342
+ {
343
+ "epoch": 0.6357670221493027,
344
+ "grad_norm": 0.10340578854084015,
345
+ "learning_rate": 0.00172558789494031,
346
+ "loss": 0.2621,
347
+ "step": 775
348
+ },
349
+ {
350
+ "epoch": 0.6562756357670222,
351
+ "grad_norm": 0.10272800922393799,
352
+ "learning_rate": 0.0017004949503700284,
353
+ "loss": 0.2519,
354
+ "step": 800
355
+ },
356
+ {
357
+ "epoch": 0.6562756357670222,
358
+ "eval_loss": 0.24430014193058014,
359
+ "eval_runtime": 19.5231,
360
+ "eval_samples_per_second": 51.221,
361
+ "eval_steps_per_second": 0.82,
362
+ "step": 800
363
+ },
364
+ {
365
+ "epoch": 0.6767842493847416,
366
+ "grad_norm": 0.11489123106002808,
367
+ "learning_rate": 0.0016745044423750449,
368
+ "loss": 0.2472,
369
+ "step": 825
370
+ },
371
+ {
372
+ "epoch": 0.6972928630024611,
373
+ "grad_norm": 0.07966622710227966,
374
+ "learning_rate": 0.0016476496733072946,
375
+ "loss": 0.261,
376
+ "step": 850
377
+ },
378
+ {
379
+ "epoch": 0.6972928630024611,
380
+ "eval_loss": 0.24359455704689026,
381
+ "eval_runtime": 19.5061,
382
+ "eval_samples_per_second": 51.266,
383
+ "eval_steps_per_second": 0.82,
384
+ "step": 850
385
+ },
386
+ {
387
+ "epoch": 0.7178014766201805,
388
+ "grad_norm": 0.07211313396692276,
389
+ "learning_rate": 0.0016199650529201684,
390
+ "loss": 0.2548,
391
+ "step": 875
392
+ },
393
+ {
394
+ "epoch": 0.7383100902379,
395
+ "grad_norm": 0.08206778764724731,
396
+ "learning_rate": 0.0015914860542783522,
397
+ "loss": 0.2479,
398
+ "step": 900
399
+ },
400
+ {
401
+ "epoch": 0.7383100902379,
402
+ "eval_loss": 0.24049904942512512,
403
+ "eval_runtime": 19.5237,
404
+ "eval_samples_per_second": 51.22,
405
+ "eval_steps_per_second": 0.82,
406
+ "step": 900
407
+ },
408
+ {
409
+ "epoch": 0.7588187038556193,
410
+ "grad_norm": 0.08992116898298264,
411
+ "learning_rate": 0.0015622491683052124,
412
+ "loss": 0.2502,
413
+ "step": 925
414
+ },
415
+ {
416
+ "epoch": 0.7793273174733388,
417
+ "grad_norm": 0.08781281113624573,
418
+ "learning_rate": 0.0015322918570259759,
419
+ "loss": 0.2341,
420
+ "step": 950
421
+ },
422
+ {
423
+ "epoch": 0.7793273174733388,
424
+ "eval_loss": 0.23792409896850586,
425
+ "eval_runtime": 19.5338,
426
+ "eval_samples_per_second": 51.193,
427
+ "eval_steps_per_second": 0.819,
428
+ "step": 950
429
+ },
430
+ {
431
+ "epoch": 0.7998359310910582,
432
+ "grad_norm": 0.08975056558847427,
433
+ "learning_rate": 0.0015016525055666057,
434
+ "loss": 0.2471,
435
+ "step": 975
436
+ },
437
+ {
438
+ "epoch": 0.8203445447087777,
439
+ "grad_norm": 0.08536435663700104,
440
+ "learning_rate": 0.001470370372969886,
441
+ "loss": 0.2501,
442
+ "step": 1000
443
+ },
444
+ {
445
+ "epoch": 0.8203445447087777,
446
+ "eval_loss": 0.23709805309772491,
447
+ "eval_runtime": 19.5182,
448
+ "eval_samples_per_second": 51.234,
449
+ "eval_steps_per_second": 0.82,
450
+ "step": 1000
451
+ },
452
+ {
453
+ "epoch": 0.8408531583264971,
454
+ "grad_norm": 0.07575884461402893,
455
+ "learning_rate": 0.0014384855418917311,
456
+ "loss": 0.2366,
457
+ "step": 1025
458
+ },
459
+ {
460
+ "epoch": 0.8613617719442166,
461
+ "grad_norm": 0.08482314646244049,
462
+ "learning_rate": 0.0014060388672421775,
463
+ "loss": 0.2432,
464
+ "step": 1050
465
+ },
466
+ {
467
+ "epoch": 0.8613617719442166,
468
+ "eval_loss": 0.23468729853630066,
469
+ "eval_runtime": 19.4897,
470
+ "eval_samples_per_second": 51.309,
471
+ "eval_steps_per_second": 0.821,
472
+ "step": 1050
473
+ },
474
+ {
475
+ "epoch": 0.881870385561936,
476
+ "grad_norm": 0.08808961510658264,
477
+ "learning_rate": 0.0013730719238368662,
478
+ "loss": 0.248,
479
+ "step": 1075
480
+ },
481
+ {
482
+ "epoch": 0.9023789991796555,
483
+ "grad_norm": 0.0849333256483078,
484
+ "learning_rate": 0.0013396269531260867,
485
+ "loss": 0.24,
486
+ "step": 1100
487
+ },
488
+ {
489
+ "epoch": 0.9023789991796555,
490
+ "eval_loss": 0.23305058479309082,
491
+ "eval_runtime": 19.5254,
492
+ "eval_samples_per_second": 51.215,
493
+ "eval_steps_per_second": 0.819,
494
+ "step": 1100
495
+ },
496
+ {
497
+ "epoch": 0.9228876127973749,
498
+ "grad_norm": 0.13481223583221436,
499
+ "learning_rate": 0.0013057468090696496,
500
+ "loss": 0.2376,
501
+ "step": 1125
502
+ },
503
+ {
504
+ "epoch": 0.9433962264150944,
505
+ "grad_norm": 0.07862788438796997,
506
+ "learning_rate": 0.0012714749032269287,
507
+ "loss": 0.2511,
508
+ "step": 1150
509
+ },
510
+ {
511
+ "epoch": 0.9433962264150944,
512
+ "eval_loss": 0.23162627220153809,
513
+ "eval_runtime": 19.5241,
514
+ "eval_samples_per_second": 51.219,
515
+ "eval_steps_per_second": 0.819,
516
+ "step": 1150
517
+ },
518
+ {
519
+ "epoch": 0.9639048400328137,
520
+ "grad_norm": 0.08472246676683426,
521
+ "learning_rate": 0.0012368551491324358,
522
+ "loss": 0.2415,
523
+ "step": 1175
524
+ },
525
+ {
526
+ "epoch": 0.9844134536505332,
527
+ "grad_norm": 0.07140998542308807,
528
+ "learning_rate": 0.0012019319060282063,
529
+ "loss": 0.2432,
530
+ "step": 1200
531
+ },
532
+ {
533
+ "epoch": 0.9844134536505332,
534
+ "eval_loss": 0.22965233027935028,
535
+ "eval_runtime": 19.5217,
536
+ "eval_samples_per_second": 51.225,
537
+ "eval_steps_per_second": 0.82,
538
+ "step": 1200
539
+ },
540
+ {
541
+ "epoch": 1.0049220672682526,
542
+ "grad_norm": 0.09055250138044357,
543
+ "learning_rate": 0.0011667499220250803,
544
+ "loss": 0.2404,
545
+ "step": 1225
546
+ },
547
+ {
548
+ "epoch": 1.0254306808859721,
549
+ "grad_norm": 0.08059660345315933,
550
+ "learning_rate": 0.0011313542767657204,
551
+ "loss": 0.2058,
552
+ "step": 1250
553
+ },
554
+ {
555
+ "epoch": 1.0254306808859721,
556
+ "eval_loss": 0.23091378808021545,
557
+ "eval_runtime": 19.5068,
558
+ "eval_samples_per_second": 51.264,
559
+ "eval_steps_per_second": 0.82,
560
+ "step": 1250
561
+ },
562
+ {
563
+ "epoch": 1.0459392945036916,
564
+ "grad_norm": 0.08628836274147034,
565
+ "learning_rate": 0.0010957903236628267,
566
+ "loss": 0.2037,
567
+ "step": 1275
568
+ },
569
+ {
570
+ "epoch": 1.066447908121411,
571
+ "grad_norm": 0.08117768913507462,
572
+ "learning_rate": 0.001060103631786563,
573
+ "loss": 0.2115,
574
+ "step": 1300
575
+ },
576
+ {
577
+ "epoch": 1.066447908121411,
578
+ "eval_loss": 0.22918300330638885,
579
+ "eval_runtime": 19.5375,
580
+ "eval_samples_per_second": 51.184,
581
+ "eval_steps_per_second": 0.819,
582
+ "step": 1300
583
+ },
584
+ {
585
+ "epoch": 1.0869565217391304,
586
+ "grad_norm": 0.0947440043091774,
587
+ "learning_rate": 0.0010243399274756564,
588
+ "loss": 0.2192,
589
+ "step": 1325
590
+ },
591
+ {
592
+ "epoch": 1.1074651353568499,
593
+ "grad_norm": 0.09542486071586609,
594
+ "learning_rate": 0.0009885450357469806,
595
+ "loss": 0.2086,
596
+ "step": 1350
597
+ },
598
+ {
599
+ "epoch": 1.1074651353568499,
600
+ "eval_loss": 0.2292700856924057,
601
+ "eval_runtime": 19.5283,
602
+ "eval_samples_per_second": 51.208,
603
+ "eval_steps_per_second": 0.819,
604
+ "step": 1350
605
+ },
606
+ {
607
+ "epoch": 1.1279737489745694,
608
+ "grad_norm": 0.0885721817612648,
609
+ "learning_rate": 0.0009527648215787065,
610
+ "loss": 0.2072,
611
+ "step": 1375
612
+ },
613
+ {
614
+ "epoch": 1.1484823625922886,
615
+ "grad_norm": 0.10220566391944885,
616
+ "learning_rate": 0.000917045131142242,
617
+ "loss": 0.2056,
618
+ "step": 1400
619
+ },
620
+ {
621
+ "epoch": 1.1484823625922886,
622
+ "eval_loss": 0.2266603708267212,
623
+ "eval_runtime": 19.5243,
624
+ "eval_samples_per_second": 51.218,
625
+ "eval_steps_per_second": 0.819,
626
+ "step": 1400
627
+ },
628
+ {
629
+ "epoch": 1.1689909762100081,
630
+ "grad_norm": 0.08700072765350342,
631
+ "learning_rate": 0.0008814317330582753,
632
+ "loss": 0.2087,
633
+ "step": 1425
634
+ },
635
+ {
636
+ "epoch": 1.1894995898277276,
637
+ "grad_norm": 0.09305619448423386,
638
+ "learning_rate": 0.000845970259752183,
639
+ "loss": 0.2154,
640
+ "step": 1450
641
+ },
642
+ {
643
+ "epoch": 1.1894995898277276,
644
+ "eval_loss": 0.225525364279747,
645
+ "eval_runtime": 19.536,
646
+ "eval_samples_per_second": 51.188,
647
+ "eval_steps_per_second": 0.819,
648
+ "step": 1450
649
+ },
650
+ {
651
+ "epoch": 1.2100082034454471,
652
+ "grad_norm": 0.08418343216180801,
653
+ "learning_rate": 0.0008107061489839498,
654
+ "loss": 0.2074,
655
+ "step": 1475
656
+ },
657
+ {
658
+ "epoch": 1.2305168170631666,
659
+ "grad_norm": 0.08762308210134506,
660
+ "learning_rate": 0.0007756845856275194,
661
+ "loss": 0.2142,
662
+ "step": 1500
663
+ },
664
+ {
665
+ "epoch": 1.2305168170631666,
666
+ "eval_loss": 0.2249392718076706,
667
+ "eval_runtime": 19.5325,
668
+ "eval_samples_per_second": 51.197,
669
+ "eval_steps_per_second": 0.819,
670
+ "step": 1500
671
+ },
672
+ {
673
+ "epoch": 1.251025430680886,
674
+ "grad_norm": 0.08624199777841568,
675
+ "learning_rate": 0.0007409504437741722,
676
+ "loss": 0.2036,
677
+ "step": 1525
678
+ },
679
+ {
680
+ "epoch": 1.2715340442986054,
681
+ "grad_norm": 0.10422079265117645,
682
+ "learning_rate": 0.0007065482292341205,
683
+ "loss": 0.2064,
684
+ "step": 1550
685
+ },
686
+ {
687
+ "epoch": 1.2715340442986054,
688
+ "eval_loss": 0.22372664511203766,
689
+ "eval_runtime": 19.4993,
690
+ "eval_samples_per_second": 51.284,
691
+ "eval_steps_per_second": 0.821,
692
+ "step": 1550
693
+ },
694
+ {
695
+ "epoch": 1.2920426579163249,
696
+ "grad_norm": 0.10072669386863708,
697
+ "learning_rate": 0.0006725220225099911,
698
+ "loss": 0.2026,
699
+ "step": 1575
700
+ },
701
+ {
702
+ "epoch": 1.3125512715340442,
703
+ "grad_norm": 0.08004007488489151,
704
+ "learning_rate": 0.0006389154223152666,
705
+ "loss": 0.1966,
706
+ "step": 1600
707
+ },
708
+ {
709
+ "epoch": 1.3125512715340442,
710
+ "eval_loss": 0.22237621247768402,
711
+ "eval_runtime": 19.5461,
712
+ "eval_samples_per_second": 51.161,
713
+ "eval_steps_per_second": 0.819,
714
+ "step": 1600
715
+ },
716
+ {
717
+ "epoch": 1.3330598851517639,
718
+ "grad_norm": 0.08824878185987473,
719
+ "learning_rate": 0.0006057714897100551,
720
+ "loss": 0.2107,
721
+ "step": 1625
722
+ },
723
+ {
724
+ "epoch": 1.3535684987694832,
725
+ "grad_norm": 0.09968467056751251,
726
+ "learning_rate": 0.0005731326929257713,
727
+ "loss": 0.2025,
728
+ "step": 1650
729
+ },
730
+ {
731
+ "epoch": 1.3535684987694832,
732
+ "eval_loss": 0.22190262377262115,
733
+ "eval_runtime": 19.5253,
734
+ "eval_samples_per_second": 51.215,
735
+ "eval_steps_per_second": 0.819,
736
+ "step": 1650
737
+ },
738
+ {
739
+ "epoch": 1.3740771123872026,
740
+ "grad_norm": 0.08944033086299896,
741
+ "learning_rate": 0.0005410408529494251,
742
+ "loss": 0.2009,
743
+ "step": 1675
744
+ },
745
+ {
746
+ "epoch": 1.3945857260049221,
747
+ "grad_norm": 0.10983427613973618,
748
+ "learning_rate": 0.0005095370899372412,
749
+ "loss": 0.2047,
750
+ "step": 1700
751
+ },
752
+ {
753
+ "epoch": 1.3945857260049221,
754
+ "eval_loss": 0.2195666879415512,
755
+ "eval_runtime": 19.5298,
756
+ "eval_samples_per_second": 51.204,
757
+ "eval_steps_per_second": 0.819,
758
+ "step": 1700
759
+ },
760
+ {
761
+ "epoch": 1.4150943396226414,
762
+ "grad_norm": 0.0977800264954567,
763
+ "learning_rate": 0.0004786617705262746,
764
+ "loss": 0.1974,
765
+ "step": 1725
766
+ },
767
+ {
768
+ "epoch": 1.435602953240361,
769
+ "grad_norm": 0.09195175021886826,
770
+ "learning_rate": 0.000448454456111529,
771
+ "loss": 0.2108,
772
+ "step": 1750
773
+ },
774
+ {
775
+ "epoch": 1.435602953240361,
776
+ "eval_loss": 0.21818041801452637,
777
+ "eval_runtime": 19.4982,
778
+ "eval_samples_per_second": 51.287,
779
+ "eval_steps_per_second": 0.821,
780
+ "step": 1750
781
+ },
782
+ {
783
+ "epoch": 1.4561115668580804,
784
+ "grad_norm": 0.07893190532922745,
785
+ "learning_rate": 0.0004189538521548524,
786
+ "loss": 0.2039,
787
+ "step": 1775
788
+ },
789
+ {
790
+ "epoch": 1.4766201804758,
791
+ "grad_norm": 0.08887404948472977,
792
+ "learning_rate": 0.00039019775859056916,
793
+ "loss": 0.2013,
794
+ "step": 1800
795
+ },
796
+ {
797
+ "epoch": 1.4766201804758,
798
+ "eval_loss": 0.21688690781593323,
799
+ "eval_runtime": 19.5295,
800
+ "eval_samples_per_second": 51.205,
801
+ "eval_steps_per_second": 0.819,
802
+ "step": 1800
803
+ },
804
+ {
805
+ "epoch": 1.4971287940935194,
806
+ "grad_norm": 0.10588109493255615,
807
+ "learning_rate": 0.0003622230213913836,
808
+ "loss": 0.1978,
809
+ "step": 1825
810
+ },
811
+ {
812
+ "epoch": 1.5176374077112387,
813
+ "grad_norm": 0.09462971240282059,
814
+ "learning_rate": 0.0003350654853566223,
815
+ "loss": 0.2083,
816
+ "step": 1850
817
+ },
818
+ {
819
+ "epoch": 1.5176374077112387,
820
+ "eval_loss": 0.21524257957935333,
821
+ "eval_runtime": 19.5333,
822
+ "eval_samples_per_second": 51.195,
823
+ "eval_steps_per_second": 0.819,
824
+ "step": 1850
825
+ },
826
+ {
827
+ "epoch": 1.5381460213289582,
828
+ "grad_norm": 0.092497818171978,
829
+ "learning_rate": 0.00030875994818330957,
830
+ "loss": 0.1978,
831
+ "step": 1875
832
+ },
833
+ {
834
+ "epoch": 1.5586546349466777,
835
+ "grad_norm": 0.0886370837688446,
836
+ "learning_rate": 0.0002833401158789207,
837
+ "loss": 0.2083,
838
+ "step": 1900
839
+ },
840
+ {
841
+ "epoch": 1.5586546349466777,
842
+ "eval_loss": 0.21452394127845764,
843
+ "eval_runtime": 19.5059,
844
+ "eval_samples_per_second": 51.266,
845
+ "eval_steps_per_second": 0.82,
846
+ "step": 1900
847
+ },
848
+ {
849
+ "epoch": 1.579163248564397,
850
+ "grad_norm": 0.09655001759529114,
851
+ "learning_rate": 0.00025883855957295053,
852
+ "loss": 0.192,
853
+ "step": 1925
854
+ },
855
+ {
856
+ "epoch": 1.5996718621821167,
857
+ "grad_norm": 0.08523295074701309,
858
+ "learning_rate": 0.0002352866737826277,
859
+ "loss": 0.2012,
860
+ "step": 1950
861
+ },
862
+ {
863
+ "epoch": 1.5996718621821167,
864
+ "eval_loss": 0.21258948743343353,
865
+ "eval_runtime": 19.4933,
866
+ "eval_samples_per_second": 51.3,
867
+ "eval_steps_per_second": 0.821,
868
+ "step": 1950
869
+ },
870
+ {
871
+ "epoch": 1.620180475799836,
872
+ "grad_norm": 0.09865439683198929,
873
+ "learning_rate": 0.00021271463618625986,
874
+ "loss": 0.1998,
875
+ "step": 1975
876
+ },
877
+ {
878
+ "epoch": 1.6406890894175554,
879
+ "grad_norm": 0.10992613434791565,
880
+ "learning_rate": 0.00019115136895574402,
881
+ "loss": 0.1974,
882
+ "step": 2000
883
+ },
884
+ {
885
+ "epoch": 1.6406890894175554,
886
+ "eval_loss": 0.21244320273399353,
887
+ "eval_runtime": 19.5047,
888
+ "eval_samples_per_second": 51.27,
889
+ "eval_steps_per_second": 0.82,
890
+ "step": 2000
891
+ },
892
+ {
893
+ "epoch": 1.661197703035275,
894
+ "grad_norm": 0.09869453310966492,
895
+ "learning_rate": 0.0001706245016977931,
896
+ "loss": 0.1984,
897
+ "step": 2025
898
+ },
899
+ {
900
+ "epoch": 1.6817063166529942,
901
+ "grad_norm": 0.08547994494438171,
902
+ "learning_rate": 0.00015116033605136182,
903
+ "loss": 0.2059,
904
+ "step": 2050
905
+ },
906
+ {
907
+ "epoch": 1.6817063166529942,
908
+ "eval_loss": 0.2115185409784317,
909
+ "eval_runtime": 19.4906,
910
+ "eval_samples_per_second": 51.307,
911
+ "eval_steps_per_second": 0.821,
912
+ "step": 2050
913
+ },
914
+ {
915
+ "epoch": 1.7022149302707137,
916
+ "grad_norm": 0.09922394901514053,
917
+ "learning_rate": 0.00013278381198663492,
918
+ "loss": 0.194,
919
+ "step": 2075
920
+ },
921
+ {
922
+ "epoch": 1.7227235438884332,
923
+ "grad_norm": 0.09176763892173767,
924
+ "learning_rate": 0.0001155184758487573,
925
+ "loss": 0.1929,
926
+ "step": 2100
927
+ },
928
+ {
929
+ "epoch": 1.7227235438884332,
930
+ "eval_loss": 0.21128395199775696,
931
+ "eval_runtime": 19.5212,
932
+ "eval_samples_per_second": 51.226,
933
+ "eval_steps_per_second": 0.82,
934
+ "step": 2100
935
+ },
936
+ {
937
+ "epoch": 1.7432321575061525,
938
+ "grad_norm": 0.10438892245292664,
939
+ "learning_rate": 9.938645018725523e-05,
940
+ "loss": 0.1987,
941
+ "step": 2125
942
+ },
943
+ {
944
+ "epoch": 1.7637407711238722,
945
+ "grad_norm": 0.10125499963760376,
946
+ "learning_rate": 8.440840540980587e-05,
947
+ "loss": 0.192,
948
+ "step": 2150
949
+ },
950
+ {
951
+ "epoch": 1.7637407711238722,
952
+ "eval_loss": 0.21018683910369873,
953
+ "eval_runtime": 19.5296,
954
+ "eval_samples_per_second": 51.204,
955
+ "eval_steps_per_second": 0.819,
956
+ "step": 2150
957
+ },
958
+ {
959
+ "epoch": 1.7842493847415914,
960
+ "grad_norm": 0.1060590147972107,
961
+ "learning_rate": 7.060353329667668e-05,
962
+ "loss": 0.205,
963
+ "step": 2175
964
+ },
965
+ {
966
+ "epoch": 1.804757998359311,
967
+ "grad_norm": 0.09252982586622238,
968
+ "learning_rate": 5.798952240976951e-05,
969
+ "loss": 0.197,
970
+ "step": 2200
971
+ },
972
+ {
973
+ "epoch": 1.804757998359311,
974
+ "eval_loss": 0.2095346748828888,
975
+ "eval_runtime": 19.5291,
976
+ "eval_samples_per_second": 51.206,
977
+ "eval_steps_per_second": 0.819,
978
+ "step": 2200
979
+ },
980
+ {
981
+ "epoch": 1.8252666119770304,
982
+ "grad_norm": 0.08407966792583466,
983
+ "learning_rate": 4.65825354277799e-05,
984
+ "loss": 0.1919,
985
+ "step": 2225
986
+ },
987
+ {
988
+ "epoch": 1.8457752255947497,
989
+ "grad_norm": 0.10161825269460678,
990
+ "learning_rate": 3.639718843651363e-05,
991
+ "loss": 0.1963,
992
+ "step": 2250
993
+ },
994
+ {
995
+ "epoch": 1.8457752255947497,
996
+ "eval_loss": 0.20933018624782562,
997
+ "eval_runtime": 19.5415,
998
+ "eval_samples_per_second": 51.173,
999
+ "eval_steps_per_second": 0.819,
1000
+ "step": 2250
1001
+ },
1002
+ {
1003
+ "epoch": 1.8662838392124692,
1004
+ "grad_norm": 0.09280096739530563,
1005
+ "learning_rate": 2.7446532200894104e-05,
1006
+ "loss": 0.1916,
1007
+ "step": 2275
1008
+ },
1009
+ {
1010
+ "epoch": 1.8867924528301887,
1011
+ "grad_norm": 0.11424104869365692,
1012
+ "learning_rate": 1.9742035442658403e-05,
1013
+ "loss": 0.2008,
1014
+ "step": 2300
1015
+ },
1016
+ {
1017
+ "epoch": 1.8867924528301887,
1018
+ "eval_loss": 0.20905601978302002,
1019
+ "eval_runtime": 19.5492,
1020
+ "eval_samples_per_second": 51.153,
1021
+ "eval_steps_per_second": 0.818,
1022
+ "step": 2300
1023
+ },
1024
+ {
1025
+ "epoch": 1.907301066447908,
1026
+ "grad_norm": 0.08300191909074783,
1027
+ "learning_rate": 1.3293570145169742e-05,
1028
+ "loss": 0.1947,
1029
+ "step": 2325
1030
+ },
1031
+ {
1032
+ "epoch": 1.9278096800656277,
1033
+ "grad_norm": 0.08883219957351685,
1034
+ "learning_rate": 8.109398904173282e-06,
1035
+ "loss": 0.1932,
1036
+ "step": 2350
1037
+ },
1038
+ {
1039
+ "epoch": 1.9278096800656277,
1040
+ "eval_loss": 0.20888373255729675,
1041
+ "eval_runtime": 19.541,
1042
+ "eval_samples_per_second": 51.174,
1043
+ "eval_steps_per_second": 0.819,
1044
+ "step": 2350
1045
+ },
1046
+ {
1047
+ "epoch": 1.948318293683347,
1048
+ "grad_norm": 0.08763596415519714,
1049
+ "learning_rate": 4.196164340705577e-06,
1050
+ "loss": 0.1902,
1051
+ "step": 2375
1052
+ },
1053
+ {
1054
+ "epoch": 1.9688269073010665,
1055
+ "grad_norm": 0.07976502925157547,
1056
+ "learning_rate": 1.5588805897215342e-06,
1057
+ "loss": 0.1963,
1058
+ "step": 2400
1059
+ },
1060
+ {
1061
+ "epoch": 1.9688269073010665,
1062
+ "eval_loss": 0.2088588923215866,
1063
+ "eval_runtime": 19.5331,
1064
+ "eval_samples_per_second": 51.195,
1065
+ "eval_steps_per_second": 0.819,
1066
+ "step": 2400
1067
+ },
1068
+ {
1069
+ "epoch": 1.989335520918786,
1070
+ "grad_norm": 0.08866149187088013,
1071
+ "learning_rate": 2.0092687534589705e-07,
1072
+ "loss": 0.1948,
1073
+ "step": 2425
1074
+ },
1075
+ {
1076
+ "epoch": 2.0,
1077
+ "step": 2438,
1078
+ "total_flos": 1.58523627405312e+18,
1079
+ "train_loss": 0.25077993895207595,
1080
+ "train_runtime": 3357.8107,
1081
+ "train_samples_per_second": 23.229,
1082
+ "train_steps_per_second": 0.726
1083
+ }
1084
+ ],
1085
+ "logging_steps": 25,
1086
+ "max_steps": 2438,
1087
+ "num_input_tokens_seen": 0,
1088
+ "num_train_epochs": 2,
1089
+ "save_steps": 500,
1090
+ "stateful_callbacks": {
1091
+ "TrainerControl": {
1092
+ "args": {
1093
+ "should_epoch_stop": false,
1094
+ "should_evaluate": false,
1095
+ "should_log": false,
1096
+ "should_save": true,
1097
+ "should_training_stop": true
1098
+ },
1099
+ "attributes": {}
1100
+ }
1101
+ },
1102
+ "total_flos": 1.58523627405312e+18,
1103
+ "train_batch_size": 32,
1104
+ "trial_name": null,
1105
+ "trial_params": null
1106
+ }
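
Each trainer_state.json in this upload is the Hugging Face Trainer's saved state: log_history interleaves training entries (loss, grad_norm, learning_rate, logged every 25 steps) with evaluation entries (eval_loss and timing, every 50 steps), followed by run-level totals such as train_loss and total_flos. A minimal sketch for extracting the eval-loss curve with the standard library only (path from this diff; the other runs work the same way):

import json

with open("nl_tasks/run_all/exnr14/trainer_state.json") as f:
    state = json.load(f)

# Keep only evaluation records and print (step, eval_loss) pairs.
for entry in state["log_history"]:
    if "eval_loss" in entry:
        print(entry["step"], entry["eval_loss"])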
nl_tasks/run_all/exnr15/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": false,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 4,
+ "peft_type": "ROTATION",
+ "r": 4,
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/run_all/exnr15/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "[PAD]": 32000
+ }
nl_tasks/run_all/exnr15/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "bos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
nl_tasks/run_all/exnr15/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/run_all/exnr15/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
nl_tasks/run_all/exnr15/ft/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "</s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 512,
+ "pad_token": "[PAD]",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "</s>",
+ "use_default_system_prompt": false
+ }
nl_tasks/run_all/exnr15/ft/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e78d8cf553e1bd2f18138af07f897785457fbcd326b6f0026cb8b64209a2ab44
+ size 6545
nl_tasks/run_all/exnr15/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": true,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 4,
+ "peft_type": "ROTATION",
+ "r": 4,
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/run_all/exnr15/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:80a7fb292572659a561e65a657b939feb3d3203477c08f5e6ace0852fb4af627
+ size 33602659
nl_tasks/run_all/exnr15/trainer_state.json ADDED
@@ -0,0 +1,135 @@
+ {
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 0.16406890894175555,
+ "eval_steps": 50,
+ "global_step": 200,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.020508613617719443,
+ "grad_norm": 0.0006118718883953989,
+ "learning_rate": 0.0009987820251299122,
+ "loss": 0.7714,
+ "step": 25
+ },
+ {
+ "epoch": 0.04101722723543889,
+ "grad_norm": 0.0006668232963420451,
+ "learning_rate": 0.0009373098535696979,
+ "loss": 0.7596,
+ "step": 50
+ },
+ {
+ "epoch": 0.04101722723543889,
+ "eval_loss": 0.7446768283843994,
+ "eval_model_preparation_time": 0.009,
+ "eval_runtime": 53.4492,
+ "eval_samples_per_second": 18.709,
+ "eval_steps_per_second": 0.299,
+ "step": 50
+ },
+ {
+ "epoch": 0.06152584085315833,
+ "grad_norm": 0.0006718848599120975,
+ "learning_rate": 0.0007938926261462366,
+ "loss": 0.7554,
+ "step": 75
+ },
+ {
+ "epoch": 0.08203445447087777,
+ "grad_norm": 0.0006972206756472588,
+ "learning_rate": 0.0005954044976882724,
+ "loss": 0.7346,
+ "step": 100
+ },
+ {
+ "epoch": 0.08203445447087777,
+ "eval_loss": 0.7248194813728333,
+ "eval_model_preparation_time": 0.009,
+ "eval_runtime": 19.7467,
+ "eval_samples_per_second": 50.641,
+ "eval_steps_per_second": 0.81,
+ "step": 100
+ },
+ {
+ "epoch": 0.10254306808859721,
+ "grad_norm": 0.0006274359184317291,
+ "learning_rate": 0.0003790390522001662,
+ "loss": 0.7366,
+ "step": 125
+ },
+ {
+ "epoch": 0.12305168170631665,
+ "grad_norm": 0.0005907765007577837,
+ "learning_rate": 0.00018533980447508135,
+ "loss": 0.7368,
+ "step": 150
+ },
+ {
+ "epoch": 0.12305168170631665,
+ "eval_loss": 0.7148993015289307,
+ "eval_model_preparation_time": 0.009,
+ "eval_runtime": 19.6656,
+ "eval_samples_per_second": 50.85,
+ "eval_steps_per_second": 0.814,
+ "step": 150
+ },
+ {
+ "epoch": 0.1435602953240361,
+ "grad_norm": 0.0007314748945645988,
+ "learning_rate": 5.060297685041659e-05,
+ "loss": 0.7228,
+ "step": 175
+ },
+ {
+ "epoch": 0.16406890894175555,
+ "grad_norm": 0.0006935550482012331,
+ "learning_rate": 7.615242180436521e-08,
+ "loss": 0.7275,
+ "step": 200
+ },
+ {
+ "epoch": 0.16406890894175555,
+ "eval_loss": 0.7132272720336914,
+ "eval_model_preparation_time": 0.009,
+ "eval_runtime": 19.665,
+ "eval_samples_per_second": 50.852,
+ "eval_steps_per_second": 0.814,
+ "step": 200
+ },
+ {
+ "epoch": 0.16406890894175555,
+ "step": 200,
+ "total_flos": 1.30070668640256e+17,
+ "train_loss": 0.7430903720855713,
+ "train_runtime": 454.6042,
+ "train_samples_per_second": 14.078,
+ "train_steps_per_second": 0.44
+ }
+ ],
+ "logging_steps": 25,
+ "max_steps": 200,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 1,
+ "save_steps": 500,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.30070668640256e+17,
+ "train_batch_size": 32,
+ "trial_name": null,
+ "trial_params": null
+ }
nl_tasks/run_all/run_exnr10/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": false,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 8,
+ "peft_type": "ROTATION",
+ "r": 2,
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/run_all/run_exnr10/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "[PAD]": 32000
+ }
nl_tasks/run_all/run_exnr10/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "bos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
nl_tasks/run_all/run_exnr10/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/run_all/run_exnr10/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
nl_tasks/run_all/run_exnr10/ft/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "</s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 512,
+ "pad_token": "[PAD]",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "</s>",
+ "use_default_system_prompt": false
+ }
nl_tasks/run_all/run_exnr10/ft/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:80200b92f1eba2aa222a8b5e89fb435709191eb6ae0eb5fad588e902f3ef01b1
+ size 6481
nl_tasks/run_all/run_exnr10/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": true,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 8,
+ "peft_type": "ROTATION",
+ "r": 2,
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/run_all/run_exnr10/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:373029eeee0417e9247131c310db05bcef947e17ea80622d6498c15ab27e173e
+ size 33602659
nl_tasks/run_all/run_exnr10/trainer_state.json ADDED
@@ -0,0 +1,1106 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 50,
7
+ "global_step": 2438,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.020508613617719443,
14
+ "grad_norm": 0.23067046701908112,
15
+ "learning_rate": 9.836065573770491e-05,
16
+ "loss": 0.6777,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.04101722723543889,
21
+ "grad_norm": 0.18888919055461884,
22
+ "learning_rate": 0.00020081967213114754,
23
+ "loss": 0.4155,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.04101722723543889,
28
+ "eval_loss": 0.37254372239112854,
29
+ "eval_runtime": 21.1288,
30
+ "eval_samples_per_second": 47.329,
31
+ "eval_steps_per_second": 0.757,
32
+ "step": 50
33
+ },
34
+ {
35
+ "epoch": 0.06152584085315833,
36
+ "grad_norm": 0.18046127259731293,
37
+ "learning_rate": 0.0003032786885245902,
38
+ "loss": 0.3704,
39
+ "step": 75
40
+ },
41
+ {
42
+ "epoch": 0.08203445447087777,
43
+ "grad_norm": 0.19047358632087708,
44
+ "learning_rate": 0.0004057377049180328,
45
+ "loss": 0.3503,
46
+ "step": 100
47
+ },
48
+ {
49
+ "epoch": 0.08203445447087777,
50
+ "eval_loss": 0.32894426584243774,
51
+ "eval_runtime": 20.8119,
52
+ "eval_samples_per_second": 48.049,
53
+ "eval_steps_per_second": 0.769,
54
+ "step": 100
55
+ },
56
+ {
57
+ "epoch": 0.10254306808859721,
58
+ "grad_norm": 0.1786128282546997,
59
+ "learning_rate": 0.0005081967213114754,
60
+ "loss": 0.3301,
61
+ "step": 125
62
+ },
63
+ {
64
+ "epoch": 0.12305168170631665,
65
+ "grad_norm": 0.17241524159908295,
66
+ "learning_rate": 0.000610655737704918,
67
+ "loss": 0.3158,
68
+ "step": 150
69
+ },
70
+ {
71
+ "epoch": 0.12305168170631665,
72
+ "eval_loss": 0.31440791487693787,
73
+ "eval_runtime": 20.778,
74
+ "eval_samples_per_second": 48.128,
75
+ "eval_steps_per_second": 0.77,
76
+ "step": 150
77
+ },
78
+ {
79
+ "epoch": 0.1435602953240361,
80
+ "grad_norm": 0.18751497566699982,
81
+ "learning_rate": 0.0007131147540983607,
82
+ "loss": 0.313,
83
+ "step": 175
84
+ },
85
+ {
86
+ "epoch": 0.16406890894175555,
87
+ "grad_norm": 0.24119554460048676,
88
+ "learning_rate": 0.0008155737704918033,
89
+ "loss": 0.3209,
90
+ "step": 200
91
+ },
92
+ {
93
+ "epoch": 0.16406890894175555,
94
+ "eval_loss": 0.30967944860458374,
95
+ "eval_runtime": 20.8466,
96
+ "eval_samples_per_second": 47.969,
97
+ "eval_steps_per_second": 0.768,
98
+ "step": 200
99
+ },
100
+ {
101
+ "epoch": 0.184577522559475,
102
+ "grad_norm": 0.28959155082702637,
103
+ "learning_rate": 0.0009180327868852459,
104
+ "loss": 0.3282,
105
+ "step": 225
106
+ },
107
+ {
108
+ "epoch": 0.20508613617719443,
109
+ "grad_norm": 0.2528744637966156,
110
+ "learning_rate": 0.0009999871854116063,
111
+ "loss": 0.3229,
112
+ "step": 250
113
+ },
114
+ {
115
+ "epoch": 0.20508613617719443,
116
+ "eval_loss": 0.30937159061431885,
117
+ "eval_runtime": 20.8274,
118
+ "eval_samples_per_second": 48.014,
119
+ "eval_steps_per_second": 0.768,
120
+ "step": 250
121
+ },
122
+ {
123
+ "epoch": 0.22559474979491387,
124
+ "grad_norm": 0.5322397351264954,
125
+ "learning_rate": 0.0009995387437838027,
126
+ "loss": 0.3225,
127
+ "step": 275
128
+ },
129
+ {
130
+ "epoch": 0.2461033634126333,
131
+ "grad_norm": 13.223990440368652,
132
+ "learning_rate": 0.000998450229439693,
133
+ "loss": 0.354,
134
+ "step": 300
135
+ },
136
+ {
137
+ "epoch": 0.2461033634126333,
138
+ "eval_loss": 0.3210923969745636,
139
+ "eval_runtime": 20.8168,
140
+ "eval_samples_per_second": 48.038,
141
+ "eval_steps_per_second": 0.769,
142
+ "step": 300
143
+ },
144
+ {
145
+ "epoch": 0.2666119770303528,
146
+ "grad_norm": 0.3021000325679779,
147
+ "learning_rate": 0.000996723037122612,
148
+ "loss": 0.3151,
149
+ "step": 325
150
+ },
151
+ {
152
+ "epoch": 0.2871205906480722,
153
+ "grad_norm": 0.21538248658180237,
154
+ "learning_rate": 0.0009943593799315263,
155
+ "loss": 0.3059,
156
+ "step": 350
157
+ },
158
+ {
159
+ "epoch": 0.2871205906480722,
160
+ "eval_loss": 0.29110386967658997,
161
+ "eval_runtime": 20.8154,
162
+ "eval_samples_per_second": 48.041,
163
+ "eval_steps_per_second": 0.769,
164
+ "step": 350
165
+ },
166
+ {
167
+ "epoch": 0.30762920426579166,
168
+ "grad_norm": 0.2078874111175537,
169
+ "learning_rate": 0.0009913622864853324,
170
+ "loss": 0.2942,
171
+ "step": 375
172
+ },
173
+ {
174
+ "epoch": 0.3281378178835111,
175
+ "grad_norm": 0.2866668105125427,
176
+ "learning_rate": 0.0009877355970422024,
177
+ "loss": 0.3038,
178
+ "step": 400
179
+ },
180
+ {
181
+ "epoch": 0.3281378178835111,
182
+ "eval_loss": 0.28084269165992737,
183
+ "eval_runtime": 20.8237,
184
+ "eval_samples_per_second": 48.022,
185
+ "eval_steps_per_second": 0.768,
186
+ "step": 400
187
+ },
188
+ {
189
+ "epoch": 0.34864643150123054,
190
+ "grad_norm": 0.16703106462955475,
191
+ "learning_rate": 0.0009834839585789559,
192
+ "loss": 0.2962,
193
+ "step": 425
194
+ },
195
+ {
196
+ "epoch": 0.36915504511895,
197
+ "grad_norm": 0.18550001084804535,
198
+ "learning_rate": 0.000978612818836762,
199
+ "loss": 0.292,
200
+ "step": 450
201
+ },
202
+ {
203
+ "epoch": 0.36915504511895,
204
+ "eval_loss": 0.2726050913333893,
205
+ "eval_runtime": 20.8143,
206
+ "eval_samples_per_second": 48.044,
207
+ "eval_steps_per_second": 0.769,
208
+ "step": 450
209
+ },
210
+ {
211
+ "epoch": 0.3896636587366694,
212
+ "grad_norm": 0.18142051994800568,
213
+ "learning_rate": 0.0009731284193407981,
214
+ "loss": 0.287,
215
+ "step": 475
216
+ },
217
+ {
218
+ "epoch": 0.41017227235438886,
219
+ "grad_norm": 0.15791696310043335,
220
+ "learning_rate": 0.0009670377874028117,
221
+ "loss": 0.2788,
222
+ "step": 500
223
+ },
224
+ {
225
+ "epoch": 0.41017227235438886,
226
+ "eval_loss": 0.2660056948661804,
227
+ "eval_runtime": 20.7894,
228
+ "eval_samples_per_second": 48.101,
229
+ "eval_steps_per_second": 0.77,
230
+ "step": 500
231
+ },
232
+ {
233
+ "epoch": 0.4306808859721083,
234
+ "grad_norm": 0.17865672707557678,
235
+ "learning_rate": 0.0009603487271168336,
236
+ "loss": 0.2818,
237
+ "step": 525
238
+ },
239
+ {
240
+ "epoch": 0.45118949958982774,
241
+ "grad_norm": 0.1716611534357071,
242
+ "learning_rate": 0.0009530698093595781,
243
+ "loss": 0.2754,
244
+ "step": 550
245
+ },
246
+ {
247
+ "epoch": 0.45118949958982774,
248
+ "eval_loss": 0.25951725244522095,
249
+ "eval_runtime": 20.6867,
250
+ "eval_samples_per_second": 48.34,
251
+ "eval_steps_per_second": 0.773,
252
+ "step": 550
253
+ },
254
+ {
255
+ "epoch": 0.4716981132075472,
256
+ "grad_norm": 0.15952439606189728,
257
+ "learning_rate": 0.0009452103608083418,
258
+ "loss": 0.2624,
259
+ "step": 575
260
+ },
261
+ {
262
+ "epoch": 0.4922067268252666,
263
+ "grad_norm": 0.15508660674095154,
264
+ "learning_rate": 0.0009367804519904775,
265
+ "loss": 0.2707,
266
+ "step": 600
267
+ },
268
+ {
269
+ "epoch": 0.4922067268252666,
270
+ "eval_loss": 0.2577713131904602,
271
+ "eval_runtime": 20.8355,
272
+ "eval_samples_per_second": 47.995,
273
+ "eval_steps_per_second": 0.768,
274
+ "step": 600
275
+ },
276
+ {
277
+ "epoch": 0.5127153404429861,
278
+ "grad_norm": 0.14390869438648224,
279
+ "learning_rate": 0.0009277908843797492,
280
+ "loss": 0.258,
281
+ "step": 625
282
+ },
283
+ {
284
+ "epoch": 0.5332239540607056,
285
+ "grad_norm": 0.15374627709388733,
286
+ "learning_rate": 0.0009182531765561084,
287
+ "loss": 0.2575,
288
+ "step": 650
289
+ },
290
+ {
291
+ "epoch": 0.5332239540607056,
292
+ "eval_loss": 0.25287148356437683,
293
+ "eval_runtime": 20.8679,
294
+ "eval_samples_per_second": 47.921,
295
+ "eval_steps_per_second": 0.767,
296
+ "step": 650
297
+ },
298
+ {
299
+ "epoch": 0.5537325676784249,
300
+ "grad_norm": 0.13985492289066315,
301
+ "learning_rate": 0.0009081795494466201,
302
+ "loss": 0.2589,
303
+ "step": 675
304
+ },
305
+ {
306
+ "epoch": 0.5742411812961444,
307
+ "grad_norm": 0.11946714669466019,
308
+ "learning_rate": 0.0008975829106664539,
309
+ "loss": 0.2502,
310
+ "step": 700
311
+ },
312
+ {
313
+ "epoch": 0.5742411812961444,
314
+ "eval_loss": 0.24979238212108612,
315
+ "eval_runtime": 20.8494,
316
+ "eval_samples_per_second": 47.963,
317
+ "eval_steps_per_second": 0.767,
318
+ "step": 700
319
+ },
320
+ {
321
+ "epoch": 0.5947497949138638,
322
+ "grad_norm": 0.11909514665603638,
323
+ "learning_rate": 0.0008864768379800017,
324
+ "loss": 0.2475,
325
+ "step": 725
326
+ },
327
+ {
328
+ "epoch": 0.6152584085315833,
329
+ "grad_norm": 0.12616503238677979,
330
+ "learning_rate": 0.0008748755619033153,
331
+ "loss": 0.257,
332
+ "step": 750
333
+ },
334
+ {
335
+ "epoch": 0.6152584085315833,
336
+ "eval_loss": 0.24706576764583588,
337
+ "eval_runtime": 20.8598,
338
+ "eval_samples_per_second": 47.939,
339
+ "eval_steps_per_second": 0.767,
340
+ "step": 750
341
+ },
342
+ {
343
+ "epoch": 0.6357670221493027,
344
+ "grad_norm": 0.1338769644498825,
345
+ "learning_rate": 0.000862793947470155,
346
+ "loss": 0.262,
347
+ "step": 775
348
+ },
349
+ {
350
+ "epoch": 0.6562756357670222,
351
+ "grad_norm": 0.1327950358390808,
352
+ "learning_rate": 0.0008502474751850142,
353
+ "loss": 0.2512,
354
+ "step": 800
355
+ },
356
+ {
357
+ "epoch": 0.6562756357670222,
358
+ "eval_loss": 0.24406881630420685,
359
+ "eval_runtime": 20.8661,
360
+ "eval_samples_per_second": 47.925,
361
+ "eval_steps_per_second": 0.767,
362
+ "step": 800
363
+ },
364
+ {
365
+ "epoch": 0.6767842493847416,
366
+ "grad_norm": 0.11789478361606598,
367
+ "learning_rate": 0.0008372522211875224,
368
+ "loss": 0.2468,
369
+ "step": 825
370
+ },
371
+ {
372
+ "epoch": 0.6972928630024611,
373
+ "grad_norm": 0.114653080701828,
374
+ "learning_rate": 0.0008238248366536473,
375
+ "loss": 0.2593,
376
+ "step": 850
377
+ },
378
+ {
379
+ "epoch": 0.6972928630024611,
380
+ "eval_loss": 0.24199624359607697,
381
+ "eval_runtime": 20.8571,
382
+ "eval_samples_per_second": 47.945,
383
+ "eval_steps_per_second": 0.767,
384
+ "step": 850
385
+ },
386
+ {
387
+ "epoch": 0.7178014766201805,
388
+ "grad_norm": 0.11074954271316528,
389
+ "learning_rate": 0.0008099825264600842,
390
+ "loss": 0.2541,
391
+ "step": 875
392
+ },
393
+ {
394
+ "epoch": 0.7383100902379,
395
+ "grad_norm": 0.16259630024433136,
396
+ "learning_rate": 0.0007957430271391761,
397
+ "loss": 0.2466,
398
+ "step": 900
399
+ },
400
+ {
401
+ "epoch": 0.7383100902379,
402
+ "eval_loss": 0.23889920115470886,
403
+ "eval_runtime": 20.8704,
404
+ "eval_samples_per_second": 47.915,
405
+ "eval_steps_per_second": 0.767,
406
+ "step": 900
407
+ },
408
+ {
409
+ "epoch": 0.7588187038556193,
410
+ "grad_norm": 0.11665979772806168,
411
+ "learning_rate": 0.0007811245841526062,
412
+ "loss": 0.2499,
413
+ "step": 925
414
+ },
415
+ {
416
+ "epoch": 0.7793273174733388,
417
+ "grad_norm": 0.12356989085674286,
418
+ "learning_rate": 0.0007661459285129879,
419
+ "loss": 0.233,
420
+ "step": 950
421
+ },
422
+ {
423
+ "epoch": 0.7793273174733388,
424
+ "eval_loss": 0.23713098466396332,
425
+ "eval_runtime": 20.8545,
426
+ "eval_samples_per_second": 47.951,
427
+ "eval_steps_per_second": 0.767,
428
+ "step": 950
429
+ },
430
+ {
431
+ "epoch": 0.7998359310910582,
432
+ "grad_norm": 0.10465991497039795,
433
+ "learning_rate": 0.0007508262527833029,
434
+ "loss": 0.2465,
435
+ "step": 975
436
+ },
437
+ {
438
+ "epoch": 0.8203445447087777,
439
+ "grad_norm": 0.10666616261005402,
440
+ "learning_rate": 0.000735185186484943,
441
+ "loss": 0.2486,
442
+ "step": 1000
443
+ },
444
+ {
445
+ "epoch": 0.8203445447087777,
446
+ "eval_loss": 0.23608271777629852,
447
+ "eval_runtime": 20.8714,
448
+ "eval_samples_per_second": 47.912,
449
+ "eval_steps_per_second": 0.767,
450
+ "step": 1000
451
+ },
452
+ {
453
+ "epoch": 0.8408531583264971,
454
+ "grad_norm": 0.10759830474853516,
455
+ "learning_rate": 0.0007192427709458656,
456
+ "loss": 0.2363,
457
+ "step": 1025
458
+ },
459
+ {
460
+ "epoch": 0.8613617719442166,
461
+ "grad_norm": 0.11578875035047531,
462
+ "learning_rate": 0.0007030194336210887,
463
+ "loss": 0.2407,
464
+ "step": 1050
465
+ },
466
+ {
467
+ "epoch": 0.8613617719442166,
468
+ "eval_loss": 0.23380084335803986,
469
+ "eval_runtime": 20.8115,
470
+ "eval_samples_per_second": 48.05,
471
+ "eval_steps_per_second": 0.769,
472
+ "step": 1050
473
+ },
474
+ {
475
+ "epoch": 0.881870385561936,
476
+ "grad_norm": 0.11421164870262146,
477
+ "learning_rate": 0.0006865359619184331,
478
+ "loss": 0.2464,
479
+ "step": 1075
480
+ },
481
+ {
482
+ "epoch": 0.9023789991796555,
483
+ "grad_norm": 0.1076551228761673,
484
+ "learning_rate": 0.0006698134765630434,
485
+ "loss": 0.2387,
486
+ "step": 1100
487
+ },
488
+ {
489
+ "epoch": 0.9023789991796555,
490
+ "eval_loss": 0.23312732577323914,
491
+ "eval_runtime": 20.8233,
492
+ "eval_samples_per_second": 48.023,
493
+ "eval_steps_per_second": 0.768,
494
+ "step": 1100
495
+ },
496
+ {
497
+ "epoch": 0.9228876127973749,
498
+ "grad_norm": 0.10048159211874008,
499
+ "learning_rate": 0.0006528734045348248,
500
+ "loss": 0.2361,
501
+ "step": 1125
502
+ },
503
+ {
504
+ "epoch": 0.9433962264150944,
505
+ "grad_norm": 0.1083035096526146,
506
+ "learning_rate": 0.0006357374516134643,
507
+ "loss": 0.2506,
508
+ "step": 1150
509
+ },
510
+ {
511
+ "epoch": 0.9433962264150944,
512
+ "eval_loss": 0.2312103807926178,
513
+ "eval_runtime": 20.7981,
514
+ "eval_samples_per_second": 48.081,
515
+ "eval_steps_per_second": 0.769,
516
+ "step": 1150
517
+ },
518
+ {
519
+ "epoch": 0.9639048400328137,
520
+ "grad_norm": 0.11083986610174179,
521
+ "learning_rate": 0.0006184275745662179,
522
+ "loss": 0.2401,
523
+ "step": 1175
524
+ },
525
+ {
526
+ "epoch": 0.9844134536505332,
527
+ "grad_norm": 0.10034681111574173,
528
+ "learning_rate": 0.0006009659530141031,
529
+ "loss": 0.2428,
530
+ "step": 1200
531
+ },
532
+ {
533
+ "epoch": 0.9844134536505332,
534
+ "eval_loss": 0.22971327602863312,
535
+ "eval_runtime": 20.7973,
536
+ "eval_samples_per_second": 48.083,
537
+ "eval_steps_per_second": 0.769,
538
+ "step": 1200
539
+ },
540
+ {
541
+ "epoch": 1.0049220672682526,
542
+ "grad_norm": 0.10005165636539459,
543
+ "learning_rate": 0.0005833749610125402,
544
+ "loss": 0.2395,
545
+ "step": 1225
546
+ },
547
+ {
548
+ "epoch": 1.0254306808859721,
549
+ "grad_norm": 0.10959237813949585,
550
+ "learning_rate": 0.0005656771383828602,
551
+ "loss": 0.2135,
552
+ "step": 1250
553
+ },
554
+ {
555
+ "epoch": 1.0254306808859721,
556
+ "eval_loss": 0.23102633655071259,
557
+ "eval_runtime": 20.7627,
558
+ "eval_samples_per_second": 48.163,
559
+ "eval_steps_per_second": 0.771,
560
+ "step": 1250
561
+ },
562
+ {
563
+ "epoch": 1.0459392945036916,
564
+ "grad_norm": 0.103731170296669,
565
+ "learning_rate": 0.0005478951618314134,
566
+ "loss": 0.2094,
567
+ "step": 1275
568
+ },
569
+ {
570
+ "epoch": 1.066447908121411,
571
+ "grad_norm": 0.11381426453590393,
572
+ "learning_rate": 0.0005300518158932815,
573
+ "loss": 0.2108,
574
+ "step": 1300
575
+ },
576
+ {
577
+ "epoch": 1.066447908121411,
578
+ "eval_loss": 0.2284233421087265,
579
+ "eval_runtime": 20.7865,
580
+ "eval_samples_per_second": 48.108,
581
+ "eval_steps_per_second": 0.77,
582
+ "step": 1300
583
+ },
584
+ {
585
+ "epoch": 1.0869565217391304,
586
+ "grad_norm": 0.10289537161588669,
587
+ "learning_rate": 0.0005121699637378282,
588
+ "loss": 0.2098,
589
+ "step": 1325
590
+ },
591
+ {
592
+ "epoch": 1.1074651353568499,
593
+ "grad_norm": 0.11907949298620224,
594
+ "learning_rate": 0.0004942725178734903,
595
+ "loss": 0.2152,
596
+ "step": 1350
597
+ },
598
+ {
599
+ "epoch": 1.1074651353568499,
600
+ "eval_loss": 0.22748351097106934,
601
+ "eval_runtime": 20.7803,
602
+ "eval_samples_per_second": 48.123,
603
+ "eval_steps_per_second": 0.77,
604
+ "step": 1350
605
+ },
606
+ {
607
+ "epoch": 1.1279737489745694,
608
+ "grad_norm": 0.11696625500917435,
609
+ "learning_rate": 0.00047638241078935324,
610
+ "loss": 0.2121,
611
+ "step": 1375
612
+ },
613
+ {
614
+ "epoch": 1.1484823625922886,
615
+ "grad_norm": 0.10638347268104553,
616
+ "learning_rate": 0.000458522565571121,
617
+ "loss": 0.2201,
618
+ "step": 1400
619
+ },
620
+ {
621
+ "epoch": 1.1484823625922886,
622
+ "eval_loss": 0.22692248225212097,
623
+ "eval_runtime": 20.7646,
624
+ "eval_samples_per_second": 48.159,
625
+ "eval_steps_per_second": 0.771,
626
+ "step": 1400
627
+ },
628
+ {
629
+ "epoch": 1.1689909762100081,
630
+ "grad_norm": 0.10754355788230896,
631
+ "learning_rate": 0.00044071586652913767,
632
+ "loss": 0.2035,
633
+ "step": 1425
634
+ },
635
+ {
636
+ "epoch": 1.1894995898277276,
637
+ "grad_norm": 0.11690429598093033,
638
+ "learning_rate": 0.0004229851298760915,
639
+ "loss": 0.2135,
640
+ "step": 1450
641
+ },
642
+ {
643
+ "epoch": 1.1894995898277276,
644
+ "eval_loss": 0.2264111191034317,
645
+ "eval_runtime": 20.7449,
646
+ "eval_samples_per_second": 48.205,
647
+ "eval_steps_per_second": 0.771,
648
+ "step": 1450
649
+ },
650
+ {
651
+ "epoch": 1.2100082034454471,
652
+ "grad_norm": 0.105661541223526,
653
+ "learning_rate": 0.0004053530744919749,
654
+ "loss": 0.2125,
655
+ "step": 1475
656
+ },
657
+ {
658
+ "epoch": 1.2305168170631666,
659
+ "grad_norm": 0.11686920374631882,
660
+ "learning_rate": 0.0003878422928137597,
661
+ "loss": 0.2158,
662
+ "step": 1500
663
+ },
664
+ {
665
+ "epoch": 1.2305168170631666,
666
+ "eval_loss": 0.22509609162807465,
667
+ "eval_runtime": 20.7795,
668
+ "eval_samples_per_second": 48.124,
669
+ "eval_steps_per_second": 0.77,
670
+ "step": 1500
671
+ },
672
+ {
673
+ "epoch": 1.251025430680886,
674
+ "grad_norm": 0.11210618168115616,
675
+ "learning_rate": 0.0003704752218870861,
676
+ "loss": 0.2205,
677
+ "step": 1525
678
+ },
679
+ {
680
+ "epoch": 1.2715340442986054,
681
+ "grad_norm": 0.10161525756120682,
682
+ "learning_rate": 0.00035327411461706025,
683
+ "loss": 0.203,
684
+ "step": 1550
685
+ },
686
+ {
687
+ "epoch": 1.2715340442986054,
688
+ "eval_loss": 0.22456735372543335,
689
+ "eval_runtime": 20.7538,
690
+ "eval_samples_per_second": 48.184,
691
+ "eval_steps_per_second": 0.771,
692
+ "step": 1550
693
+ },
694
+ {
695
+ "epoch": 1.2920426579163249,
696
+ "grad_norm": 0.11276591569185257,
697
+ "learning_rate": 0.00033626101125499555,
698
+ "loss": 0.2126,
699
+ "step": 1575
700
+ },
701
+ {
702
+ "epoch": 1.3125512715340442,
703
+ "grad_norm": 0.10445020347833633,
704
+ "learning_rate": 0.0003194577111576333,
705
+ "loss": 0.2081,
706
+ "step": 1600
707
+ },
708
+ {
709
+ "epoch": 1.3125512715340442,
710
+ "eval_loss": 0.22300216555595398,
711
+ "eval_runtime": 20.7911,
712
+ "eval_samples_per_second": 48.098,
713
+ "eval_steps_per_second": 0.77,
714
+ "step": 1600
715
+ },
716
+ {
717
+ "epoch": 1.3330598851517639,
718
+ "grad_norm": 0.09758764505386353,
719
+ "learning_rate": 0.00030288574485502756,
720
+ "loss": 0.2076,
721
+ "step": 1625
722
+ },
723
+ {
724
+ "epoch": 1.3535684987694832,
725
+ "grad_norm": 0.10864421725273132,
726
+ "learning_rate": 0.00028656634646288565,
727
+ "loss": 0.2085,
728
+ "step": 1650
729
+ },
730
+ {
731
+ "epoch": 1.3535684987694832,
732
+ "eval_loss": 0.2210860252380371,
733
+ "eval_runtime": 20.7702,
734
+ "eval_samples_per_second": 48.146,
735
+ "eval_steps_per_second": 0.77,
736
+ "step": 1650
737
+ },
738
+ {
739
+ "epoch": 1.3740771123872026,
740
+ "grad_norm": 0.12502917647361755,
741
+ "learning_rate": 0.00027052042647471254,
742
+ "loss": 0.1977,
743
+ "step": 1675
744
+ },
745
+ {
746
+ "epoch": 1.3945857260049221,
747
+ "grad_norm": 0.10719209909439087,
748
+ "learning_rate": 0.0002547685449686206,
749
+ "loss": 0.206,
750
+ "step": 1700
751
+ },
752
+ {
753
+ "epoch": 1.3945857260049221,
754
+ "eval_loss": 0.21972452104091644,
755
+ "eval_runtime": 20.7618,
756
+ "eval_samples_per_second": 48.165,
757
+ "eval_steps_per_second": 0.771,
758
+ "step": 1700
759
+ },
760
+ {
761
+ "epoch": 1.4150943396226414,
762
+ "grad_norm": 0.13277575373649597,
763
+ "learning_rate": 0.0002393308852631373,
764
+ "loss": 0.2175,
765
+ "step": 1725
766
+ },
767
+ {
768
+ "epoch": 1.435602953240361,
769
+ "grad_norm": 0.10963447391986847,
770
+ "learning_rate": 0.0002242272280557645,
771
+ "loss": 0.2119,
772
+ "step": 1750
773
+ },
774
+ {
775
+ "epoch": 1.435602953240361,
776
+ "eval_loss": 0.21942101418972015,
777
+ "eval_runtime": 20.8,
778
+ "eval_samples_per_second": 48.077,
779
+ "eval_steps_per_second": 0.769,
780
+ "step": 1750
781
+ },
782
+ {
783
+ "epoch": 1.4561115668580804,
784
+ "grad_norm": 0.1101914569735527,
785
+ "learning_rate": 0.0002094769260774262,
786
+ "loss": 0.2077,
787
+ "step": 1775
788
+ },
789
+ {
790
+ "epoch": 1.4766201804758,
791
+ "grad_norm": 0.1031966507434845,
792
+ "learning_rate": 0.00019509887929528458,
793
+ "loss": 0.2116,
794
+ "step": 1800
795
+ },
796
+ {
797
+ "epoch": 1.4766201804758,
798
+ "eval_loss": 0.21845056116580963,
799
+ "eval_runtime": 20.795,
800
+ "eval_samples_per_second": 48.089,
801
+ "eval_steps_per_second": 0.769,
802
+ "step": 1800
803
+ },
804
+ {
805
+ "epoch": 1.4971287940935194,
806
+ "grad_norm": 0.13558058440685272,
807
+ "learning_rate": 0.0001811115106956918,
808
+ "loss": 0.2028,
809
+ "step": 1825
810
+ },
811
+ {
812
+ "epoch": 1.5176374077112387,
813
+ "grad_norm": 0.11538273841142654,
814
+ "learning_rate": 0.00016753274267831115,
815
+ "loss": 0.2119,
816
+ "step": 1850
817
+ },
818
+ {
819
+ "epoch": 1.5176374077112387,
820
+ "eval_loss": 0.21733731031417847,
821
+ "eval_runtime": 20.7805,
822
+ "eval_samples_per_second": 48.122,
823
+ "eval_steps_per_second": 0.77,
824
+ "step": 1850
825
+ },
826
+ {
827
+ "epoch": 1.5381460213289582,
828
+ "grad_norm": 0.13104985654354095,
829
+ "learning_rate": 0.00015437997409165478,
830
+ "loss": 0.2022,
831
+ "step": 1875
832
+ },
833
+ {
834
+ "epoch": 1.5586546349466777,
835
+ "grad_norm": 0.11488507688045502,
836
+ "learning_rate": 0.00014167005793946035,
837
+ "loss": 0.2015,
838
+ "step": 1900
839
+ },
840
+ {
841
+ "epoch": 1.5586546349466777,
842
+ "eval_loss": 0.21705850958824158,
843
+ "eval_runtime": 20.7621,
844
+ "eval_samples_per_second": 48.165,
845
+ "eval_steps_per_second": 0.771,
846
+ "step": 1900
847
+ },
848
+ {
849
+ "epoch": 1.579163248564397,
850
+ "grad_norm": 0.11999308317899704,
851
+ "learning_rate": 0.00012941927978647527,
852
+ "loss": 0.2038,
853
+ "step": 1925
854
+ },
855
+ {
856
+ "epoch": 1.5996718621821167,
857
+ "grad_norm": 0.1013152152299881,
858
+ "learning_rate": 0.00011764333689131385,
859
+ "loss": 0.2095,
860
+ "step": 1950
861
+ },
862
+ {
863
+ "epoch": 1.5996718621821167,
864
+ "eval_loss": 0.2159809172153473,
865
+ "eval_runtime": 20.8631,
866
+ "eval_samples_per_second": 47.932,
867
+ "eval_steps_per_second": 0.767,
868
+ "step": 1950
869
+ },
870
+ {
871
+ "epoch": 1.620180475799836,
872
+ "grad_norm": 0.10026060789823532,
873
+ "learning_rate": 0.00010635731809312993,
874
+ "loss": 0.2058,
875
+ "step": 1975
876
+ },
877
+ {
878
+ "epoch": 1.6406890894175554,
879
+ "grad_norm": 0.11767016351222992,
880
+ "learning_rate": 9.557568447787201e-05,
881
+ "loss": 0.2056,
882
+ "step": 2000
883
+ },
884
+ {
885
+ "epoch": 1.6406890894175554,
886
+ "eval_loss": 0.2152351588010788,
887
+ "eval_runtime": 20.9101,
888
+ "eval_samples_per_second": 47.824,
889
+ "eval_steps_per_second": 0.765,
890
+ "step": 2000
891
+ },
892
+ {
893
+ "epoch": 1.661197703035275,
894
+ "grad_norm": 0.10698919743299484,
895
+ "learning_rate": 8.531225084889654e-05,
896
+ "loss": 0.1993,
897
+ "step": 2025
898
+ },
899
+ {
900
+ "epoch": 1.6817063166529942,
901
+ "grad_norm": 0.13556532561779022,
902
+ "learning_rate": 7.558016802568091e-05,
903
+ "loss": 0.2021,
904
+ "step": 2050
905
+ },
906
+ {
907
+ "epoch": 1.6817063166529942,
908
+ "eval_loss": 0.21463391184806824,
909
+ "eval_runtime": 20.821,
910
+ "eval_samples_per_second": 48.029,
911
+ "eval_steps_per_second": 0.768,
912
+ "step": 2050
913
+ },
914
+ {
915
+ "epoch": 1.7022149302707137,
916
+ "grad_norm": 0.1280520260334015,
917
+ "learning_rate": 6.639190599331746e-05,
918
+ "loss": 0.2029,
919
+ "step": 2075
920
+ },
921
+ {
922
+ "epoch": 1.7227235438884332,
923
+ "grad_norm": 0.11126288026571274,
924
+ "learning_rate": 5.775923792437865e-05,
925
+ "loss": 0.1991,
926
+ "step": 2100
927
+ },
928
+ {
929
+ "epoch": 1.7227235438884332,
930
+ "eval_loss": 0.21391206979751587,
931
+ "eval_runtime": 20.7954,
932
+ "eval_samples_per_second": 48.088,
933
+ "eval_steps_per_second": 0.769,
934
+ "step": 2100
935
+ },
936
+ {
937
+ "epoch": 1.7432321575061525,
938
+ "grad_norm": 0.11311797052621841,
939
+ "learning_rate": 4.9693225093627616e-05,
940
+ "loss": 0.1992,
941
+ "step": 2125
942
+ },
943
+ {
944
+ "epoch": 1.7637407711238722,
945
+ "grad_norm": 0.11362408846616745,
946
+ "learning_rate": 4.220420270490294e-05,
947
+ "loss": 0.1932,
948
+ "step": 2150
949
+ },
950
+ {
951
+ "epoch": 1.7637407711238722,
952
+ "eval_loss": 0.21360714733600616,
953
+ "eval_runtime": 20.7865,
954
+ "eval_samples_per_second": 48.108,
955
+ "eval_steps_per_second": 0.77,
956
+ "step": 2150
957
+ },
958
+ {
959
+ "epoch": 1.7842493847415914,
960
+ "grad_norm": 0.1285392940044403,
961
+ "learning_rate": 3.530176664833834e-05,
962
+ "loss": 0.196,
963
+ "step": 2175
964
+ },
965
+ {
966
+ "epoch": 1.804757998359311,
967
+ "grad_norm": 0.12596482038497925,
968
+ "learning_rate": 2.8994761204884756e-05,
969
+ "loss": 0.2048,
970
+ "step": 2200
971
+ },
972
+ {
973
+ "epoch": 1.804757998359311,
974
+ "eval_loss": 0.21343722939491272,
975
+ "eval_runtime": 20.8106,
976
+ "eval_samples_per_second": 48.052,
977
+ "eval_steps_per_second": 0.769,
978
+ "step": 2200
979
+ },
980
+ {
981
+ "epoch": 1.8252666119770304,
982
+ "grad_norm": 0.12076670676469803,
983
+ "learning_rate": 2.329126771388995e-05,
984
+ "loss": 0.1957,
985
+ "step": 2225
986
+ },
987
+ {
988
+ "epoch": 1.8457752255947497,
989
+ "grad_norm": 0.10855985432863235,
990
+ "learning_rate": 1.8198594218256815e-05,
991
+ "loss": 0.1971,
992
+ "step": 2250
993
+ },
994
+ {
995
+ "epoch": 1.8457752255947497,
996
+ "eval_loss": 0.21339941024780273,
997
+ "eval_runtime": 20.7802,
998
+ "eval_samples_per_second": 48.123,
999
+ "eval_steps_per_second": 0.77,
1000
+ "step": 2250
1001
+ },
1002
+ {
1003
+ "epoch": 1.8662838392124692,
1004
+ "grad_norm": 0.12243843078613281,
1005
+ "learning_rate": 1.3723266100447052e-05,
1006
+ "loss": 0.2006,
1007
+ "step": 2275
1008
+ },
1009
+ {
1010
+ "epoch": 1.8867924528301887,
1011
+ "grad_norm": 0.14360643923282623,
1012
+ "learning_rate": 9.871017721329201e-06,
1013
+ "loss": 0.2083,
1014
+ "step": 2300
1015
+ },
1016
+ {
1017
+ "epoch": 1.8867924528301887,
1018
+ "eval_loss": 0.2131664752960205,
1019
+ "eval_runtime": 20.7557,
1020
+ "eval_samples_per_second": 48.18,
1021
+ "eval_steps_per_second": 0.771,
1022
+ "step": 2300
1023
+ },
1024
+ {
1025
+ "epoch": 1.907301066447908,
1026
+ "grad_norm": 0.11693233996629715,
1027
+ "learning_rate": 6.646785072584871e-06,
1028
+ "loss": 0.2031,
1029
+ "step": 2325
1030
+ },
1031
+ {
1032
+ "epoch": 1.9278096800656277,
1033
+ "grad_norm": 0.10460177809000015,
1034
+ "learning_rate": 4.054699452086641e-06,
1035
+ "loss": 0.1966,
1036
+ "step": 2350
1037
+ },
1038
+ {
1039
+ "epoch": 1.9278096800656277,
1040
+ "eval_loss": 0.21305271983146667,
1041
+ "eval_runtime": 20.8057,
1042
+ "eval_samples_per_second": 48.064,
1043
+ "eval_steps_per_second": 0.769,
1044
+ "step": 2350
1045
+ },
1046
+ {
1047
+ "epoch": 1.948318293683347,
1048
+ "grad_norm": 0.1053781509399414,
1049
+ "learning_rate": 2.0980821703527886e-06,
1050
+ "loss": 0.211,
1051
+ "step": 2375
1052
+ },
1053
+ {
1054
+ "epoch": 1.9688269073010665,
1055
+ "grad_norm": 0.1103985607624054,
1056
+ "learning_rate": 7.794402948607671e-07,
1057
+ "loss": 0.2071,
1058
+ "step": 2400
1059
+ },
1060
+ {
1061
+ "epoch": 1.9688269073010665,
1062
+ "eval_loss": 0.21304111182689667,
1063
+ "eval_runtime": 20.8063,
1064
+ "eval_samples_per_second": 48.062,
1065
+ "eval_steps_per_second": 0.769,
1066
+ "step": 2400
1067
+ },
1068
+ {
1069
+ "epoch": 1.989335520918786,
1070
+ "grad_norm": 0.13697576522827148,
1071
+ "learning_rate": 1.0046343767294853e-07,
1072
+ "loss": 0.1965,
1073
+ "step": 2425
1074
+ },
1075
+ {
1076
+ "epoch": 2.0,
1077
+ "step": 2438,
1078
+ "total_flos": 1.58523627405312e+18,
1079
+ "train_loss": 0.24706831303065288,
1080
+ "train_runtime": 3792.5555,
1081
+ "train_samples_per_second": 20.567,
1082
+ "train_steps_per_second": 0.643
1083
+ }
1084
+ ],
1085
+ "logging_steps": 25,
1086
+ "max_steps": 2438,
1087
+ "num_input_tokens_seen": 0,
1088
+ "num_train_epochs": 2,
1089
+ "save_steps": 500,
1090
+ "stateful_callbacks": {
1091
+ "TrainerControl": {
1092
+ "args": {
1093
+ "should_epoch_stop": false,
1094
+ "should_evaluate": false,
1095
+ "should_log": false,
1096
+ "should_save": true,
1097
+ "should_training_stop": true
1098
+ },
1099
+ "attributes": {}
1100
+ }
1101
+ },
1102
+ "total_flos": 1.58523627405312e+18,
1103
+ "train_batch_size": 32,
1104
+ "trial_name": null,
1105
+ "trial_params": null
1106
+ }
nl_tasks/run_all/run_exnr11/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": false,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 2,
+ "peft_type": "ROTATION",
+ "r": 8,
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/run_all/run_exnr11/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
+ {
+ "[PAD]": 32000
+ }
nl_tasks/run_all/run_exnr11/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
+ {
+ "bos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "unk_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
nl_tasks/run_all/run_exnr11/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
nl_tasks/run_all/run_exnr11/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
nl_tasks/run_all/run_exnr11/ft/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "32000": {
+ "content": "[PAD]",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "</s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 512,
+ "pad_token": "[PAD]",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "</s>",
+ "use_default_system_prompt": false
+ }
nl_tasks/run_all/run_exnr11/ft/training_args.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3c3569eea58fa21482a4bc85e514a16f90ed59354c7c0b80262bb053b7c12c9c
+ size 6481
nl_tasks/run_all/run_exnr11/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": true,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 2,
+ "peft_type": "ROTATION",
+ "r": 8,
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/run_all/run_exnr11/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:a354ea299786492a94c791601240dfb633666078b15a8ebdf0181221f18adb01
+ size 33602659
nl_tasks/run_all/run_exnr11/trainer_state.json ADDED
@@ -0,0 +1,1106 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 50,
7
+ "global_step": 2438,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.020508613617719443,
14
+ "grad_norm": 0.22536601126194,
15
+ "learning_rate": 9.836065573770491e-05,
16
+ "loss": 0.6776,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.04101722723543889,
21
+ "grad_norm": 0.18631383776664734,
22
+ "learning_rate": 0.00020081967213114754,
23
+ "loss": 0.4154,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.04101722723543889,
28
+ "eval_loss": 0.3722397983074188,
29
+ "eval_runtime": 19.8213,
30
+ "eval_samples_per_second": 50.451,
31
+ "eval_steps_per_second": 0.807,
32
+ "step": 50
33
+ },
34
+ {
35
+ "epoch": 0.06152584085315833,
36
+ "grad_norm": 0.17987015843391418,
37
+ "learning_rate": 0.0003032786885245902,
38
+ "loss": 0.3704,
39
+ "step": 75
40
+ },
41
+ {
42
+ "epoch": 0.08203445447087777,
43
+ "grad_norm": 0.19490021467208862,
44
+ "learning_rate": 0.0004057377049180328,
45
+ "loss": 0.3506,
46
+ "step": 100
47
+ },
48
+ {
49
+ "epoch": 0.08203445447087777,
50
+ "eval_loss": 0.3291980028152466,
51
+ "eval_runtime": 19.4889,
52
+ "eval_samples_per_second": 51.311,
53
+ "eval_steps_per_second": 0.821,
54
+ "step": 100
55
+ },
56
+ {
57
+ "epoch": 0.10254306808859721,
58
+ "grad_norm": 1.9897618293762207,
59
+ "learning_rate": 0.0005081967213114754,
60
+ "loss": 0.3309,
61
+ "step": 125
62
+ },
63
+ {
64
+ "epoch": 0.12305168170631665,
65
+ "grad_norm": 0.17177057266235352,
66
+ "learning_rate": 0.000610655737704918,
67
+ "loss": 0.3153,
68
+ "step": 150
69
+ },
70
+ {
71
+ "epoch": 0.12305168170631665,
72
+ "eval_loss": 0.31367334723472595,
73
+ "eval_runtime": 19.4853,
74
+ "eval_samples_per_second": 51.321,
75
+ "eval_steps_per_second": 0.821,
76
+ "step": 150
77
+ },
78
+ {
79
+ "epoch": 0.1435602953240361,
80
+ "grad_norm": 0.22781579196453094,
81
+ "learning_rate": 0.0007131147540983607,
82
+ "loss": 0.314,
83
+ "step": 175
84
+ },
85
+ {
86
+ "epoch": 0.16406890894175555,
87
+ "grad_norm": 0.3163987994194031,
88
+ "learning_rate": 0.0008155737704918033,
89
+ "loss": 0.3208,
90
+ "step": 200
91
+ },
92
+ {
93
+ "epoch": 0.16406890894175555,
94
+ "eval_loss": 0.31177034974098206,
95
+ "eval_runtime": 19.4994,
96
+ "eval_samples_per_second": 51.284,
97
+ "eval_steps_per_second": 0.821,
98
+ "step": 200
99
+ },
100
+ {
101
+ "epoch": 0.184577522559475,
102
+ "grad_norm": 0.32001861929893494,
103
+ "learning_rate": 0.0009180327868852459,
104
+ "loss": 0.3285,
105
+ "step": 225
106
+ },
107
+ {
108
+ "epoch": 0.20508613617719443,
109
+ "grad_norm": 0.2669612169265747,
110
+ "learning_rate": 0.0009999871854116063,
111
+ "loss": 0.3195,
112
+ "step": 250
113
+ },
114
+ {
115
+ "epoch": 0.20508613617719443,
116
+ "eval_loss": 0.30614417791366577,
117
+ "eval_runtime": 19.4864,
118
+ "eval_samples_per_second": 51.318,
119
+ "eval_steps_per_second": 0.821,
120
+ "step": 250
121
+ },
122
+ {
123
+ "epoch": 0.22559474979491387,
124
+ "grad_norm": 0.35766294598579407,
125
+ "learning_rate": 0.0009995387437838027,
126
+ "loss": 0.3164,
127
+ "step": 275
128
+ },
129
+ {
130
+ "epoch": 0.2461033634126333,
131
+ "grad_norm": 0.2564176619052887,
132
+ "learning_rate": 0.000998450229439693,
133
+ "loss": 0.3137,
134
+ "step": 300
135
+ },
136
+ {
137
+ "epoch": 0.2461033634126333,
138
+ "eval_loss": 0.3005053699016571,
139
+ "eval_runtime": 19.4856,
140
+ "eval_samples_per_second": 51.32,
141
+ "eval_steps_per_second": 0.821,
142
+ "step": 300
143
+ },
144
+ {
145
+ "epoch": 0.2666119770303528,
146
+ "grad_norm": 0.21307945251464844,
147
+ "learning_rate": 0.000996723037122612,
148
+ "loss": 0.2997,
149
+ "step": 325
150
+ },
151
+ {
152
+ "epoch": 0.2871205906480722,
153
+ "grad_norm": 0.24557501077651978,
154
+ "learning_rate": 0.0009943593799315263,
155
+ "loss": 0.3067,
156
+ "step": 350
157
+ },
158
+ {
159
+ "epoch": 0.2871205906480722,
160
+ "eval_loss": 0.29183846712112427,
161
+ "eval_runtime": 19.4754,
162
+ "eval_samples_per_second": 51.347,
163
+ "eval_steps_per_second": 0.822,
164
+ "step": 350
165
+ },
166
+ {
167
+ "epoch": 0.30762920426579166,
168
+ "grad_norm": 0.22399385273456573,
169
+ "learning_rate": 0.0009913622864853324,
170
+ "loss": 0.2946,
171
+ "step": 375
172
+ },
173
+ {
174
+ "epoch": 0.3281378178835111,
175
+ "grad_norm": 0.28306031227111816,
176
+ "learning_rate": 0.0009877355970422024,
177
+ "loss": 0.3047,
178
+ "step": 400
179
+ },
180
+ {
181
+ "epoch": 0.3281378178835111,
182
+ "eval_loss": 0.2847941815853119,
183
+ "eval_runtime": 19.4709,
184
+ "eval_samples_per_second": 51.359,
185
+ "eval_steps_per_second": 0.822,
186
+ "step": 400
187
+ },
188
+ {
189
+ "epoch": 0.34864643150123054,
190
+ "grad_norm": 0.18770238757133484,
191
+ "learning_rate": 0.0009834839585789559,
192
+ "loss": 0.2983,
193
+ "step": 425
194
+ },
195
+ {
196
+ "epoch": 0.36915504511895,
197
+ "grad_norm": 0.2074132114648819,
198
+ "learning_rate": 0.000978612818836762,
199
+ "loss": 0.2957,
200
+ "step": 450
201
+ },
202
+ {
203
+ "epoch": 0.36915504511895,
204
+ "eval_loss": 0.2784390449523926,
205
+ "eval_runtime": 19.4671,
206
+ "eval_samples_per_second": 51.369,
207
+ "eval_steps_per_second": 0.822,
208
+ "step": 450
209
+ },
210
+ {
211
+ "epoch": 0.3896636587366694,
212
+ "grad_norm": 0.1998339742422104,
213
+ "learning_rate": 0.0009731284193407981,
214
+ "loss": 0.2922,
215
+ "step": 475
216
+ },
217
+ {
218
+ "epoch": 0.41017227235438886,
219
+ "grad_norm": 0.19609296321868896,
220
+ "learning_rate": 0.0009670377874028117,
221
+ "loss": 0.2838,
222
+ "step": 500
223
+ },
224
+ {
225
+ "epoch": 0.41017227235438886,
226
+ "eval_loss": 0.2710179090499878,
227
+ "eval_runtime": 19.4596,
228
+ "eval_samples_per_second": 51.389,
229
+ "eval_steps_per_second": 0.822,
230
+ "step": 500
231
+ },
232
+ {
233
+ "epoch": 0.4306808859721083,
234
+ "grad_norm": 0.22822904586791992,
235
+ "learning_rate": 0.0009603487271168336,
236
+ "loss": 0.2859,
237
+ "step": 525
238
+ },
239
+ {
240
+ "epoch": 0.45118949958982774,
241
+ "grad_norm": 0.21479056775569916,
242
+ "learning_rate": 0.0009530698093595781,
243
+ "loss": 0.2805,
244
+ "step": 550
245
+ },
246
+ {
247
+ "epoch": 0.45118949958982774,
248
+ "eval_loss": 0.2640228867530823,
249
+ "eval_runtime": 19.4506,
250
+ "eval_samples_per_second": 51.412,
251
+ "eval_steps_per_second": 0.823,
252
+ "step": 550
253
+ },
254
+ {
255
+ "epoch": 0.4716981132075472,
256
+ "grad_norm": 0.21766699850559235,
257
+ "learning_rate": 0.0009452103608083418,
258
+ "loss": 0.2659,
259
+ "step": 575
260
+ },
261
+ {
262
+ "epoch": 0.4922067268252666,
263
+ "grad_norm": 0.21340589225292206,
264
+ "learning_rate": 0.0009367804519904775,
265
+ "loss": 0.2761,
266
+ "step": 600
267
+ },
268
+ {
269
+ "epoch": 0.4922067268252666,
270
+ "eval_loss": 0.26190468668937683,
271
+ "eval_runtime": 19.4654,
272
+ "eval_samples_per_second": 51.373,
273
+ "eval_steps_per_second": 0.822,
274
+ "step": 600
275
+ },
276
+ {
277
+ "epoch": 0.5127153404429861,
278
+ "grad_norm": 0.1762474924325943,
279
+ "learning_rate": 0.0009277908843797492,
280
+ "loss": 0.2622,
281
+ "step": 625
282
+ },
283
+ {
284
+ "epoch": 0.5332239540607056,
285
+ "grad_norm": 0.19957980513572693,
286
+ "learning_rate": 0.0009182531765561084,
287
+ "loss": 0.2618,
288
+ "step": 650
289
+ },
290
+ {
291
+ "epoch": 0.5332239540607056,
292
+ "eval_loss": 0.2553676664829254,
293
+ "eval_runtime": 19.4847,
294
+ "eval_samples_per_second": 51.322,
295
+ "eval_steps_per_second": 0.821,
296
+ "step": 650
297
+ },
298
+ {
299
+ "epoch": 0.5537325676784249,
300
+ "grad_norm": 0.1787998378276825,
301
+ "learning_rate": 0.0009081795494466201,
302
+ "loss": 0.261,
303
+ "step": 675
304
+ },
305
+ {
306
+ "epoch": 0.5742411812961444,
307
+ "grad_norm": 0.1660076081752777,
308
+ "learning_rate": 0.0008975829106664539,
309
+ "loss": 0.2534,
310
+ "step": 700
311
+ },
312
+ {
313
+ "epoch": 0.5742411812961444,
314
+ "eval_loss": 0.2539336383342743,
315
+ "eval_runtime": 19.4661,
316
+ "eval_samples_per_second": 51.371,
317
+ "eval_steps_per_second": 0.822,
318
+ "step": 700
319
+ },
320
+ {
321
+ "epoch": 0.5947497949138638,
322
+ "grad_norm": 0.1471703201532364,
323
+ "learning_rate": 0.0008864768379800017,
324
+ "loss": 0.2508,
325
+ "step": 725
326
+ },
327
+ {
328
+ "epoch": 0.6152584085315833,
329
+ "grad_norm": 0.16628079116344452,
330
+ "learning_rate": 0.0008748755619033153,
331
+ "loss": 0.2608,
332
+ "step": 750
333
+ },
334
+ {
335
+ "epoch": 0.6152584085315833,
336
+ "eval_loss": 0.25101426243782043,
337
+ "eval_runtime": 19.4998,
338
+ "eval_samples_per_second": 51.283,
339
+ "eval_steps_per_second": 0.821,
340
+ "step": 750
341
+ },
342
+ {
343
+ "epoch": 0.6357670221493027,
344
+ "grad_norm": 0.180254727602005,
345
+ "learning_rate": 0.000862793947470155,
346
+ "loss": 0.2647,
347
+ "step": 775
348
+ },
349
+ {
350
+ "epoch": 0.6562756357670222,
351
+ "grad_norm": 0.16186201572418213,
352
+ "learning_rate": 0.0008502474751850142,
353
+ "loss": 0.2538,
354
+ "step": 800
355
+ },
356
+ {
357
+ "epoch": 0.6562756357670222,
358
+ "eval_loss": 0.24444915354251862,
359
+ "eval_runtime": 19.4781,
360
+ "eval_samples_per_second": 51.34,
361
+ "eval_steps_per_second": 0.821,
362
+ "step": 800
363
+ },
364
+ {
365
+ "epoch": 0.6767842493847416,
366
+ "grad_norm": 0.1586069017648697,
367
+ "learning_rate": 0.0008372522211875224,
368
+ "loss": 0.248,
369
+ "step": 825
370
+ },
371
+ {
372
+ "epoch": 0.6972928630024611,
373
+ "grad_norm": 0.1476527601480484,
374
+ "learning_rate": 0.0008238248366536473,
375
+ "loss": 0.2611,
376
+ "step": 850
377
+ },
378
+ {
379
+ "epoch": 0.6972928630024611,
380
+ "eval_loss": 0.24326753616333008,
381
+ "eval_runtime": 19.4366,
382
+ "eval_samples_per_second": 51.449,
383
+ "eval_steps_per_second": 0.823,
384
+ "step": 850
385
+ },
386
+ {
387
+ "epoch": 0.7178014766201805,
388
+ "grad_norm": 0.11883819103240967,
389
+ "learning_rate": 0.0008099825264600842,
390
+ "loss": 0.255,
391
+ "step": 875
392
+ },
393
+ {
394
+ "epoch": 0.7383100902379,
395
+ "grad_norm": 0.1317686289548874,
396
+ "learning_rate": 0.0007957430271391761,
397
+ "loss": 0.2468,
398
+ "step": 900
399
+ },
400
+ {
401
+ "epoch": 0.7383100902379,
402
+ "eval_loss": 0.2392330914735794,
403
+ "eval_runtime": 19.493,
404
+ "eval_samples_per_second": 51.301,
405
+ "eval_steps_per_second": 0.821,
406
+ "step": 900
407
+ },
408
+ {
409
+ "epoch": 0.7588187038556193,
410
+ "grad_norm": 0.1467055082321167,
411
+ "learning_rate": 0.0007811245841526062,
412
+ "loss": 0.25,
413
+ "step": 925
414
+ },
415
+ {
416
+ "epoch": 0.7793273174733388,
417
+ "grad_norm": 0.15291206538677216,
418
+ "learning_rate": 0.0007661459285129879,
419
+ "loss": 0.2334,
420
+ "step": 950
421
+ },
422
+ {
423
+ "epoch": 0.7793273174733388,
424
+ "eval_loss": 0.2371567040681839,
425
+ "eval_runtime": 19.4844,
426
+ "eval_samples_per_second": 51.323,
427
+ "eval_steps_per_second": 0.821,
428
+ "step": 950
429
+ },
430
+ {
431
+ "epoch": 0.7998359310910582,
432
+ "grad_norm": 0.1438579112291336,
433
+ "learning_rate": 0.0007508262527833029,
434
+ "loss": 0.2458,
435
+ "step": 975
436
+ },
437
+ {
438
+ "epoch": 0.8203445447087777,
439
+ "grad_norm": 0.1307683140039444,
440
+ "learning_rate": 0.000735185186484943,
441
+ "loss": 0.2478,
442
+ "step": 1000
443
+ },
444
+ {
445
+ "epoch": 0.8203445447087777,
446
+ "eval_loss": 0.23642279207706451,
447
+ "eval_runtime": 19.474,
448
+ "eval_samples_per_second": 51.35,
449
+ "eval_steps_per_second": 0.822,
450
+ "step": 1000
451
+ },
452
+ {
453
+ "epoch": 0.8408531583264971,
454
+ "grad_norm": 0.12361195683479309,
455
+ "learning_rate": 0.0007192427709458656,
456
+ "loss": 0.2358,
457
+ "step": 1025
458
+ },
459
+ {
460
+ "epoch": 0.8613617719442166,
461
+ "grad_norm": 0.12630796432495117,
462
+ "learning_rate": 0.0007030194336210887,
463
+ "loss": 0.2409,
464
+ "step": 1050
465
+ },
466
+ {
467
+ "epoch": 0.8613617719442166,
468
+ "eval_loss": 0.23247265815734863,
469
+ "eval_runtime": 19.4414,
470
+ "eval_samples_per_second": 51.437,
471
+ "eval_steps_per_second": 0.823,
472
+ "step": 1050
473
+ },
474
+ {
475
+ "epoch": 0.881870385561936,
476
+ "grad_norm": 0.13411730527877808,
477
+ "learning_rate": 0.0006865359619184331,
478
+ "loss": 0.2447,
479
+ "step": 1075
480
+ },
481
+ {
482
+ "epoch": 0.9023789991796555,
483
+ "grad_norm": 0.12585557997226715,
484
+ "learning_rate": 0.0006698134765630434,
485
+ "loss": 0.2379,
486
+ "step": 1100
487
+ },
488
+ {
489
+ "epoch": 0.9023789991796555,
490
+ "eval_loss": 0.2312391996383667,
491
+ "eval_runtime": 19.4906,
492
+ "eval_samples_per_second": 51.307,
493
+ "eval_steps_per_second": 0.821,
494
+ "step": 1100
495
+ },
496
+ {
497
+ "epoch": 0.9228876127973749,
498
+ "grad_norm": 0.10996991395950317,
499
+ "learning_rate": 0.0006528734045348248,
500
+ "loss": 0.2347,
501
+ "step": 1125
502
+ },
503
+ {
504
+ "epoch": 0.9433962264150944,
505
+ "grad_norm": 0.1385001838207245,
506
+ "learning_rate": 0.0006357374516134643,
507
+ "loss": 0.2489,
508
+ "step": 1150
509
+ },
510
+ {
511
+ "epoch": 0.9433962264150944,
512
+ "eval_loss": 0.2296425998210907,
513
+ "eval_runtime": 19.4676,
514
+ "eval_samples_per_second": 51.367,
515
+ "eval_steps_per_second": 0.822,
516
+ "step": 1150
517
+ },
518
+ {
519
+ "epoch": 0.9639048400328137,
520
+ "grad_norm": 0.1425400674343109,
521
+ "learning_rate": 0.0006184275745662179,
522
+ "loss": 0.2376,
523
+ "step": 1175
524
+ },
525
+ {
526
+ "epoch": 0.9844134536505332,
527
+ "grad_norm": 0.12151045352220535,
528
+ "learning_rate": 0.0006009659530141031,
529
+ "loss": 0.2408,
530
+ "step": 1200
531
+ },
532
+ {
533
+ "epoch": 0.9844134536505332,
534
+ "eval_loss": 0.22871758043766022,
535
+ "eval_runtime": 19.4794,
536
+ "eval_samples_per_second": 51.336,
537
+ "eval_steps_per_second": 0.821,
538
+ "step": 1200
539
+ },
540
+ {
541
+ "epoch": 1.0049220672682526,
542
+ "grad_norm": 0.1182444617152214,
543
+ "learning_rate": 0.0005833749610125402,
544
+ "loss": 0.237,
545
+ "step": 1225
546
+ },
547
+ {
548
+ "epoch": 1.0254306808859721,
549
+ "grad_norm": 0.11846429109573364,
550
+ "learning_rate": 0.0005656771383828602,
551
+ "loss": 0.2092,
552
+ "step": 1250
553
+ },
554
+ {
555
+ "epoch": 1.0254306808859721,
556
+ "eval_loss": 0.22994151711463928,
557
+ "eval_runtime": 19.4266,
558
+ "eval_samples_per_second": 51.476,
559
+ "eval_steps_per_second": 0.824,
560
+ "step": 1250
561
+ },
562
+ {
563
+ "epoch": 1.0459392945036916,
564
+ "grad_norm": 0.12247344851493835,
565
+ "learning_rate": 0.0005478951618314134,
566
+ "loss": 0.2043,
567
+ "step": 1275
568
+ },
569
+ {
570
+ "epoch": 1.066447908121411,
571
+ "grad_norm": 0.11501295119524002,
572
+ "learning_rate": 0.0005300518158932815,
573
+ "loss": 0.2051,
574
+ "step": 1300
575
+ },
576
+ {
577
+ "epoch": 1.066447908121411,
578
+ "eval_loss": 0.22652946412563324,
579
+ "eval_runtime": 19.4689,
580
+ "eval_samples_per_second": 51.364,
581
+ "eval_steps_per_second": 0.822,
582
+ "step": 1300
583
+ },
584
+ {
585
+ "epoch": 1.0869565217391304,
586
+ "grad_norm": 0.12041299045085907,
587
+ "learning_rate": 0.0005121699637378282,
588
+ "loss": 0.2046,
589
+ "step": 1325
590
+ },
591
+ {
592
+ "epoch": 1.1074651353568499,
593
+ "grad_norm": 0.13483497500419617,
594
+ "learning_rate": 0.0004942725178734903,
595
+ "loss": 0.2088,
596
+ "step": 1350
597
+ },
598
+ {
599
+ "epoch": 1.1074651353568499,
600
+ "eval_loss": 0.22511404752731323,
601
+ "eval_runtime": 19.4668,
602
+ "eval_samples_per_second": 51.37,
603
+ "eval_steps_per_second": 0.822,
604
+ "step": 1350
605
+ },
606
+ {
607
+ "epoch": 1.1279737489745694,
608
+ "grad_norm": 0.12721161544322968,
609
+ "learning_rate": 0.00047638241078935324,
610
+ "loss": 0.2069,
611
+ "step": 1375
612
+ },
613
+ {
614
+ "epoch": 1.1484823625922886,
615
+ "grad_norm": 0.1224995031952858,
616
+ "learning_rate": 0.000458522565571121,
617
+ "loss": 0.2145,
618
+ "step": 1400
619
+ },
620
+ {
621
+ "epoch": 1.1484823625922886,
622
+ "eval_loss": 0.2244964987039566,
623
+ "eval_runtime": 19.4669,
624
+ "eval_samples_per_second": 51.369,
625
+ "eval_steps_per_second": 0.822,
626
+ "step": 1400
627
+ },
628
+ {
629
+ "epoch": 1.1689909762100081,
630
+ "grad_norm": 0.11426686495542526,
631
+ "learning_rate": 0.00044071586652913767,
632
+ "loss": 0.1983,
633
+ "step": 1425
634
+ },
635
+ {
636
+ "epoch": 1.1894995898277276,
637
+ "grad_norm": 0.13497664034366608,
638
+ "learning_rate": 0.0004229851298760915,
639
+ "loss": 0.2079,
640
+ "step": 1450
641
+ },
642
+ {
643
+ "epoch": 1.1894995898277276,
644
+ "eval_loss": 0.22429363429546356,
645
+ "eval_runtime": 19.4906,
646
+ "eval_samples_per_second": 51.307,
647
+ "eval_steps_per_second": 0.821,
648
+ "step": 1450
649
+ },
650
+ {
651
+ "epoch": 1.2100082034454471,
652
+ "grad_norm": 0.11498471349477768,
653
+ "learning_rate": 0.0004053530744919749,
654
+ "loss": 0.2069,
655
+ "step": 1475
656
+ },
657
+ {
658
+ "epoch": 1.2305168170631666,
659
+ "grad_norm": 0.13178111612796783,
660
+ "learning_rate": 0.0003878422928137597,
661
+ "loss": 0.2103,
662
+ "step": 1500
663
+ },
664
+ {
665
+ "epoch": 1.2305168170631666,
666
+ "eval_loss": 0.22244225442409515,
667
+ "eval_runtime": 19.4709,
668
+ "eval_samples_per_second": 51.359,
669
+ "eval_steps_per_second": 0.822,
670
+ "step": 1500
671
+ },
672
+ {
673
+ "epoch": 1.251025430680886,
674
+ "grad_norm": 0.1308068484067917,
675
+ "learning_rate": 0.0003704752218870861,
676
+ "loss": 0.2151,
677
+ "step": 1525
678
+ },
679
+ {
680
+ "epoch": 1.2715340442986054,
681
+ "grad_norm": 0.11254964768886566,
682
+ "learning_rate": 0.00035327411461706025,
683
+ "loss": 0.1971,
684
+ "step": 1550
685
+ },
686
+ {
687
+ "epoch": 1.2715340442986054,
688
+ "eval_loss": 0.22179578244686127,
689
+ "eval_runtime": 19.4274,
690
+ "eval_samples_per_second": 51.474,
691
+ "eval_steps_per_second": 0.824,
692
+ "step": 1550
693
+ },
694
+ {
695
+ "epoch": 1.2920426579163249,
696
+ "grad_norm": 0.13904231786727905,
697
+ "learning_rate": 0.00033626101125499555,
698
+ "loss": 0.2065,
699
+ "step": 1575
700
+ },
701
+ {
702
+ "epoch": 1.3125512715340442,
703
+ "grad_norm": 0.11037751287221909,
704
+ "learning_rate": 0.0003194577111576333,
705
+ "loss": 0.2013,
706
+ "step": 1600
707
+ },
708
+ {
709
+ "epoch": 1.3125512715340442,
710
+ "eval_loss": 0.21995632350444794,
711
+ "eval_runtime": 19.4997,
712
+ "eval_samples_per_second": 51.283,
713
+ "eval_steps_per_second": 0.821,
714
+ "step": 1600
715
+ },
716
+ {
717
+ "epoch": 1.3330598851517639,
718
+ "grad_norm": 0.10675220936536789,
719
+ "learning_rate": 0.00030288574485502756,
720
+ "loss": 0.2017,
721
+ "step": 1625
722
+ },
723
+ {
724
+ "epoch": 1.3535684987694832,
725
+ "grad_norm": 0.11563979089260101,
726
+ "learning_rate": 0.00028656634646288565,
727
+ "loss": 0.2033,
728
+ "step": 1650
729
+ },
730
+ {
731
+ "epoch": 1.3535684987694832,
732
+ "eval_loss": 0.21779730916023254,
733
+ "eval_runtime": 19.4741,
734
+ "eval_samples_per_second": 51.35,
735
+ "eval_steps_per_second": 0.822,
736
+ "step": 1650
737
+ },
738
+ {
739
+ "epoch": 1.3740771123872026,
740
+ "grad_norm": 0.13573266565799713,
741
+ "learning_rate": 0.00027052042647471254,
742
+ "loss": 0.1916,
743
+ "step": 1675
744
+ },
745
+ {
746
+ "epoch": 1.3945857260049221,
747
+ "grad_norm": 0.12692494690418243,
748
+ "learning_rate": 0.0002547685449686206,
749
+ "loss": 0.1999,
750
+ "step": 1700
751
+ },
752
+ {
753
+ "epoch": 1.3945857260049221,
754
+ "eval_loss": 0.21664589643478394,
755
+ "eval_runtime": 19.4774,
756
+ "eval_samples_per_second": 51.341,
757
+ "eval_steps_per_second": 0.821,
758
+ "step": 1700
759
+ },
760
+ {
761
+ "epoch": 1.4150943396226414,
762
+ "grad_norm": 0.14164209365844727,
763
+ "learning_rate": 0.0002393308852631373,
764
+ "loss": 0.2108,
765
+ "step": 1725
766
+ },
767
+ {
768
+ "epoch": 1.435602953240361,
769
+ "grad_norm": 0.12433931976556778,
770
+ "learning_rate": 0.0002242272280557645,
771
+ "loss": 0.2056,
772
+ "step": 1750
773
+ },
774
+ {
775
+ "epoch": 1.435602953240361,
776
+ "eval_loss": 0.2161126285791397,
777
+ "eval_runtime": 19.4746,
778
+ "eval_samples_per_second": 51.349,
779
+ "eval_steps_per_second": 0.822,
780
+ "step": 1750
781
+ },
782
+ {
783
+ "epoch": 1.4561115668580804,
784
+ "grad_norm": 0.1255362182855606,
785
+ "learning_rate": 0.0002094769260774262,
786
+ "loss": 0.2021,
787
+ "step": 1775
788
+ },
789
+ {
790
+ "epoch": 1.4766201804758,
791
+ "grad_norm": 0.12211991846561432,
792
+ "learning_rate": 0.00019509887929528458,
793
+ "loss": 0.2056,
794
+ "step": 1800
795
+ },
796
+ {
797
+ "epoch": 1.4766201804758,
798
+ "eval_loss": 0.21495358645915985,
799
+ "eval_runtime": 19.4956,
800
+ "eval_samples_per_second": 51.294,
801
+ "eval_steps_per_second": 0.821,
802
+ "step": 1800
803
+ },
804
+ {
805
+ "epoch": 1.4971287940935194,
806
+ "grad_norm": 0.13951410353183746,
807
+ "learning_rate": 0.0001811115106956918,
808
+ "loss": 0.1964,
809
+ "step": 1825
810
+ },
811
+ {
812
+ "epoch": 1.5176374077112387,
813
+ "grad_norm": 0.13265341520309448,
814
+ "learning_rate": 0.00016753274267831115,
815
+ "loss": 0.206,
816
+ "step": 1850
817
+ },
818
+ {
819
+ "epoch": 1.5176374077112387,
820
+ "eval_loss": 0.21398000419139862,
821
+ "eval_runtime": 19.4829,
822
+ "eval_samples_per_second": 51.327,
823
+ "eval_steps_per_second": 0.821,
824
+ "step": 1850
825
+ },
826
+ {
827
+ "epoch": 1.5381460213289582,
828
+ "grad_norm": 0.13726522028446198,
829
+ "learning_rate": 0.00015437997409165478,
830
+ "loss": 0.1964,
831
+ "step": 1875
832
+ },
833
+ {
834
+ "epoch": 1.5586546349466777,
835
+ "grad_norm": 0.11551380902528763,
836
+ "learning_rate": 0.00014167005793946035,
837
+ "loss": 0.1952,
838
+ "step": 1900
839
+ },
840
+ {
841
+ "epoch": 1.5586546349466777,
842
+ "eval_loss": 0.2141638696193695,
843
+ "eval_runtime": 19.4999,
844
+ "eval_samples_per_second": 51.282,
845
+ "eval_steps_per_second": 0.821,
846
+ "step": 1900
847
+ },
848
+ {
849
+ "epoch": 1.579163248564397,
850
+ "grad_norm": 0.13091330230236053,
851
+ "learning_rate": 0.00012941927978647527,
852
+ "loss": 0.1976,
853
+ "step": 1925
854
+ },
855
+ {
856
+ "epoch": 1.5996718621821167,
857
+ "grad_norm": 0.11574764549732208,
858
+ "learning_rate": 0.00011764333689131385,
859
+ "loss": 0.2036,
860
+ "step": 1950
861
+ },
862
+ {
863
+ "epoch": 1.5996718621821167,
864
+ "eval_loss": 0.2131749391555786,
865
+ "eval_runtime": 19.4949,
866
+ "eval_samples_per_second": 51.295,
867
+ "eval_steps_per_second": 0.821,
868
+ "step": 1950
869
+ },
870
+ {
871
+ "epoch": 1.620180475799836,
872
+ "grad_norm": 0.11186928302049637,
873
+ "learning_rate": 0.00010635731809312993,
874
+ "loss": 0.1993,
875
+ "step": 1975
876
+ },
877
+ {
878
+ "epoch": 1.6406890894175554,
879
+ "grad_norm": 0.13052970170974731,
880
+ "learning_rate": 9.557568447787201e-05,
881
+ "loss": 0.1989,
882
+ "step": 2000
883
+ },
884
+ {
885
+ "epoch": 1.6406890894175554,
886
+ "eval_loss": 0.21262580156326294,
887
+ "eval_runtime": 19.4819,
888
+ "eval_samples_per_second": 51.33,
889
+ "eval_steps_per_second": 0.821,
890
+ "step": 2000
891
+ },
892
+ {
893
+ "epoch": 1.661197703035275,
894
+ "grad_norm": 0.11890964955091476,
895
+ "learning_rate": 8.531225084889654e-05,
896
+ "loss": 0.1925,
897
+ "step": 2025
898
+ },
899
+ {
900
+ "epoch": 1.6817063166529942,
901
+ "grad_norm": 0.13581904768943787,
902
+ "learning_rate": 7.558016802568091e-05,
903
+ "loss": 0.1966,
904
+ "step": 2050
905
+ },
906
+ {
907
+ "epoch": 1.6817063166529942,
908
+ "eval_loss": 0.21162261068820953,
909
+ "eval_runtime": 19.4663,
910
+ "eval_samples_per_second": 51.371,
911
+ "eval_steps_per_second": 0.822,
912
+ "step": 2050
913
+ },
914
+ {
915
+ "epoch": 1.7022149302707137,
916
+ "grad_norm": 0.12677335739135742,
917
+ "learning_rate": 6.639190599331746e-05,
918
+ "loss": 0.1964,
919
+ "step": 2075
920
+ },
921
+ {
922
+ "epoch": 1.7227235438884332,
923
+ "grad_norm": 0.13097846508026123,
924
+ "learning_rate": 5.775923792437865e-05,
925
+ "loss": 0.1918,
926
+ "step": 2100
927
+ },
928
+ {
929
+ "epoch": 1.7227235438884332,
930
+ "eval_loss": 0.2108549326658249,
931
+ "eval_runtime": 19.5146,
932
+ "eval_samples_per_second": 51.244,
933
+ "eval_steps_per_second": 0.82,
934
+ "step": 2100
935
+ },
936
+ {
937
+ "epoch": 1.7432321575061525,
938
+ "grad_norm": 0.12936541438102722,
939
+ "learning_rate": 4.9693225093627616e-05,
940
+ "loss": 0.1931,
941
+ "step": 2125
942
+ },
943
+ {
944
+ "epoch": 1.7637407711238722,
945
+ "grad_norm": 0.12237502634525299,
946
+ "learning_rate": 4.220420270490294e-05,
947
+ "loss": 0.1874,
948
+ "step": 2150
949
+ },
950
+ {
951
+ "epoch": 1.7637407711238722,
952
+ "eval_loss": 0.2103663980960846,
953
+ "eval_runtime": 19.4878,
954
+ "eval_samples_per_second": 51.314,
955
+ "eval_steps_per_second": 0.821,
956
+ "step": 2150
957
+ },
958
+ {
959
+ "epoch": 1.7842493847415914,
960
+ "grad_norm": 0.13166476786136627,
961
+ "learning_rate": 3.530176664833834e-05,
962
+ "loss": 0.1901,
963
+ "step": 2175
964
+ },
965
+ {
966
+ "epoch": 1.804757998359311,
967
+ "grad_norm": 0.14463044703006744,
968
+ "learning_rate": 2.8994761204884756e-05,
969
+ "loss": 0.1986,
970
+ "step": 2200
971
+ },
972
+ {
973
+ "epoch": 1.804757998359311,
974
+ "eval_loss": 0.2102488875389099,
975
+ "eval_runtime": 19.4852,
976
+ "eval_samples_per_second": 51.321,
977
+ "eval_steps_per_second": 0.821,
978
+ "step": 2200
979
+ },
980
+ {
981
+ "epoch": 1.8252666119770304,
982
+ "grad_norm": 0.1415223479270935,
983
+ "learning_rate": 2.329126771388995e-05,
984
+ "loss": 0.1903,
985
+ "step": 2225
986
+ },
987
+ {
988
+ "epoch": 1.8457752255947497,
989
+ "grad_norm": 0.11420800536870956,
990
+ "learning_rate": 1.8198594218256815e-05,
991
+ "loss": 0.1901,
992
+ "step": 2250
993
+ },
994
+ {
995
+ "epoch": 1.8457752255947497,
996
+ "eval_loss": 0.2102055698633194,
997
+ "eval_runtime": 19.4587,
998
+ "eval_samples_per_second": 51.391,
999
+ "eval_steps_per_second": 0.822,
1000
+ "step": 2250
1001
+ },
1002
+ {
1003
+ "epoch": 1.8662838392124692,
1004
+ "grad_norm": 0.1356535106897354,
1005
+ "learning_rate": 1.3723266100447052e-05,
1006
+ "loss": 0.1943,
1007
+ "step": 2275
1008
+ },
1009
+ {
1010
+ "epoch": 1.8867924528301887,
1011
+ "grad_norm": 0.15679548680782318,
1012
+ "learning_rate": 9.871017721329201e-06,
1013
+ "loss": 0.2016,
1014
+ "step": 2300
1015
+ },
1016
+ {
1017
+ "epoch": 1.8867924528301887,
1018
+ "eval_loss": 0.2100086808204651,
1019
+ "eval_runtime": 19.461,
1020
+ "eval_samples_per_second": 51.385,
1021
+ "eval_steps_per_second": 0.822,
1022
+ "step": 2300
1023
+ },
1024
+ {
1025
+ "epoch": 1.907301066447908,
1026
+ "grad_norm": 0.13155241310596466,
1027
+ "learning_rate": 6.646785072584871e-06,
1028
+ "loss": 0.1959,
1029
+ "step": 2325
1030
+ },
1031
+ {
1032
+ "epoch": 1.9278096800656277,
1033
+ "grad_norm": 0.11297722160816193,
1034
+ "learning_rate": 4.054699452086641e-06,
1035
+ "loss": 0.1904,
1036
+ "step": 2350
1037
+ },
1038
+ {
1039
+ "epoch": 1.9278096800656277,
1040
+ "eval_loss": 0.2098853588104248,
1041
+ "eval_runtime": 19.4561,
1042
+ "eval_samples_per_second": 51.398,
1043
+ "eval_steps_per_second": 0.822,
1044
+ "step": 2350
1045
+ },
1046
+ {
1047
+ "epoch": 1.948318293683347,
1048
+ "grad_norm": 0.13836362957954407,
1049
+ "learning_rate": 2.0980821703527886e-06,
1050
+ "loss": 0.2049,
1051
+ "step": 2375
1052
+ },
1053
+ {
1054
+ "epoch": 1.9688269073010665,
1055
+ "grad_norm": 0.12997499108314514,
1056
+ "learning_rate": 7.794402948607671e-07,
1057
+ "loss": 0.1999,
1058
+ "step": 2400
1059
+ },
1060
+ {
1061
+ "epoch": 1.9688269073010665,
1062
+ "eval_loss": 0.2098313570022583,
1063
+ "eval_runtime": 19.5038,
1064
+ "eval_samples_per_second": 51.272,
1065
+ "eval_steps_per_second": 0.82,
1066
+ "step": 2400
1067
+ },
1068
+ {
1069
+ "epoch": 1.989335520918786,
1070
+ "grad_norm": 0.14497987926006317,
1071
+ "learning_rate": 1.0046343767294853e-07,
1072
+ "loss": 0.1901,
1073
+ "step": 2425
1074
+ },
1075
+ {
1076
+ "epoch": 2.0,
1077
+ "step": 2438,
1078
+ "total_flos": 1.58523627405312e+18,
1079
+ "train_loss": 0.24394705248622253,
1080
+ "train_runtime": 3518.0931,
1081
+ "train_samples_per_second": 22.171,
1082
+ "train_steps_per_second": 0.693
1083
+ }
1084
+ ],
1085
+ "logging_steps": 25,
1086
+ "max_steps": 2438,
1087
+ "num_input_tokens_seen": 0,
1088
+ "num_train_epochs": 2,
1089
+ "save_steps": 500,
1090
+ "stateful_callbacks": {
1091
+ "TrainerControl": {
1092
+ "args": {
1093
+ "should_epoch_stop": false,
1094
+ "should_evaluate": false,
1095
+ "should_log": false,
1096
+ "should_save": true,
1097
+ "should_training_stop": true
1098
+ },
1099
+ "attributes": {}
1100
+ }
1101
+ },
1102
+ "total_flos": 1.58523627405312e+18,
1103
+ "train_batch_size": 32,
1104
+ "trial_name": null,
1105
+ "trial_params": null
1106
+ }
nl_tasks/run_all/run_exnr12/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
nl_tasks/run_all/run_exnr12/ft/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "</s>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "legacy": false,
44
+ "model_max_length": 512,
45
+ "pad_token": "[PAD]",
46
+ "padding_side": "right",
47
+ "sp_model_kwargs": {},
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "</s>",
50
+ "use_default_system_prompt": false
51
+ }
nl_tasks/run_all/run_exnr12/trainer_state.json ADDED
@@ -0,0 +1,1106 @@
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 50,
7
+ "global_step": 2438,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.020508613617719443,
14
+ "grad_norm": 0.22555650770664215,
15
+ "learning_rate": 9.836065573770491e-05,
16
+ "loss": 0.6774,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.04101722723543889,
21
+ "grad_norm": 0.18577826023101807,
22
+ "learning_rate": 0.00020081967213114754,
23
+ "loss": 0.4155,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.04101722723543889,
28
+ "eval_loss": 0.37306666374206543,
29
+ "eval_runtime": 22.912,
30
+ "eval_samples_per_second": 43.645,
31
+ "eval_steps_per_second": 0.698,
32
+ "step": 50
33
+ },
34
+ {
35
+ "epoch": 0.06152584085315833,
36
+ "grad_norm": 0.17923958599567413,
37
+ "learning_rate": 0.0003032786885245902,
38
+ "loss": 0.3703,
39
+ "step": 75
40
+ },
41
+ {
42
+ "epoch": 0.08203445447087777,
43
+ "grad_norm": 0.18290868401527405,
44
+ "learning_rate": 0.0004057377049180328,
45
+ "loss": 0.3501,
46
+ "step": 100
47
+ },
48
+ {
49
+ "epoch": 0.08203445447087777,
50
+ "eval_loss": 0.3294435441493988,
51
+ "eval_runtime": 22.6287,
52
+ "eval_samples_per_second": 44.192,
53
+ "eval_steps_per_second": 0.707,
54
+ "step": 100
55
+ },
56
+ {
57
+ "epoch": 0.10254306808859721,
58
+ "grad_norm": 0.1609077900648117,
59
+ "learning_rate": 0.0005081967213114754,
60
+ "loss": 0.3305,
61
+ "step": 125
62
+ },
63
+ {
64
+ "epoch": 0.12305168170631665,
65
+ "grad_norm": 0.1902703195810318,
66
+ "learning_rate": 0.000610655737704918,
67
+ "loss": 0.3159,
68
+ "step": 150
69
+ },
70
+ {
71
+ "epoch": 0.12305168170631665,
72
+ "eval_loss": 0.31451696157455444,
73
+ "eval_runtime": 22.6178,
74
+ "eval_samples_per_second": 44.213,
75
+ "eval_steps_per_second": 0.707,
76
+ "step": 150
77
+ },
78
+ {
79
+ "epoch": 0.1435602953240361,
80
+ "grad_norm": 0.2086164802312851,
81
+ "learning_rate": 0.0007131147540983607,
82
+ "loss": 0.3135,
83
+ "step": 175
84
+ },
85
+ {
86
+ "epoch": 0.16406890894175555,
87
+ "grad_norm": 0.22159546613693237,
88
+ "learning_rate": 0.0008155737704918033,
89
+ "loss": 0.3193,
90
+ "step": 200
91
+ },
92
+ {
93
+ "epoch": 0.16406890894175555,
94
+ "eval_loss": 0.30913859605789185,
95
+ "eval_runtime": 22.6205,
96
+ "eval_samples_per_second": 44.208,
97
+ "eval_steps_per_second": 0.707,
98
+ "step": 200
99
+ },
100
+ {
101
+ "epoch": 0.184577522559475,
102
+ "grad_norm": 2.3699522018432617,
103
+ "learning_rate": 0.0009180327868852459,
104
+ "loss": 0.4187,
105
+ "step": 225
106
+ },
107
+ {
108
+ "epoch": 0.20508613617719443,
109
+ "grad_norm": 0.542046070098877,
110
+ "learning_rate": 0.0009999871854116063,
111
+ "loss": 0.4345,
112
+ "step": 250
113
+ },
114
+ {
115
+ "epoch": 0.20508613617719443,
116
+ "eval_loss": 0.3142073452472687,
117
+ "eval_runtime": 22.564,
118
+ "eval_samples_per_second": 44.318,
119
+ "eval_steps_per_second": 0.709,
120
+ "step": 250
121
+ },
122
+ {
123
+ "epoch": 0.22559474979491387,
124
+ "grad_norm": 0.3608435392379761,
125
+ "learning_rate": 0.0009995387437838027,
126
+ "loss": 0.3233,
127
+ "step": 275
128
+ },
129
+ {
130
+ "epoch": 0.2461033634126333,
131
+ "grad_norm": 0.28521400690078735,
132
+ "learning_rate": 0.000998450229439693,
133
+ "loss": 0.3179,
134
+ "step": 300
135
+ },
136
+ {
137
+ "epoch": 0.2461033634126333,
138
+ "eval_loss": 0.30236080288887024,
139
+ "eval_runtime": 22.6232,
140
+ "eval_samples_per_second": 44.202,
141
+ "eval_steps_per_second": 0.707,
142
+ "step": 300
143
+ },
144
+ {
145
+ "epoch": 0.2666119770303528,
146
+ "grad_norm": 0.22246453166007996,
147
+ "learning_rate": 0.000996723037122612,
148
+ "loss": 0.3011,
149
+ "step": 325
150
+ },
151
+ {
152
+ "epoch": 0.2871205906480722,
153
+ "grad_norm": 0.24288132786750793,
154
+ "learning_rate": 0.0009943593799315263,
155
+ "loss": 0.3078,
156
+ "step": 350
157
+ },
158
+ {
159
+ "epoch": 0.2871205906480722,
160
+ "eval_loss": 0.2915167212486267,
161
+ "eval_runtime": 22.6232,
162
+ "eval_samples_per_second": 44.202,
163
+ "eval_steps_per_second": 0.707,
164
+ "step": 350
165
+ },
166
+ {
167
+ "epoch": 0.30762920426579166,
168
+ "grad_norm": 0.21969842910766602,
169
+ "learning_rate": 0.0009913622864853324,
170
+ "loss": 0.2936,
171
+ "step": 375
172
+ },
173
+ {
174
+ "epoch": 0.3281378178835111,
175
+ "grad_norm": 0.2624405026435852,
176
+ "learning_rate": 0.0009877355970422024,
177
+ "loss": 0.3019,
178
+ "step": 400
179
+ },
180
+ {
181
+ "epoch": 0.3281378178835111,
182
+ "eval_loss": 0.2813864052295685,
183
+ "eval_runtime": 22.6144,
184
+ "eval_samples_per_second": 44.22,
185
+ "eval_steps_per_second": 0.708,
186
+ "step": 400
187
+ },
188
+ {
189
+ "epoch": 0.34864643150123054,
190
+ "grad_norm": 0.18397915363311768,
191
+ "learning_rate": 0.0009834839585789559,
192
+ "loss": 0.2951,
193
+ "step": 425
194
+ },
195
+ {
196
+ "epoch": 0.36915504511895,
197
+ "grad_norm": 0.1632954329252243,
198
+ "learning_rate": 0.000978612818836762,
199
+ "loss": 0.2885,
200
+ "step": 450
201
+ },
202
+ {
203
+ "epoch": 0.36915504511895,
204
+ "eval_loss": 0.26967909932136536,
205
+ "eval_runtime": 22.5519,
206
+ "eval_samples_per_second": 44.342,
207
+ "eval_steps_per_second": 0.709,
208
+ "step": 450
209
+ },
210
+ {
211
+ "epoch": 0.3896636587366694,
212
+ "grad_norm": 0.17423078417778015,
213
+ "learning_rate": 0.0009731284193407981,
214
+ "loss": 0.2864,
215
+ "step": 475
216
+ },
217
+ {
218
+ "epoch": 0.41017227235438886,
219
+ "grad_norm": 0.15992988646030426,
220
+ "learning_rate": 0.0009670377874028117,
221
+ "loss": 0.2763,
222
+ "step": 500
223
+ },
224
+ {
225
+ "epoch": 0.41017227235438886,
226
+ "eval_loss": 0.2657691538333893,
227
+ "eval_runtime": 22.5303,
228
+ "eval_samples_per_second": 44.385,
229
+ "eval_steps_per_second": 0.71,
230
+ "step": 500
231
+ },
232
+ {
233
+ "epoch": 0.4306808859721083,
234
+ "grad_norm": 0.17642049491405487,
235
+ "learning_rate": 0.0009603487271168336,
236
+ "loss": 0.2804,
237
+ "step": 525
238
+ },
239
+ {
240
+ "epoch": 0.45118949958982774,
241
+ "grad_norm": 0.15310442447662354,
242
+ "learning_rate": 0.0009530698093595781,
243
+ "loss": 0.2738,
244
+ "step": 550
245
+ },
246
+ {
247
+ "epoch": 0.45118949958982774,
248
+ "eval_loss": 0.25949400663375854,
249
+ "eval_runtime": 22.5573,
250
+ "eval_samples_per_second": 44.332,
251
+ "eval_steps_per_second": 0.709,
252
+ "step": 550
253
+ },
254
+ {
255
+ "epoch": 0.4716981132075472,
256
+ "grad_norm": 0.16352593898773193,
257
+ "learning_rate": 0.0009452103608083418,
258
+ "loss": 0.2605,
259
+ "step": 575
260
+ },
261
+ {
262
+ "epoch": 0.4922067268252666,
263
+ "grad_norm": 0.16357450187206268,
264
+ "learning_rate": 0.0009367804519904775,
265
+ "loss": 0.2695,
266
+ "step": 600
267
+ },
268
+ {
269
+ "epoch": 0.4922067268252666,
270
+ "eval_loss": 0.2563272714614868,
271
+ "eval_runtime": 22.5256,
272
+ "eval_samples_per_second": 44.394,
273
+ "eval_steps_per_second": 0.71,
274
+ "step": 600
275
+ },
276
+ {
277
+ "epoch": 0.5127153404429861,
278
+ "grad_norm": 0.26224130392074585,
279
+ "learning_rate": 0.0009277908843797492,
280
+ "loss": 0.2572,
281
+ "step": 625
282
+ },
283
+ {
284
+ "epoch": 0.5332239540607056,
285
+ "grad_norm": 0.15522083640098572,
286
+ "learning_rate": 0.0009182531765561084,
287
+ "loss": 0.2557,
288
+ "step": 650
289
+ },
290
+ {
291
+ "epoch": 0.5332239540607056,
292
+ "eval_loss": 0.25189048051834106,
293
+ "eval_runtime": 22.5857,
294
+ "eval_samples_per_second": 44.276,
295
+ "eval_steps_per_second": 0.708,
296
+ "step": 650
297
+ },
298
+ {
299
+ "epoch": 0.5537325676784249,
300
+ "grad_norm": 0.1434183418750763,
301
+ "learning_rate": 0.0009081795494466201,
302
+ "loss": 0.2577,
303
+ "step": 675
304
+ },
305
+ {
306
+ "epoch": 0.5742411812961444,
307
+ "grad_norm": 0.1259659081697464,
308
+ "learning_rate": 0.0008975829106664539,
309
+ "loss": 0.251,
310
+ "step": 700
311
+ },
312
+ {
313
+ "epoch": 0.5742411812961444,
314
+ "eval_loss": 0.25064969062805176,
315
+ "eval_runtime": 22.5416,
316
+ "eval_samples_per_second": 44.362,
317
+ "eval_steps_per_second": 0.71,
318
+ "step": 700
319
+ },
320
+ {
321
+ "epoch": 0.5947497949138638,
322
+ "grad_norm": 0.11997288465499878,
323
+ "learning_rate": 0.0008864768379800017,
324
+ "loss": 0.2478,
325
+ "step": 725
326
+ },
327
+ {
328
+ "epoch": 0.6152584085315833,
329
+ "grad_norm": 0.130789652466774,
330
+ "learning_rate": 0.0008748755619033153,
331
+ "loss": 0.2566,
332
+ "step": 750
333
+ },
334
+ {
335
+ "epoch": 0.6152584085315833,
336
+ "eval_loss": 0.24806569516658783,
337
+ "eval_runtime": 22.5742,
338
+ "eval_samples_per_second": 44.298,
339
+ "eval_steps_per_second": 0.709,
340
+ "step": 750
341
+ },
342
+ {
343
+ "epoch": 0.6357670221493027,
344
+ "grad_norm": 0.14199994504451752,
345
+ "learning_rate": 0.000862793947470155,
346
+ "loss": 0.2606,
347
+ "step": 775
348
+ },
349
+ {
350
+ "epoch": 0.6562756357670222,
351
+ "grad_norm": 0.1386088728904724,
352
+ "learning_rate": 0.0008502474751850142,
353
+ "loss": 0.251,
354
+ "step": 800
355
+ },
356
+ {
357
+ "epoch": 0.6562756357670222,
358
+ "eval_loss": 0.24411910772323608,
359
+ "eval_runtime": 22.5531,
360
+ "eval_samples_per_second": 44.34,
361
+ "eval_steps_per_second": 0.709,
362
+ "step": 800
363
+ },
364
+ {
365
+ "epoch": 0.6767842493847416,
366
+ "grad_norm": 0.12433658540248871,
367
+ "learning_rate": 0.0008372522211875224,
368
+ "loss": 0.2469,
369
+ "step": 825
370
+ },
371
+ {
372
+ "epoch": 0.6972928630024611,
373
+ "grad_norm": 0.13002005219459534,
374
+ "learning_rate": 0.0008238248366536473,
375
+ "loss": 0.2591,
376
+ "step": 850
377
+ },
378
+ {
379
+ "epoch": 0.6972928630024611,
380
+ "eval_loss": 0.24230404198169708,
381
+ "eval_runtime": 22.5724,
382
+ "eval_samples_per_second": 44.302,
383
+ "eval_steps_per_second": 0.709,
384
+ "step": 850
385
+ },
386
+ {
387
+ "epoch": 0.7178014766201805,
388
+ "grad_norm": 0.10630059987306595,
389
+ "learning_rate": 0.0008099825264600842,
390
+ "loss": 0.253,
391
+ "step": 875
392
+ },
393
+ {
394
+ "epoch": 0.7383100902379,
395
+ "grad_norm": 0.11450506001710892,
396
+ "learning_rate": 0.0007957430271391761,
397
+ "loss": 0.2467,
398
+ "step": 900
399
+ },
400
+ {
401
+ "epoch": 0.7383100902379,
402
+ "eval_loss": 0.2398349493741989,
403
+ "eval_runtime": 22.5769,
404
+ "eval_samples_per_second": 44.293,
405
+ "eval_steps_per_second": 0.709,
406
+ "step": 900
407
+ },
408
+ {
409
+ "epoch": 0.7588187038556193,
410
+ "grad_norm": 0.11560004949569702,
411
+ "learning_rate": 0.0007811245841526062,
412
+ "loss": 0.2506,
413
+ "step": 925
414
+ },
415
+ {
416
+ "epoch": 0.7793273174733388,
417
+ "grad_norm": 0.129349485039711,
418
+ "learning_rate": 0.0007661459285129879,
419
+ "loss": 0.234,
420
+ "step": 950
421
+ },
422
+ {
423
+ "epoch": 0.7793273174733388,
424
+ "eval_loss": 0.2383035570383072,
425
+ "eval_runtime": 22.5608,
426
+ "eval_samples_per_second": 44.325,
427
+ "eval_steps_per_second": 0.709,
428
+ "step": 950
429
+ },
430
+ {
431
+ "epoch": 0.7998359310910582,
432
+ "grad_norm": 0.11788657307624817,
433
+ "learning_rate": 0.0007508262527833029,
434
+ "loss": 0.2461,
435
+ "step": 975
436
+ },
437
+ {
438
+ "epoch": 0.8203445447087777,
439
+ "grad_norm": 0.11253403127193451,
440
+ "learning_rate": 0.000735185186484943,
441
+ "loss": 0.2495,
442
+ "step": 1000
443
+ },
444
+ {
445
+ "epoch": 0.8203445447087777,
446
+ "eval_loss": 0.23681500554084778,
447
+ "eval_runtime": 22.5792,
448
+ "eval_samples_per_second": 44.288,
449
+ "eval_steps_per_second": 0.709,
450
+ "step": 1000
451
+ },
452
+ {
453
+ "epoch": 0.8408531583264971,
454
+ "grad_norm": 0.1057242602109909,
455
+ "learning_rate": 0.0007192427709458656,
456
+ "loss": 0.2366,
457
+ "step": 1025
458
+ },
459
+ {
460
+ "epoch": 0.8613617719442166,
461
+ "grad_norm": 0.12001299113035202,
462
+ "learning_rate": 0.0007030194336210887,
463
+ "loss": 0.2426,
464
+ "step": 1050
465
+ },
466
+ {
467
+ "epoch": 0.8613617719442166,
468
+ "eval_loss": 0.23558472096920013,
469
+ "eval_runtime": 22.572,
470
+ "eval_samples_per_second": 44.303,
471
+ "eval_steps_per_second": 0.709,
472
+ "step": 1050
473
+ },
474
+ {
475
+ "epoch": 0.881870385561936,
476
+ "grad_norm": 0.13502635061740875,
477
+ "learning_rate": 0.0006865359619184331,
478
+ "loss": 0.2471,
479
+ "step": 1075
480
+ },
481
+ {
482
+ "epoch": 0.9023789991796555,
483
+ "grad_norm": 0.10965543240308762,
484
+ "learning_rate": 0.0006698134765630434,
485
+ "loss": 0.2397,
486
+ "step": 1100
487
+ },
488
+ {
489
+ "epoch": 0.9023789991796555,
490
+ "eval_loss": 0.2340947687625885,
491
+ "eval_runtime": 22.599,
492
+ "eval_samples_per_second": 44.25,
493
+ "eval_steps_per_second": 0.708,
494
+ "step": 1100
495
+ },
496
+ {
497
+ "epoch": 0.9228876127973749,
498
+ "grad_norm": 0.09839494526386261,
499
+ "learning_rate": 0.0006528734045348248,
500
+ "loss": 0.2376,
501
+ "step": 1125
502
+ },
503
+ {
504
+ "epoch": 0.9433962264150944,
505
+ "grad_norm": 0.11483541131019592,
506
+ "learning_rate": 0.0006357374516134643,
507
+ "loss": 0.2516,
508
+ "step": 1150
509
+ },
510
+ {
511
+ "epoch": 0.9433962264150944,
512
+ "eval_loss": 0.23218606412410736,
513
+ "eval_runtime": 22.5646,
514
+ "eval_samples_per_second": 44.317,
515
+ "eval_steps_per_second": 0.709,
516
+ "step": 1150
517
+ },
518
+ {
519
+ "epoch": 0.9639048400328137,
520
+ "grad_norm": 0.11629696935415268,
521
+ "learning_rate": 0.0006184275745662179,
522
+ "loss": 0.2405,
523
+ "step": 1175
524
+ },
525
+ {
526
+ "epoch": 0.9844134536505332,
527
+ "grad_norm": 0.10627080500125885,
528
+ "learning_rate": 0.0006009659530141031,
529
+ "loss": 0.2434,
530
+ "step": 1200
531
+ },
532
+ {
533
+ "epoch": 0.9844134536505332,
534
+ "eval_loss": 0.2307899445295334,
535
+ "eval_runtime": 22.5891,
536
+ "eval_samples_per_second": 44.269,
537
+ "eval_steps_per_second": 0.708,
538
+ "step": 1200
539
+ },
540
+ {
541
+ "epoch": 1.0049220672682526,
542
+ "grad_norm": 0.10291340202093124,
543
+ "learning_rate": 0.0005833749610125402,
544
+ "loss": 0.2417,
545
+ "step": 1225
546
+ },
547
+ {
548
+ "epoch": 1.0254306808859721,
549
+ "grad_norm": 0.1100890040397644,
550
+ "learning_rate": 0.0005656771383828602,
551
+ "loss": 0.2157,
552
+ "step": 1250
553
+ },
554
+ {
555
+ "epoch": 1.0254306808859721,
556
+ "eval_loss": 0.23217763006687164,
557
+ "eval_runtime": 22.572,
558
+ "eval_samples_per_second": 44.303,
559
+ "eval_steps_per_second": 0.709,
560
+ "step": 1250
561
+ },
562
+ {
563
+ "epoch": 1.0459392945036916,
564
+ "grad_norm": 0.10660576075315475,
565
+ "learning_rate": 0.0005478951618314134,
566
+ "loss": 0.2115,
567
+ "step": 1275
568
+ },
569
+ {
570
+ "epoch": 1.066447908121411,
571
+ "grad_norm": 0.10155736654996872,
572
+ "learning_rate": 0.0005300518158932815,
573
+ "loss": 0.2131,
574
+ "step": 1300
575
+ },
576
+ {
577
+ "epoch": 1.066447908121411,
578
+ "eval_loss": 0.2298276424407959,
579
+ "eval_runtime": 22.5487,
580
+ "eval_samples_per_second": 44.348,
581
+ "eval_steps_per_second": 0.71,
582
+ "step": 1300
583
+ },
584
+ {
585
+ "epoch": 1.0869565217391304,
586
+ "grad_norm": 0.10922397673130035,
587
+ "learning_rate": 0.0005121699637378282,
588
+ "loss": 0.2115,
589
+ "step": 1325
590
+ },
591
+ {
592
+ "epoch": 1.1074651353568499,
593
+ "grad_norm": 0.11864089965820312,
594
+ "learning_rate": 0.0004942725178734903,
595
+ "loss": 0.2172,
596
+ "step": 1350
597
+ },
598
+ {
599
+ "epoch": 1.1074651353568499,
600
+ "eval_loss": 0.22882254421710968,
601
+ "eval_runtime": 22.5984,
602
+ "eval_samples_per_second": 44.251,
603
+ "eval_steps_per_second": 0.708,
604
+ "step": 1350
605
+ },
606
+ {
607
+ "epoch": 1.1279737489745694,
608
+ "grad_norm": 0.11816546320915222,
609
+ "learning_rate": 0.00047638241078935324,
610
+ "loss": 0.2144,
611
+ "step": 1375
612
+ },
613
+ {
614
+ "epoch": 1.1484823625922886,
615
+ "grad_norm": 0.115484818816185,
616
+ "learning_rate": 0.000458522565571121,
617
+ "loss": 0.2225,
618
+ "step": 1400
619
+ },
620
+ {
621
+ "epoch": 1.1484823625922886,
622
+ "eval_loss": 0.22774316370487213,
623
+ "eval_runtime": 22.573,
624
+ "eval_samples_per_second": 44.301,
625
+ "eval_steps_per_second": 0.709,
626
+ "step": 1400
627
+ },
628
+ {
629
+ "epoch": 1.1689909762100081,
630
+ "grad_norm": 0.10250715166330338,
631
+ "learning_rate": 0.00044071586652913767,
632
+ "loss": 0.206,
633
+ "step": 1425
634
+ },
635
+ {
636
+ "epoch": 1.1894995898277276,
637
+ "grad_norm": 0.11922137439250946,
638
+ "learning_rate": 0.0004229851298760915,
639
+ "loss": 0.2158,
640
+ "step": 1450
641
+ },
642
+ {
643
+ "epoch": 1.1894995898277276,
644
+ "eval_loss": 0.22773854434490204,
645
+ "eval_runtime": 22.5992,
646
+ "eval_samples_per_second": 44.249,
647
+ "eval_steps_per_second": 0.708,
648
+ "step": 1450
649
+ },
650
+ {
651
+ "epoch": 1.2100082034454471,
652
+ "grad_norm": 0.10833777487277985,
653
+ "learning_rate": 0.0004053530744919749,
654
+ "loss": 0.2158,
655
+ "step": 1475
656
+ },
657
+ {
658
+ "epoch": 1.2305168170631666,
659
+ "grad_norm": 0.11317677795886993,
660
+ "learning_rate": 0.0003878422928137597,
661
+ "loss": 0.2187,
662
+ "step": 1500
663
+ },
664
+ {
665
+ "epoch": 1.2305168170631666,
666
+ "eval_loss": 0.2261391282081604,
667
+ "eval_runtime": 22.6015,
668
+ "eval_samples_per_second": 44.245,
669
+ "eval_steps_per_second": 0.708,
670
+ "step": 1500
671
+ },
672
+ {
673
+ "epoch": 1.251025430680886,
674
+ "grad_norm": 0.11307208985090256,
675
+ "learning_rate": 0.0003704752218870861,
676
+ "loss": 0.2236,
677
+ "step": 1525
678
+ },
679
+ {
680
+ "epoch": 1.2715340442986054,
681
+ "grad_norm": 0.10417971014976501,
682
+ "learning_rate": 0.00035327411461706025,
683
+ "loss": 0.2054,
684
+ "step": 1550
685
+ },
686
+ {
687
+ "epoch": 1.2715340442986054,
688
+ "eval_loss": 0.2259172946214676,
689
+ "eval_runtime": 22.5368,
690
+ "eval_samples_per_second": 44.372,
691
+ "eval_steps_per_second": 0.71,
692
+ "step": 1550
693
+ },
694
+ {
695
+ "epoch": 1.2920426579163249,
696
+ "grad_norm": 0.11175528168678284,
697
+ "learning_rate": 0.00033626101125499555,
698
+ "loss": 0.2151,
699
+ "step": 1575
700
+ },
701
+ {
702
+ "epoch": 1.3125512715340442,
703
+ "grad_norm": 0.10208392888307571,
704
+ "learning_rate": 0.0003194577111576333,
705
+ "loss": 0.2113,
706
+ "step": 1600
707
+ },
708
+ {
709
+ "epoch": 1.3125512715340442,
710
+ "eval_loss": 0.2242567092180252,
711
+ "eval_runtime": 22.5352,
712
+ "eval_samples_per_second": 44.375,
713
+ "eval_steps_per_second": 0.71,
714
+ "step": 1600
715
+ },
716
+ {
717
+ "epoch": 1.3330598851517639,
718
+ "grad_norm": 0.10389668494462967,
719
+ "learning_rate": 0.00030288574485502756,
720
+ "loss": 0.2107,
721
+ "step": 1625
722
+ },
723
+ {
724
+ "epoch": 1.3535684987694832,
725
+ "grad_norm": 0.10494068264961243,
726
+ "learning_rate": 0.00028656634646288565,
727
+ "loss": 0.2119,
728
+ "step": 1650
729
+ },
730
+ {
731
+ "epoch": 1.3535684987694832,
732
+ "eval_loss": 0.22249621152877808,
733
+ "eval_runtime": 22.5187,
734
+ "eval_samples_per_second": 44.408,
735
+ "eval_steps_per_second": 0.711,
736
+ "step": 1650
737
+ },
738
+ {
739
+ "epoch": 1.3740771123872026,
740
+ "grad_norm": 0.11896856129169464,
741
+ "learning_rate": 0.00027052042647471254,
742
+ "loss": 0.2009,
743
+ "step": 1675
744
+ },
745
+ {
746
+ "epoch": 1.3945857260049221,
747
+ "grad_norm": 0.10503373295068741,
748
+ "learning_rate": 0.0002547685449686206,
749
+ "loss": 0.2095,
750
+ "step": 1700
751
+ },
752
+ {
753
+ "epoch": 1.3945857260049221,
754
+ "eval_loss": 0.22149282693862915,
755
+ "eval_runtime": 22.5508,
756
+ "eval_samples_per_second": 44.344,
757
+ "eval_steps_per_second": 0.71,
758
+ "step": 1700
759
+ },
760
+ {
761
+ "epoch": 1.4150943396226414,
762
+ "grad_norm": 0.11639218777418137,
763
+ "learning_rate": 0.0002393308852631373,
764
+ "loss": 0.2215,
765
+ "step": 1725
766
+ },
767
+ {
768
+ "epoch": 1.435602953240361,
769
+ "grad_norm": 0.11435659229755402,
770
+ "learning_rate": 0.0002242272280557645,
771
+ "loss": 0.2155,
772
+ "step": 1750
773
+ },
774
+ {
775
+ "epoch": 1.435602953240361,
776
+ "eval_loss": 0.2211078256368637,
777
+ "eval_runtime": 22.5499,
778
+ "eval_samples_per_second": 44.346,
779
+ "eval_steps_per_second": 0.71,
780
+ "step": 1750
781
+ },
782
+ {
783
+ "epoch": 1.4561115668580804,
784
+ "grad_norm": 0.11285687983036041,
785
+ "learning_rate": 0.0002094769260774262,
786
+ "loss": 0.2113,
787
+ "step": 1775
788
+ },
789
+ {
790
+ "epoch": 1.4766201804758,
791
+ "grad_norm": 0.10831128060817719,
792
+ "learning_rate": 0.00019509887929528458,
793
+ "loss": 0.2154,
794
+ "step": 1800
795
+ },
796
+ {
797
+ "epoch": 1.4766201804758,
798
+ "eval_loss": 0.22003982961177826,
799
+ "eval_runtime": 22.5404,
800
+ "eval_samples_per_second": 44.365,
801
+ "eval_steps_per_second": 0.71,
802
+ "step": 1800
803
+ },
804
+ {
805
+ "epoch": 1.4971287940935194,
806
+ "grad_norm": 0.13930928707122803,
807
+ "learning_rate": 0.0001811115106956918,
808
+ "loss": 0.2061,
809
+ "step": 1825
810
+ },
811
+ {
812
+ "epoch": 1.5176374077112387,
813
+ "grad_norm": 0.11574142426252365,
814
+ "learning_rate": 0.00016753274267831115,
815
+ "loss": 0.2158,
816
+ "step": 1850
817
+ },
818
+ {
819
+ "epoch": 1.5176374077112387,
820
+ "eval_loss": 0.21911242604255676,
821
+ "eval_runtime": 22.62,
822
+ "eval_samples_per_second": 44.209,
823
+ "eval_steps_per_second": 0.707,
824
+ "step": 1850
825
+ },
826
+ {
827
+ "epoch": 1.5381460213289582,
828
+ "grad_norm": 0.1241801530122757,
829
+ "learning_rate": 0.00015437997409165478,
830
+ "loss": 0.2061,
831
+ "step": 1875
832
+ },
833
+ {
834
+ "epoch": 1.5586546349466777,
835
+ "grad_norm": 0.10792222619056702,
836
+ "learning_rate": 0.00014167005793946035,
837
+ "loss": 0.2055,
838
+ "step": 1900
839
+ },
840
+ {
841
+ "epoch": 1.5586546349466777,
842
+ "eval_loss": 0.21896851062774658,
843
+ "eval_runtime": 22.5247,
844
+ "eval_samples_per_second": 44.396,
845
+ "eval_steps_per_second": 0.71,
846
+ "step": 1900
847
+ },
848
+ {
849
+ "epoch": 1.579163248564397,
850
+ "grad_norm": 0.12615908682346344,
851
+ "learning_rate": 0.00012941927978647527,
852
+ "loss": 0.2078,
853
+ "step": 1925
854
+ },
855
+ {
856
+ "epoch": 1.5996718621821167,
857
+ "grad_norm": 0.0955195277929306,
858
+ "learning_rate": 0.00011764333689131385,
859
+ "loss": 0.2135,
860
+ "step": 1950
861
+ },
862
+ {
863
+ "epoch": 1.5996718621821167,
864
+ "eval_loss": 0.21795088052749634,
865
+ "eval_runtime": 22.5428,
866
+ "eval_samples_per_second": 44.36,
867
+ "eval_steps_per_second": 0.71,
868
+ "step": 1950
869
+ },
870
+ {
871
+ "epoch": 1.620180475799836,
872
+ "grad_norm": 0.09828197956085205,
873
+ "learning_rate": 0.00010635731809312993,
874
+ "loss": 0.2098,
875
+ "step": 1975
876
+ },
877
+ {
878
+ "epoch": 1.6406890894175554,
879
+ "grad_norm": 0.11805617809295654,
880
+ "learning_rate": 9.557568447787201e-05,
881
+ "loss": 0.2099,
882
+ "step": 2000
883
+ },
884
+ {
885
+ "epoch": 1.6406890894175554,
886
+ "eval_loss": 0.21716704964637756,
887
+ "eval_runtime": 22.5329,
888
+ "eval_samples_per_second": 44.38,
889
+ "eval_steps_per_second": 0.71,
890
+ "step": 2000
891
+ },
892
+ {
893
+ "epoch": 1.661197703035275,
894
+ "grad_norm": 0.11299672722816467,
895
+ "learning_rate": 8.531225084889654e-05,
896
+ "loss": 0.2026,
897
+ "step": 2025
898
+ },
899
+ {
900
+ "epoch": 1.6817063166529942,
901
+ "grad_norm": 0.11950846761465073,
902
+ "learning_rate": 7.558016802568091e-05,
903
+ "loss": 0.2062,
904
+ "step": 2050
905
+ },
906
+ {
907
+ "epoch": 1.6817063166529942,
908
+ "eval_loss": 0.21671602129936218,
909
+ "eval_runtime": 22.5667,
910
+ "eval_samples_per_second": 44.313,
911
+ "eval_steps_per_second": 0.709,
912
+ "step": 2050
913
+ },
914
+ {
915
+ "epoch": 1.7022149302707137,
916
+ "grad_norm": 0.11061406135559082,
917
+ "learning_rate": 6.639190599331746e-05,
918
+ "loss": 0.2073,
919
+ "step": 2075
920
+ },
921
+ {
922
+ "epoch": 1.7227235438884332,
923
+ "grad_norm": 0.10908389836549759,
924
+ "learning_rate": 5.775923792437865e-05,
925
+ "loss": 0.2016,
926
+ "step": 2100
927
+ },
928
+ {
929
+ "epoch": 1.7227235438884332,
930
+ "eval_loss": 0.2159593254327774,
931
+ "eval_runtime": 22.5289,
932
+ "eval_samples_per_second": 44.387,
933
+ "eval_steps_per_second": 0.71,
934
+ "step": 2100
935
+ },
936
+ {
937
+ "epoch": 1.7432321575061525,
938
+ "grad_norm": 0.11665470898151398,
939
+ "learning_rate": 4.9693225093627616e-05,
940
+ "loss": 0.2029,
941
+ "step": 2125
942
+ },
943
+ {
944
+ "epoch": 1.7637407711238722,
945
+ "grad_norm": 0.11755326390266418,
946
+ "learning_rate": 4.220420270490294e-05,
947
+ "loss": 0.1974,
948
+ "step": 2150
949
+ },
950
+ {
951
+ "epoch": 1.7637407711238722,
952
+ "eval_loss": 0.21538600325584412,
953
+ "eval_runtime": 22.6212,
954
+ "eval_samples_per_second": 44.206,
955
+ "eval_steps_per_second": 0.707,
956
+ "step": 2150
957
+ },
958
+ {
959
+ "epoch": 1.7842493847415914,
960
+ "grad_norm": 0.12463412433862686,
961
+ "learning_rate": 3.530176664833834e-05,
962
+ "loss": 0.1999,
963
+ "step": 2175
964
+ },
965
+ {
966
+ "epoch": 1.804757998359311,
967
+ "grad_norm": 0.13732177019119263,
968
+ "learning_rate": 2.8994761204884756e-05,
969
+ "loss": 0.2084,
970
+ "step": 2200
971
+ },
972
+ {
973
+ "epoch": 1.804757998359311,
974
+ "eval_loss": 0.2153223305940628,
975
+ "eval_runtime": 22.5592,
976
+ "eval_samples_per_second": 44.328,
977
+ "eval_steps_per_second": 0.709,
978
+ "step": 2200
979
+ },
980
+ {
981
+ "epoch": 1.8252666119770304,
982
+ "grad_norm": 0.125973179936409,
983
+ "learning_rate": 2.329126771388995e-05,
984
+ "loss": 0.1998,
985
+ "step": 2225
986
+ },
987
+ {
988
+ "epoch": 1.8457752255947497,
989
+ "grad_norm": 0.10488861054182053,
990
+ "learning_rate": 1.8198594218256815e-05,
991
+ "loss": 0.2009,
992
+ "step": 2250
993
+ },
994
+ {
995
+ "epoch": 1.8457752255947497,
996
+ "eval_loss": 0.2151503562927246,
997
+ "eval_runtime": 22.5499,
998
+ "eval_samples_per_second": 44.346,
999
+ "eval_steps_per_second": 0.71,
1000
+ "step": 2250
1001
+ },
1002
+ {
1003
+ "epoch": 1.8662838392124692,
1004
+ "grad_norm": 0.11978505551815033,
1005
+ "learning_rate": 1.3723266100447052e-05,
1006
+ "loss": 0.204,
1007
+ "step": 2275
1008
+ },
1009
+ {
1010
+ "epoch": 1.8867924528301887,
1011
+ "grad_norm": 0.1408199667930603,
1012
+ "learning_rate": 9.871017721329201e-06,
1013
+ "loss": 0.2122,
1014
+ "step": 2300
1015
+ },
1016
+ {
1017
+ "epoch": 1.8867924528301887,
1018
+ "eval_loss": 0.21504977345466614,
1019
+ "eval_runtime": 22.618,
1020
+ "eval_samples_per_second": 44.213,
1021
+ "eval_steps_per_second": 0.707,
1022
+ "step": 2300
1023
+ },
1024
+ {
1025
+ "epoch": 1.907301066447908,
1026
+ "grad_norm": 0.11792837083339691,
1027
+ "learning_rate": 6.646785072584871e-06,
1028
+ "loss": 0.2068,
1029
+ "step": 2325
1030
+ },
1031
+ {
1032
+ "epoch": 1.9278096800656277,
1033
+ "grad_norm": 0.0992947518825531,
1034
+ "learning_rate": 4.054699452086641e-06,
1035
+ "loss": 0.2006,
1036
+ "step": 2350
1037
+ },
1038
+ {
1039
+ "epoch": 1.9278096800656277,
1040
+ "eval_loss": 0.21499131619930267,
1041
+ "eval_runtime": 22.5655,
1042
+ "eval_samples_per_second": 44.315,
1043
+ "eval_steps_per_second": 0.709,
1044
+ "step": 2350
1045
+ },
1046
+ {
1047
+ "epoch": 1.948318293683347,
1048
+ "grad_norm": 0.1237172856926918,
1049
+ "learning_rate": 2.0980821703527886e-06,
1050
+ "loss": 0.2153,
1051
+ "step": 2375
1052
+ },
1053
+ {
1054
+ "epoch": 1.9688269073010665,
1055
+ "grad_norm": 0.11623578518629074,
1056
+ "learning_rate": 7.794402948607671e-07,
1057
+ "loss": 0.2113,
1058
+ "step": 2400
1059
+ },
1060
+ {
1061
+ "epoch": 1.9688269073010665,
1062
+ "eval_loss": 0.21487931907176971,
1063
+ "eval_runtime": 22.5687,
1064
+ "eval_samples_per_second": 44.309,
1065
+ "eval_steps_per_second": 0.709,
1066
+ "step": 2400
1067
+ },
1068
+ {
1069
+ "epoch": 1.989335520918786,
1070
+ "grad_norm": 0.14077042043209076,
1071
+ "learning_rate": 1.0046343767294853e-07,
1072
+ "loss": 0.2009,
1073
+ "step": 2425
1074
+ },
1075
+ {
1076
+ "epoch": 2.0,
1077
+ "step": 2438,
1078
+ "total_flos": 1.58523627405312e+18,
1079
+ "train_loss": 0.2502285066053846,
1080
+ "train_runtime": 4294.1547,
1081
+ "train_samples_per_second": 18.164,
1082
+ "train_steps_per_second": 0.568
1083
+ }
1084
+ ],
1085
+ "logging_steps": 25,
1086
+ "max_steps": 2438,
1087
+ "num_input_tokens_seen": 0,
1088
+ "num_train_epochs": 2,
1089
+ "save_steps": 500,
1090
+ "stateful_callbacks": {
1091
+ "TrainerControl": {
1092
+ "args": {
1093
+ "should_epoch_stop": false,
1094
+ "should_evaluate": false,
1095
+ "should_log": false,
1096
+ "should_save": true,
1097
+ "should_training_stop": true
1098
+ },
1099
+ "attributes": {}
1100
+ }
1101
+ },
1102
+ "total_flos": 1.58523627405312e+18,
1103
+ "train_batch_size": 32,
1104
+ "trial_name": null,
1105
+ "trial_params": null
1106
+ }