nvan15 commited on
Commit
ade215c
·
verified ·
1 Parent(s): 27ff7e3

Batch upload part 8

Browse files
Files changed (50) hide show
  1. nl_tasks/exps/run_ex28/ft/adapter_config.json +18 -0
  2. nl_tasks/exps/run_ex28/ft/special_tokens_map.json +24 -0
  3. nl_tasks/exps/run_ex28/ft/tokenizer.json +0 -0
  4. nl_tasks/exps/run_ex28/ft/tokenizer.model +3 -0
  5. nl_tasks/exps/run_ex28/ft/tokenizer_config.json +43 -0
  6. nl_tasks/exps/run_ex28/ft2/adapter_config.json +18 -0
  7. nl_tasks/exps/run_ex28/ft2/adapter_model.bin +3 -0
  8. nl_tasks/exps/run_ex29/ft/adapter_config.json +18 -0
  9. nl_tasks/exps/run_ex29/ft/special_tokens_map.json +24 -0
  10. nl_tasks/exps/run_ex29/ft/tokenizer.json +0 -0
  11. nl_tasks/exps/run_ex29/ft/tokenizer.model +3 -0
  12. nl_tasks/exps/run_ex29/ft/tokenizer_config.json +43 -0
  13. nl_tasks/exps/run_ex29/ft2/adapter_config.json +18 -0
  14. nl_tasks/exps/run_ex29/ft2/adapter_model.bin +3 -0
  15. nl_tasks/exps/run_ex29/trainer_state.json +505 -0
  16. nl_tasks/exps/run_ex30/ft/adapter_config.json +18 -0
  17. nl_tasks/exps/run_ex30/ft/special_tokens_map.json +24 -0
  18. nl_tasks/exps/run_ex30/ft/tokenizer.json +0 -0
  19. nl_tasks/exps/run_ex30/ft/tokenizer.model +3 -0
  20. nl_tasks/exps/run_ex30/ft/tokenizer_config.json +43 -0
  21. nl_tasks/exps/run_ex30/ft2/adapter_config.json +18 -0
  22. nl_tasks/exps/run_ex30/ft2/adapter_model.bin +3 -0
  23. nl_tasks/exps/run_ex30/trainer_state.json +505 -0
  24. nl_tasks/exps/run_ex31/ft/adapter_config.json +18 -0
  25. nl_tasks/exps/run_ex31/ft/special_tokens_map.json +24 -0
  26. nl_tasks/exps/run_ex31/ft/tokenizer.json +0 -0
  27. nl_tasks/exps/run_ex31/ft/tokenizer.model +3 -0
  28. nl_tasks/exps/run_ex31/ft/tokenizer_config.json +43 -0
  29. nl_tasks/exps/run_ex31/ft2/adapter_config.json +18 -0
  30. nl_tasks/exps/run_ex31/ft2/adapter_model.bin +3 -0
  31. nl_tasks/exps/run_ex31/trainer_state.json +743 -0
  32. nl_tasks/exps/run_ex32/ft/adapter_config.json +18 -0
  33. nl_tasks/exps/run_ex32/ft/special_tokens_map.json +24 -0
  34. nl_tasks/exps/run_ex32/ft/tokenizer.json +0 -0
  35. nl_tasks/exps/run_ex32/ft/tokenizer.model +3 -0
  36. nl_tasks/exps/run_ex32/ft/tokenizer_config.json +43 -0
  37. nl_tasks/exps/run_ex32/ft2/adapter_config.json +18 -0
  38. nl_tasks/exps/run_ex32/ft2/adapter_model.bin +3 -0
  39. nl_tasks/exps/run_ex32/trainer_state.json +743 -0
  40. nl_tasks/exps/run_ex33/ft/adapter_config.json +18 -0
  41. nl_tasks/exps/run_ex33/ft/special_tokens_map.json +24 -0
  42. nl_tasks/exps/run_ex33/ft/tokenizer.json +0 -0
  43. nl_tasks/exps/run_ex33/ft/tokenizer.model +3 -0
  44. nl_tasks/exps/run_ex33/ft/tokenizer_config.json +43 -0
  45. nl_tasks/exps/run_ex33/ft2/adapter_config.json +18 -0
  46. nl_tasks/exps/run_ex33/ft2/adapter_model.bin +3 -0
  47. nl_tasks/exps/run_ex33/trainer_state.json +743 -0
  48. nl_tasks/exps/run_ex34/gsm8k.txt +1 -0
  49. nl_tasks/exps/run_ex34/math.txt +1 -0
  50. nl_tasks/exps/run_ex34/trainer_state.json +743 -0
nl_tasks/exps/run_ex28/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": false,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "v_proj",
14
+ "q_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exps/run_ex28/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
nl_tasks/exps/run_ex28/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exps/run_ex28/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exps/run_ex28/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 512,
37
+ "pad_token": "<unk>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
nl_tasks/exps/run_ex28/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": true,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "v_proj",
14
+ "q_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exps/run_ex28/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4b2ff3c37a243e0a7907b8e6da8bde1c03c0404c3c881e0b71b1698879447d68
3
+ size 33602915
nl_tasks/exps/run_ex29/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": false,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "v_proj",
14
+ "q_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exps/run_ex29/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
nl_tasks/exps/run_ex29/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exps/run_ex29/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exps/run_ex29/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 512,
37
+ "pad_token": "<unk>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
nl_tasks/exps/run_ex29/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": true,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "v_proj",
14
+ "q_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exps/run_ex29/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84e3c739b20c3790118a8b7ea87a0218b5c9c9e771866690dea91b3c76edfd03
3
+ size 33602915
nl_tasks/exps/run_ex29/trainer_state.json ADDED
@@ -0,0 +1,505 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1668,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02997601918465228,
14
+ "grad_norm": 0.26481908559799194,
15
+ "learning_rate": 0.000718562874251497,
16
+ "loss": 0.5019,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.05995203836930456,
21
+ "grad_norm": 0.21658311784267426,
22
+ "learning_rate": 0.001467065868263473,
23
+ "loss": 0.3441,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.08992805755395683,
28
+ "grad_norm": 0.4752499461174011,
29
+ "learning_rate": 0.002215568862275449,
30
+ "loss": 0.3298,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 0.11990407673860912,
35
+ "grad_norm": 56.11571502685547,
36
+ "learning_rate": 0.002964071856287425,
37
+ "loss": 0.3863,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 0.1498800959232614,
42
+ "grad_norm": 0.24988949298858643,
43
+ "learning_rate": 0.003712574850299401,
44
+ "loss": 0.3536,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 0.17985611510791366,
49
+ "grad_norm": 0.23253102600574493,
50
+ "learning_rate": 0.004461077844311378,
51
+ "loss": 0.3441,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 0.20983213429256595,
56
+ "grad_norm": 0.20779232680797577,
57
+ "learning_rate": 0.0049997316901074056,
58
+ "loss": 0.3304,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 0.23980815347721823,
63
+ "grad_norm": 0.14326857030391693,
64
+ "learning_rate": 0.004994394866271345,
65
+ "loss": 0.3232,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 0.2697841726618705,
70
+ "grad_norm": 0.1106962114572525,
71
+ "learning_rate": 0.004982230184254933,
72
+ "loss": 0.3079,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 0.2997601918465228,
77
+ "grad_norm": 0.10388347506523132,
78
+ "learning_rate": 0.004963270942203842,
79
+ "loss": 0.2993,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 0.32973621103117506,
84
+ "grad_norm": 0.10831473022699356,
85
+ "learning_rate": 0.004937569036879761,
86
+ "loss": 0.289,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 0.3597122302158273,
91
+ "grad_norm": 0.10159999877214432,
92
+ "learning_rate": 0.004905194821604405,
93
+ "loss": 0.2792,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 0.38968824940047964,
98
+ "grad_norm": 0.09414353221654892,
99
+ "learning_rate": 0.004866236913682755,
100
+ "loss": 0.2742,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 0.4196642685851319,
105
+ "grad_norm": 0.08423851430416107,
106
+ "learning_rate": 0.004820801951832635,
107
+ "loss": 0.2746,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 0.44964028776978415,
112
+ "grad_norm": 0.10220842808485031,
113
+ "learning_rate": 0.004769014304284648,
114
+ "loss": 0.2689,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 0.47961630695443647,
119
+ "grad_norm": 0.07861992716789246,
120
+ "learning_rate": 0.0047110157283514545,
121
+ "loss": 0.2684,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 0.5095923261390888,
126
+ "grad_norm": 0.09534072130918503,
127
+ "learning_rate": 0.004646964982398253,
128
+ "loss": 0.2748,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 0.539568345323741,
133
+ "grad_norm": 0.06600063294172287,
134
+ "learning_rate": 0.0045770373912766265,
135
+ "loss": 0.2578,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 0.5695443645083933,
140
+ "grad_norm": 0.08592315763235092,
141
+ "learning_rate": 0.004501424366411254,
142
+ "loss": 0.2567,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 0.5995203836930456,
147
+ "grad_norm": 0.08367173373699188,
148
+ "learning_rate": 0.00442033288185318,
149
+ "loss": 0.2631,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 0.6294964028776978,
154
+ "grad_norm": 0.08196345716714859,
155
+ "learning_rate": 0.004333984907733788,
156
+ "loss": 0.2505,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 0.6594724220623501,
161
+ "grad_norm": 0.07102052867412567,
162
+ "learning_rate": 0.004242616802670323,
163
+ "loss": 0.2464,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 0.6894484412470024,
168
+ "grad_norm": 0.07556530088186264,
169
+ "learning_rate": 0.00414647866678607,
170
+ "loss": 0.2542,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 0.7194244604316546,
175
+ "grad_norm": 0.0706329271197319,
176
+ "learning_rate": 0.004045833657116195,
177
+ "loss": 0.2484,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 0.749400479616307,
182
+ "grad_norm": 0.07402704656124115,
183
+ "learning_rate": 0.003940957267273149,
184
+ "loss": 0.2453,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 0.7793764988009593,
189
+ "grad_norm": 0.06807030737400055,
190
+ "learning_rate": 0.0038321365733434,
191
+ "loss": 0.2431,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 0.8093525179856115,
196
+ "grad_norm": 0.07543069124221802,
197
+ "learning_rate": 0.0037196694480796876,
198
+ "loss": 0.2497,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 0.8393285371702638,
203
+ "grad_norm": 0.06862358748912811,
204
+ "learning_rate": 0.0036038637455397798,
205
+ "loss": 0.238,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 0.8693045563549161,
210
+ "grad_norm": 0.09762419760227203,
211
+ "learning_rate": 0.0034850364584035876,
212
+ "loss": 0.2339,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 0.8992805755395683,
217
+ "grad_norm": 0.0853116512298584,
218
+ "learning_rate": 0.0033635128502753193,
219
+ "loss": 0.241,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 0.9292565947242206,
224
+ "grad_norm": 0.05775105208158493,
225
+ "learning_rate": 0.00323962556534579,
226
+ "loss": 0.2377,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 0.9592326139088729,
231
+ "grad_norm": 0.06312242150306702,
232
+ "learning_rate": 0.003113713717851998,
233
+ "loss": 0.2371,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 0.9892086330935251,
238
+ "grad_norm": 0.06418934464454651,
239
+ "learning_rate": 0.0029861219638263694,
240
+ "loss": 0.2313,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 1.0191846522781776,
245
+ "grad_norm": 0.06555480509996414,
246
+ "learning_rate": 0.002857199557676555,
247
+ "loss": 0.2148,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 1.0491606714628297,
252
+ "grad_norm": 0.061830855906009674,
253
+ "learning_rate": 0.00272729939617819,
254
+ "loss": 0.203,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 1.079136690647482,
259
+ "grad_norm": 0.07122394442558289,
260
+ "learning_rate": 0.002596777052497456,
261
+ "loss": 0.2041,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 1.1091127098321343,
266
+ "grad_norm": 0.06675304472446442,
267
+ "learning_rate": 0.002465989802887632,
268
+ "loss": 0.21,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 1.1390887290167866,
273
+ "grad_norm": 0.06000453978776932,
274
+ "learning_rate": 0.0023352956487238063,
275
+ "loss": 0.2003,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 1.169064748201439,
280
+ "grad_norm": 0.05904003605246544,
281
+ "learning_rate": 0.002205052336552725,
282
+ "loss": 0.2035,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 1.1990407673860912,
287
+ "grad_norm": 0.07205251604318619,
288
+ "learning_rate": 0.0020756163788401825,
289
+ "loss": 0.205,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 1.2290167865707433,
294
+ "grad_norm": 0.06704974919557571,
295
+ "learning_rate": 0.0019473420780964405,
296
+ "loss": 0.2069,
297
+ "step": 1025
298
+ },
299
+ {
300
+ "epoch": 1.2589928057553956,
301
+ "grad_norm": 0.060501646250486374,
302
+ "learning_rate": 0.0018205805570509052,
303
+ "loss": 0.198,
304
+ "step": 1050
305
+ },
306
+ {
307
+ "epoch": 1.288968824940048,
308
+ "grad_norm": 0.05758596956729889,
309
+ "learning_rate": 0.0016956787975307614,
310
+ "loss": 0.1917,
311
+ "step": 1075
312
+ },
313
+ {
314
+ "epoch": 1.3189448441247003,
315
+ "grad_norm": 0.05682109296321869,
316
+ "learning_rate": 0.0015729786906744237,
317
+ "loss": 0.1914,
318
+ "step": 1100
319
+ },
320
+ {
321
+ "epoch": 1.3489208633093526,
322
+ "grad_norm": 0.06109858676791191,
323
+ "learning_rate": 0.0014528161010796171,
324
+ "loss": 0.196,
325
+ "step": 1125
326
+ },
327
+ {
328
+ "epoch": 1.3788968824940047,
329
+ "grad_norm": 0.06597461551427841,
330
+ "learning_rate": 0.0013355199474478,
331
+ "loss": 0.1897,
332
+ "step": 1150
333
+ },
334
+ {
335
+ "epoch": 1.4088729016786572,
336
+ "grad_norm": 0.060266848653554916,
337
+ "learning_rate": 0.0012214113022414447,
338
+ "loss": 0.1965,
339
+ "step": 1175
340
+ },
341
+ {
342
+ "epoch": 1.4388489208633093,
343
+ "grad_norm": 0.05543503537774086,
344
+ "learning_rate": 0.0011108025128186872,
345
+ "loss": 0.1816,
346
+ "step": 1200
347
+ },
348
+ {
349
+ "epoch": 1.4688249400479616,
350
+ "grad_norm": 0.06788609176874161,
351
+ "learning_rate": 0.001003996346451016,
352
+ "loss": 0.1887,
353
+ "step": 1225
354
+ },
355
+ {
356
+ "epoch": 1.498800959232614,
357
+ "grad_norm": 0.05910054221749306,
358
+ "learning_rate": 0.0009012851615643594,
359
+ "loss": 0.1916,
360
+ "step": 1250
361
+ },
362
+ {
363
+ "epoch": 1.5287769784172662,
364
+ "grad_norm": 0.06214448809623718,
365
+ "learning_rate": 0.0008029501074720933,
366
+ "loss": 0.1897,
367
+ "step": 1275
368
+ },
369
+ {
370
+ "epoch": 1.5587529976019185,
371
+ "grad_norm": 0.05667509138584137,
372
+ "learning_rate": 0.0007092603547905377,
373
+ "loss": 0.1823,
374
+ "step": 1300
375
+ },
376
+ {
377
+ "epoch": 1.5887290167865706,
378
+ "grad_norm": 0.0649266168475151,
379
+ "learning_rate": 0.000620472358643503,
380
+ "loss": 0.1877,
381
+ "step": 1325
382
+ },
383
+ {
384
+ "epoch": 1.6187050359712232,
385
+ "grad_norm": 0.054551344364881516,
386
+ "learning_rate": 0.000536829156672706,
387
+ "loss": 0.1821,
388
+ "step": 1350
389
+ },
390
+ {
391
+ "epoch": 1.6486810551558753,
392
+ "grad_norm": 0.060151200741529465,
393
+ "learning_rate": 0.00045855970377559676,
394
+ "loss": 0.188,
395
+ "step": 1375
396
+ },
397
+ {
398
+ "epoch": 1.6786570743405276,
399
+ "grad_norm": 0.05992837995290756,
400
+ "learning_rate": 0.00038587824539160486,
401
+ "loss": 0.185,
402
+ "step": 1400
403
+ },
404
+ {
405
+ "epoch": 1.70863309352518,
406
+ "grad_norm": 0.06002328544855118,
407
+ "learning_rate": 0.00031898373105229694,
408
+ "loss": 0.1823,
409
+ "step": 1425
410
+ },
411
+ {
412
+ "epoch": 1.738609112709832,
413
+ "grad_norm": 0.06145670637488365,
414
+ "learning_rate": 0.00025805926980072337,
415
+ "loss": 0.1877,
416
+ "step": 1450
417
+ },
418
+ {
419
+ "epoch": 1.7685851318944845,
420
+ "grad_norm": 0.051237791776657104,
421
+ "learning_rate": 0.00020327162897062267,
422
+ "loss": 0.1826,
423
+ "step": 1475
424
+ },
425
+ {
426
+ "epoch": 1.7985611510791366,
427
+ "grad_norm": 0.059376440942287445,
428
+ "learning_rate": 0.00015477077769746855,
429
+ "loss": 0.1837,
430
+ "step": 1500
431
+ },
432
+ {
433
+ "epoch": 1.828537170263789,
434
+ "grad_norm": 0.05060333386063576,
435
+ "learning_rate": 0.00011268947641089322,
436
+ "loss": 0.1786,
437
+ "step": 1525
438
+ },
439
+ {
440
+ "epoch": 1.8585131894484412,
441
+ "grad_norm": 0.06010892242193222,
442
+ "learning_rate": 7.714291343216635e-05,
443
+ "loss": 0.1854,
444
+ "step": 1550
445
+ },
446
+ {
447
+ "epoch": 1.8884892086330936,
448
+ "grad_norm": 0.06022082641720772,
449
+ "learning_rate": 4.822838967146054e-05,
450
+ "loss": 0.184,
451
+ "step": 1575
452
+ },
453
+ {
454
+ "epoch": 1.9184652278177459,
455
+ "grad_norm": 0.05504591017961502,
456
+ "learning_rate": 2.6025052287976248e-05,
457
+ "loss": 0.19,
458
+ "step": 1600
459
+ },
460
+ {
461
+ "epoch": 1.948441247002398,
462
+ "grad_norm": 0.0550151988863945,
463
+ "learning_rate": 1.0593678041975475e-05,
464
+ "loss": 0.1808,
465
+ "step": 1625
466
+ },
467
+ {
468
+ "epoch": 1.9784172661870505,
469
+ "grad_norm": 0.05710240826010704,
470
+ "learning_rate": 1.9765069317453923e-06,
471
+ "loss": 0.1844,
472
+ "step": 1650
473
+ },
474
+ {
475
+ "epoch": 2.0,
476
+ "step": 1668,
477
+ "total_flos": 1.62588235137024e+18,
478
+ "train_loss": 0.2374439179468498,
479
+ "train_runtime": 2227.387,
480
+ "train_samples_per_second": 35.917,
481
+ "train_steps_per_second": 0.749
482
+ }
483
+ ],
484
+ "logging_steps": 25,
485
+ "max_steps": 1668,
486
+ "num_input_tokens_seen": 0,
487
+ "num_train_epochs": 2,
488
+ "save_steps": 0,
489
+ "stateful_callbacks": {
490
+ "TrainerControl": {
491
+ "args": {
492
+ "should_epoch_stop": false,
493
+ "should_evaluate": false,
494
+ "should_log": false,
495
+ "should_save": true,
496
+ "should_training_stop": true
497
+ },
498
+ "attributes": {}
499
+ }
500
+ },
501
+ "total_flos": 1.62588235137024e+18,
502
+ "train_batch_size": 48,
503
+ "trial_name": null,
504
+ "trial_params": null
505
+ }
nl_tasks/exps/run_ex30/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": false,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "v_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exps/run_ex30/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
nl_tasks/exps/run_ex30/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exps/run_ex30/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exps/run_ex30/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 512,
37
+ "pad_token": "<unk>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
nl_tasks/exps/run_ex30/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": true,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "v_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exps/run_ex30/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37abc3b7865aedfe138803a372ca6148e64aa7084b6ae523203860321f217145
3
+ size 33602915
nl_tasks/exps/run_ex30/trainer_state.json ADDED
@@ -0,0 +1,505 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 500,
7
+ "global_step": 1668,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02997601918465228,
14
+ "grad_norm": 0.2027871459722519,
15
+ "learning_rate": 0.0001437125748502994,
16
+ "loss": 0.6037,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.05995203836930456,
21
+ "grad_norm": 0.2463991791009903,
22
+ "learning_rate": 0.0002934131736526946,
23
+ "loss": 0.3853,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.08992805755395683,
28
+ "grad_norm": 0.16277779638767242,
29
+ "learning_rate": 0.0004431137724550898,
30
+ "loss": 0.3367,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 0.11990407673860912,
35
+ "grad_norm": 0.19866418838500977,
36
+ "learning_rate": 0.000592814371257485,
37
+ "loss": 0.3121,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 0.1498800959232614,
42
+ "grad_norm": 0.1782834231853485,
43
+ "learning_rate": 0.0007425149700598802,
44
+ "loss": 0.3089,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 0.17985611510791366,
49
+ "grad_norm": 0.19668474793434143,
50
+ "learning_rate": 0.0008922155688622756,
51
+ "loss": 0.2998,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 0.20983213429256595,
56
+ "grad_norm": 0.20847776532173157,
57
+ "learning_rate": 0.000999946338021481,
58
+ "loss": 0.2978,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 0.23980815347721823,
63
+ "grad_norm": 0.24161750078201294,
64
+ "learning_rate": 0.000998878973254269,
65
+ "loss": 0.304,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 0.2697841726618705,
70
+ "grad_norm": 0.20520828664302826,
71
+ "learning_rate": 0.0009964460368509867,
72
+ "loss": 0.2982,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 0.2997601918465228,
77
+ "grad_norm": 0.205276221036911,
78
+ "learning_rate": 0.0009926541884407686,
79
+ "loss": 0.2948,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 0.32973621103117506,
84
+ "grad_norm": 0.1710120588541031,
85
+ "learning_rate": 0.000987513807375952,
86
+ "loss": 0.2866,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 0.3597122302158273,
91
+ "grad_norm": 0.18962617218494415,
92
+ "learning_rate": 0.000981038964320881,
93
+ "loss": 0.2766,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 0.38968824940047964,
98
+ "grad_norm": 0.19223880767822266,
99
+ "learning_rate": 0.0009732473827365509,
100
+ "loss": 0.2738,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 0.4196642685851319,
105
+ "grad_norm": 0.17323505878448486,
106
+ "learning_rate": 0.0009641603903665269,
107
+ "loss": 0.2747,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 0.44964028776978415,
112
+ "grad_norm": 0.2111186534166336,
113
+ "learning_rate": 0.0009538028608569297,
114
+ "loss": 0.2687,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 0.47961630695443647,
119
+ "grad_norm": 0.16343681514263153,
120
+ "learning_rate": 0.0009422031456702909,
121
+ "loss": 0.2695,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 0.5095923261390888,
126
+ "grad_norm": 0.166376531124115,
127
+ "learning_rate": 0.0009293929964796506,
128
+ "loss": 0.2764,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 0.539568345323741,
133
+ "grad_norm": 0.15445727109909058,
134
+ "learning_rate": 0.0009154074782553252,
135
+ "loss": 0.2592,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 0.5695443645083933,
140
+ "grad_norm": 0.19298841059207916,
141
+ "learning_rate": 0.0009002848732822509,
142
+ "loss": 0.2586,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 0.5995203836930456,
147
+ "grad_norm": 0.15150733292102814,
148
+ "learning_rate": 0.0008840665763706359,
149
+ "loss": 0.2642,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 0.6294964028776978,
154
+ "grad_norm": 0.1794758439064026,
155
+ "learning_rate": 0.0008667969815467577,
156
+ "loss": 0.2519,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 0.6594724220623501,
161
+ "grad_norm": 0.17440396547317505,
162
+ "learning_rate": 0.0008485233605340645,
163
+ "loss": 0.2473,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 0.6894484412470024,
168
+ "grad_norm": 0.1693456918001175,
169
+ "learning_rate": 0.000829295733357214,
170
+ "loss": 0.2554,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 0.7194244604316546,
175
+ "grad_norm": 0.21234950423240662,
176
+ "learning_rate": 0.0008091667314232391,
177
+ "loss": 0.2509,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 0.749400479616307,
182
+ "grad_norm": 0.16216659545898438,
183
+ "learning_rate": 0.0007881914534546298,
184
+ "loss": 0.2539,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 0.7793764988009593,
189
+ "grad_norm": 0.1589777022600174,
190
+ "learning_rate": 0.00076642731466868,
191
+ "loss": 0.2478,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 0.8093525179856115,
196
+ "grad_norm": 0.17090196907520294,
197
+ "learning_rate": 0.0007439338896159376,
198
+ "loss": 0.2526,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 0.8393285371702638,
203
+ "grad_norm": 0.1454530507326126,
204
+ "learning_rate": 0.000720772749107956,
205
+ "loss": 0.2407,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 0.8693045563549161,
210
+ "grad_norm": 0.1544404923915863,
211
+ "learning_rate": 0.0006970072916807175,
212
+ "loss": 0.2358,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 0.8992805755395683,
217
+ "grad_norm": 0.15039412677288055,
218
+ "learning_rate": 0.0006727025700550639,
219
+ "loss": 0.2416,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 0.9292565947242206,
224
+ "grad_norm": 0.13531458377838135,
225
+ "learning_rate": 0.000647925113069158,
226
+ "loss": 0.2396,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 0.9592326139088729,
231
+ "grad_norm": 0.13535469770431519,
232
+ "learning_rate": 0.0006227427435703996,
233
+ "loss": 0.2382,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 0.9892086330935251,
238
+ "grad_norm": 0.13635869324207306,
239
+ "learning_rate": 0.0005972243927652738,
240
+ "loss": 0.234,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 1.0191846522781776,
245
+ "grad_norm": 0.16282866895198822,
246
+ "learning_rate": 0.0005714399115353111,
247
+ "loss": 0.2181,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 1.0491606714628297,
252
+ "grad_norm": 0.15078669786453247,
253
+ "learning_rate": 0.0005454598792356381,
254
+ "loss": 0.2082,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 1.079136690647482,
259
+ "grad_norm": 0.14040178060531616,
260
+ "learning_rate": 0.0005193554104994912,
261
+ "loss": 0.2083,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 1.1091127098321343,
266
+ "grad_norm": 0.14513766765594482,
267
+ "learning_rate": 0.0004931979605775264,
268
+ "loss": 0.2137,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 1.1390887290167866,
273
+ "grad_norm": 0.14192743599414825,
274
+ "learning_rate": 0.0004670591297447613,
275
+ "loss": 0.2039,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 1.169064748201439,
280
+ "grad_norm": 0.14158278703689575,
281
+ "learning_rate": 0.00044101046731054495,
282
+ "loss": 0.2073,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 1.1990407673860912,
287
+ "grad_norm": 0.15080343186855316,
288
+ "learning_rate": 0.0004151232757680365,
289
+ "loss": 0.2089,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 1.2290167865707433,
294
+ "grad_norm": 0.16032980382442474,
295
+ "learning_rate": 0.0003894684156192881,
296
+ "loss": 0.2097,
297
+ "step": 1025
298
+ },
299
+ {
300
+ "epoch": 1.2589928057553956,
301
+ "grad_norm": 0.14257696270942688,
302
+ "learning_rate": 0.00036411611141018104,
303
+ "loss": 0.2013,
304
+ "step": 1050
305
+ },
306
+ {
307
+ "epoch": 1.288968824940048,
308
+ "grad_norm": 0.1491222381591797,
309
+ "learning_rate": 0.00033913575950615226,
310
+ "loss": 0.1949,
311
+ "step": 1075
312
+ },
313
+ {
314
+ "epoch": 1.3189448441247003,
315
+ "grad_norm": 0.13092097640037537,
316
+ "learning_rate": 0.00031459573813488474,
317
+ "loss": 0.1946,
318
+ "step": 1100
319
+ },
320
+ {
321
+ "epoch": 1.3489208633093526,
322
+ "grad_norm": 0.13922549784183502,
323
+ "learning_rate": 0.0002905632202159234,
324
+ "loss": 0.1991,
325
+ "step": 1125
326
+ },
327
+ {
328
+ "epoch": 1.3788968824940047,
329
+ "grad_norm": 0.13861505687236786,
330
+ "learning_rate": 0.00026710398948956,
331
+ "loss": 0.1921,
332
+ "step": 1150
333
+ },
334
+ {
335
+ "epoch": 1.4088729016786572,
336
+ "grad_norm": 0.14462125301361084,
337
+ "learning_rate": 0.00024428226044828893,
338
+ "loss": 0.1992,
339
+ "step": 1175
340
+ },
341
+ {
342
+ "epoch": 1.4388489208633093,
343
+ "grad_norm": 0.13747504353523254,
344
+ "learning_rate": 0.00022216050256373743,
345
+ "loss": 0.1848,
346
+ "step": 1200
347
+ },
348
+ {
349
+ "epoch": 1.4688249400479616,
350
+ "grad_norm": 0.1536317616701126,
351
+ "learning_rate": 0.00020079926929020321,
352
+ "loss": 0.1914,
353
+ "step": 1225
354
+ },
355
+ {
356
+ "epoch": 1.498800959232614,
357
+ "grad_norm": 0.1415477842092514,
358
+ "learning_rate": 0.00018025703231287188,
359
+ "loss": 0.1937,
360
+ "step": 1250
361
+ },
362
+ {
363
+ "epoch": 1.5287769784172662,
364
+ "grad_norm": 0.14675097167491913,
365
+ "learning_rate": 0.00016059002149441864,
366
+ "loss": 0.1934,
367
+ "step": 1275
368
+ },
369
+ {
370
+ "epoch": 1.5587529976019185,
371
+ "grad_norm": 0.13264699280261993,
372
+ "learning_rate": 0.00014185207095810754,
373
+ "loss": 0.1848,
374
+ "step": 1300
375
+ },
376
+ {
377
+ "epoch": 1.5887290167865706,
378
+ "grad_norm": 0.15923435986042023,
379
+ "learning_rate": 0.00012409447172870058,
380
+ "loss": 0.1909,
381
+ "step": 1325
382
+ },
383
+ {
384
+ "epoch": 1.6187050359712232,
385
+ "grad_norm": 0.12699192762374878,
386
+ "learning_rate": 0.00010736583133454119,
387
+ "loss": 0.1853,
388
+ "step": 1350
389
+ },
390
+ {
391
+ "epoch": 1.6486810551558753,
392
+ "grad_norm": 0.14546607434749603,
393
+ "learning_rate": 9.171194075511934e-05,
394
+ "loss": 0.1919,
395
+ "step": 1375
396
+ },
397
+ {
398
+ "epoch": 1.6786570743405276,
399
+ "grad_norm": 0.14308422803878784,
400
+ "learning_rate": 7.717564907832098e-05,
401
+ "loss": 0.1886,
402
+ "step": 1400
403
+ },
404
+ {
405
+ "epoch": 1.70863309352518,
406
+ "grad_norm": 0.1330956369638443,
407
+ "learning_rate": 6.379674621045939e-05,
408
+ "loss": 0.1856,
409
+ "step": 1425
410
+ },
411
+ {
412
+ "epoch": 1.738609112709832,
413
+ "grad_norm": 0.14610905945301056,
414
+ "learning_rate": 5.1611853960144674e-05,
415
+ "loss": 0.1923,
416
+ "step": 1450
417
+ },
418
+ {
419
+ "epoch": 1.7685851318944845,
420
+ "grad_norm": 0.12671244144439697,
421
+ "learning_rate": 4.0654325794124535e-05,
422
+ "loss": 0.1853,
423
+ "step": 1475
424
+ },
425
+ {
426
+ "epoch": 1.7985611510791366,
427
+ "grad_norm": 0.14322331547737122,
428
+ "learning_rate": 3.095415553949371e-05,
429
+ "loss": 0.1868,
430
+ "step": 1500
431
+ },
432
+ {
433
+ "epoch": 1.828537170263789,
434
+ "grad_norm": 0.1249406635761261,
435
+ "learning_rate": 2.2537895282178645e-05,
436
+ "loss": 0.1829,
437
+ "step": 1525
438
+ },
439
+ {
440
+ "epoch": 1.8585131894484412,
441
+ "grad_norm": 0.1507108360528946,
442
+ "learning_rate": 1.542858268643327e-05,
443
+ "loss": 0.1888,
444
+ "step": 1550
445
+ },
446
+ {
447
+ "epoch": 1.8884892086330936,
448
+ "grad_norm": 0.13775067031383514,
449
+ "learning_rate": 9.645677934292108e-06,
450
+ "loss": 0.1881,
451
+ "step": 1575
452
+ },
453
+ {
454
+ "epoch": 1.9184652278177459,
455
+ "grad_norm": 0.13463319838047028,
456
+ "learning_rate": 5.205010457595249e-06,
457
+ "loss": 0.1937,
458
+ "step": 1600
459
+ },
460
+ {
461
+ "epoch": 1.948441247002398,
462
+ "grad_norm": 0.13453663885593414,
463
+ "learning_rate": 2.118735608395095e-06,
464
+ "loss": 0.1846,
465
+ "step": 1625
466
+ },
467
+ {
468
+ "epoch": 1.9784172661870505,
469
+ "grad_norm": 0.15357571840286255,
470
+ "learning_rate": 3.953013863490784e-07,
471
+ "loss": 0.1877,
472
+ "step": 1650
473
+ },
474
+ {
475
+ "epoch": 2.0,
476
+ "step": 1668,
477
+ "total_flos": 1.62588235137024e+18,
478
+ "train_loss": 0.23850378386980053,
479
+ "train_runtime": 2220.9851,
480
+ "train_samples_per_second": 36.02,
481
+ "train_steps_per_second": 0.751
482
+ }
483
+ ],
484
+ "logging_steps": 25,
485
+ "max_steps": 1668,
486
+ "num_input_tokens_seen": 0,
487
+ "num_train_epochs": 2,
488
+ "save_steps": 0,
489
+ "stateful_callbacks": {
490
+ "TrainerControl": {
491
+ "args": {
492
+ "should_epoch_stop": false,
493
+ "should_evaluate": false,
494
+ "should_log": false,
495
+ "should_save": true,
496
+ "should_training_stop": true
497
+ },
498
+ "attributes": {}
499
+ }
500
+ },
501
+ "total_flos": 1.62588235137024e+18,
502
+ "train_batch_size": 48,
503
+ "trial_name": null,
504
+ "trial_params": null
505
+ }
nl_tasks/exps/run_ex31/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": false,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "v_proj",
14
+ "q_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exps/run_ex31/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
nl_tasks/exps/run_ex31/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exps/run_ex31/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exps/run_ex31/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 512,
37
+ "pad_token": "<unk>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
nl_tasks/exps/run_ex31/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": true,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "v_proj",
14
+ "q_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exps/run_ex31/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a02d042f5103673b43fce0e75a90ffc2bbd4c2dd3f028a6db285cf34c732bb6f
3
+ size 33602915
nl_tasks/exps/run_ex31/trainer_state.json ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2502,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02997601918465228,
14
+ "grad_norm": 0.237616166472435,
15
+ "learning_rate": 0.00047808764940239046,
16
+ "loss": 0.523,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.05995203836930456,
21
+ "grad_norm": 0.20422297716140747,
22
+ "learning_rate": 0.0009760956175298805,
23
+ "loss": 0.3493,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.08992805755395683,
28
+ "grad_norm": 0.21873657405376434,
29
+ "learning_rate": 0.0014741035856573707,
30
+ "loss": 0.3229,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 0.11990407673860912,
35
+ "grad_norm": 0.24540852010250092,
36
+ "learning_rate": 0.0019721115537848603,
37
+ "loss": 0.314,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 0.1498800959232614,
42
+ "grad_norm": 1.2509855031967163,
43
+ "learning_rate": 0.002470119521912351,
44
+ "loss": 0.3362,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 0.17985611510791366,
49
+ "grad_norm": 0.3144875168800354,
50
+ "learning_rate": 0.002968127490039841,
51
+ "loss": 0.3425,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 0.20983213429256595,
56
+ "grad_norm": 0.3264140486717224,
57
+ "learning_rate": 0.003466135458167331,
58
+ "loss": 0.3266,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 0.23980815347721823,
63
+ "grad_norm": 0.18573451042175293,
64
+ "learning_rate": 0.0039641434262948205,
65
+ "loss": 0.3298,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 0.2697841726618705,
70
+ "grad_norm": 0.18408645689487457,
71
+ "learning_rate": 0.004462151394422311,
72
+ "loss": 0.3179,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 0.2997601918465228,
77
+ "grad_norm": 0.15508218109607697,
78
+ "learning_rate": 0.0049601593625498005,
79
+ "loss": 0.3138,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 0.32973621103117506,
84
+ "grad_norm": 0.12099787592887878,
85
+ "learning_rate": 0.004998712114810764,
86
+ "loss": 0.3034,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 0.3597122302158273,
91
+ "grad_norm": 0.15490184724330902,
92
+ "learning_rate": 0.004994392376862353,
93
+ "loss": 0.2906,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 0.38968824940047964,
98
+ "grad_norm": 0.12329553812742233,
99
+ "learning_rate": 0.004987036305323271,
100
+ "loss": 0.283,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 0.4196642685851319,
105
+ "grad_norm": 0.1184345930814743,
106
+ "learning_rate": 0.0049766528544732515,
107
+ "loss": 0.2827,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 0.44964028776978415,
112
+ "grad_norm": 0.11834505200386047,
113
+ "learning_rate": 0.00496325466371133,
114
+ "loss": 0.2732,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 0.47961630695443647,
119
+ "grad_norm": 0.07786522805690765,
120
+ "learning_rate": 0.004946858042170361,
121
+ "loss": 0.2735,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 0.5095923261390888,
126
+ "grad_norm": 0.08665332198143005,
127
+ "learning_rate": 0.0049274829488645,
128
+ "loss": 0.2795,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 0.539568345323741,
133
+ "grad_norm": 0.07928116619586945,
134
+ "learning_rate": 0.004905152968393817,
135
+ "loss": 0.2609,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 0.5695443645083933,
140
+ "grad_norm": 0.12693190574645996,
141
+ "learning_rate": 0.004879895282235616,
142
+ "loss": 0.2617,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 0.5995203836930456,
147
+ "grad_norm": 0.07392635196447372,
148
+ "learning_rate": 0.0048517406356574115,
149
+ "loss": 0.2672,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 0.6294964028776978,
154
+ "grad_norm": 0.091416135430336,
155
+ "learning_rate": 0.0048207233002918164,
156
+ "loss": 0.256,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 0.6594724220623501,
161
+ "grad_norm": 0.08377746492624283,
162
+ "learning_rate": 0.004786881032418933,
163
+ "loss": 0.2511,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 0.6894484412470024,
168
+ "grad_norm": 0.06915393471717834,
169
+ "learning_rate": 0.004750255027006994,
170
+ "loss": 0.2589,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 0.7194244604316546,
175
+ "grad_norm": 0.07373099029064178,
176
+ "learning_rate": 0.004710889867567222,
177
+ "loss": 0.2518,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 0.749400479616307,
182
+ "grad_norm": 0.06648170202970505,
183
+ "learning_rate": 0.004668833471883931,
184
+ "loss": 0.249,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 0.7793764988009593,
189
+ "grad_norm": 0.06580448895692825,
190
+ "learning_rate": 0.0046241370336859424,
191
+ "loss": 0.2481,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 0.8093525179856115,
196
+ "grad_norm": 0.07079949229955673,
197
+ "learning_rate": 0.004576854960330311,
198
+ "loss": 0.2543,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 0.8393285371702638,
203
+ "grad_norm": 0.06271594017744064,
204
+ "learning_rate": 0.004527044806574219,
205
+ "loss": 0.2422,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 0.8693045563549161,
210
+ "grad_norm": 0.0618261955678463,
211
+ "learning_rate": 0.004474767204515652,
212
+ "loss": 0.2386,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 0.8992805755395683,
217
+ "grad_norm": 0.06375081837177277,
218
+ "learning_rate": 0.004420085789788137,
219
+ "loss": 0.2445,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 0.9292565947242206,
224
+ "grad_norm": 0.05429168790578842,
225
+ "learning_rate": 0.0043630671240993905,
226
+ "loss": 0.2422,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 0.9592326139088729,
231
+ "grad_norm": 0.05972912162542343,
232
+ "learning_rate": 0.0043037806142081645,
233
+ "loss": 0.2418,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 0.9892086330935251,
238
+ "grad_norm": 0.05979093909263611,
239
+ "learning_rate": 0.004242298427437903,
240
+ "loss": 0.2361,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 1.0191846522781776,
245
+ "grad_norm": 0.07171288132667542,
246
+ "learning_rate": 0.00417869540383007,
247
+ "loss": 0.2221,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 1.0491606714628297,
252
+ "grad_norm": 0.06082647666335106,
253
+ "learning_rate": 0.0041130489650440805,
254
+ "loss": 0.211,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 1.079136690647482,
259
+ "grad_norm": 0.05869077146053314,
260
+ "learning_rate": 0.004045439020114715,
261
+ "loss": 0.2123,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 1.1091127098321343,
266
+ "grad_norm": 0.059404339641332626,
267
+ "learning_rate": 0.003975947868181739,
268
+ "loss": 0.2193,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 1.1390887290167866,
273
+ "grad_norm": 0.05453066527843475,
274
+ "learning_rate": 0.0039046600983101355,
275
+ "loss": 0.2105,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 1.169064748201439,
280
+ "grad_norm": 0.05172204226255417,
281
+ "learning_rate": 0.0038316624865229088,
282
+ "loss": 0.2142,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 1.1990407673860912,
287
+ "grad_norm": 0.060828547924757004,
288
+ "learning_rate": 0.003757043890171755,
289
+ "loss": 0.2165,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 1.2290167865707433,
294
+ "grad_norm": 0.0641385167837143,
295
+ "learning_rate": 0.0036808951397742378,
296
+ "loss": 0.218,
297
+ "step": 1025
298
+ },
299
+ {
300
+ "epoch": 1.2589928057553956,
301
+ "grad_norm": 0.05649897828698158,
302
+ "learning_rate": 0.0036033089284490745,
303
+ "loss": 0.2094,
304
+ "step": 1050
305
+ },
306
+ {
307
+ "epoch": 1.288968824940048,
308
+ "grad_norm": 0.05249471217393875,
309
+ "learning_rate": 0.003524379699084162,
310
+ "loss": 0.2028,
311
+ "step": 1075
312
+ },
313
+ {
314
+ "epoch": 1.3189448441247003,
315
+ "grad_norm": 0.05551101639866829,
316
+ "learning_rate": 0.0034442035293746655,
317
+ "loss": 0.2037,
318
+ "step": 1100
319
+ },
320
+ {
321
+ "epoch": 1.3489208633093526,
322
+ "grad_norm": 0.05503613501787186,
323
+ "learning_rate": 0.003362878014871117,
324
+ "loss": 0.208,
325
+ "step": 1125
326
+ },
327
+ {
328
+ "epoch": 1.3788968824940047,
329
+ "grad_norm": 0.05301510915160179,
330
+ "learning_rate": 0.0032805021501798805,
331
+ "loss": 0.2012,
332
+ "step": 1150
333
+ },
334
+ {
335
+ "epoch": 1.4088729016786572,
336
+ "grad_norm": 0.056410036981105804,
337
+ "learning_rate": 0.0031971762084606003,
338
+ "loss": 0.2095,
339
+ "step": 1175
340
+ },
341
+ {
342
+ "epoch": 1.4388489208633093,
343
+ "grad_norm": 0.05041206628084183,
344
+ "learning_rate": 0.0031130016193673137,
345
+ "loss": 0.1943,
346
+ "step": 1200
347
+ },
348
+ {
349
+ "epoch": 1.4688249400479616,
350
+ "grad_norm": 0.05369720607995987,
351
+ "learning_rate": 0.003028080845581801,
352
+ "loss": 0.2021,
353
+ "step": 1225
354
+ },
355
+ {
356
+ "epoch": 1.498800959232614,
357
+ "grad_norm": 0.05225532501935959,
358
+ "learning_rate": 0.00294251725808947,
359
+ "loss": 0.2058,
360
+ "step": 1250
361
+ },
362
+ {
363
+ "epoch": 1.5287769784172662,
364
+ "grad_norm": 0.052846185863018036,
365
+ "learning_rate": 0.0028564150103495963,
366
+ "loss": 0.204,
367
+ "step": 1275
368
+ },
369
+ {
370
+ "epoch": 1.5587529976019185,
371
+ "grad_norm": 0.051086682826280594,
372
+ "learning_rate": 0.002769878911513086,
373
+ "loss": 0.1961,
374
+ "step": 1300
375
+ },
376
+ {
377
+ "epoch": 1.5887290167865706,
378
+ "grad_norm": 0.055970534682273865,
379
+ "learning_rate": 0.0026830142988420866,
380
+ "loss": 0.2012,
381
+ "step": 1325
382
+ },
383
+ {
384
+ "epoch": 1.6187050359712232,
385
+ "grad_norm": 0.0505131334066391,
386
+ "learning_rate": 0.0025959269094867525,
387
+ "loss": 0.1975,
388
+ "step": 1350
389
+ },
390
+ {
391
+ "epoch": 1.6486810551558753,
392
+ "grad_norm": 0.050360601395368576,
393
+ "learning_rate": 0.0025087227517752355,
394
+ "loss": 0.2029,
395
+ "step": 1375
396
+ },
397
+ {
398
+ "epoch": 1.6786570743405276,
399
+ "grad_norm": 0.0552959144115448,
400
+ "learning_rate": 0.0024215079761735793,
401
+ "loss": 0.1986,
402
+ "step": 1400
403
+ },
404
+ {
405
+ "epoch": 1.70863309352518,
406
+ "grad_norm": 0.04918622598052025,
407
+ "learning_rate": 0.0023343887460726058,
408
+ "loss": 0.1966,
409
+ "step": 1425
410
+ },
411
+ {
412
+ "epoch": 1.738609112709832,
413
+ "grad_norm": 0.05339549854397774,
414
+ "learning_rate": 0.0022474711085590524,
415
+ "loss": 0.2022,
416
+ "step": 1450
417
+ },
418
+ {
419
+ "epoch": 1.7685851318944845,
420
+ "grad_norm": 0.044814374297857285,
421
+ "learning_rate": 0.002160860865328295,
422
+ "loss": 0.1953,
423
+ "step": 1475
424
+ },
425
+ {
426
+ "epoch": 1.7985611510791366,
427
+ "grad_norm": 0.05152401328086853,
428
+ "learning_rate": 0.002074663443895771,
429
+ "loss": 0.1974,
430
+ "step": 1500
431
+ },
432
+ {
433
+ "epoch": 1.828537170263789,
434
+ "grad_norm": 0.04496421292424202,
435
+ "learning_rate": 0.001988983769263877,
436
+ "loss": 0.1926,
437
+ "step": 1525
438
+ },
439
+ {
440
+ "epoch": 1.8585131894484412,
441
+ "grad_norm": 0.0547536201775074,
442
+ "learning_rate": 0.001903926136200566,
443
+ "loss": 0.1992,
444
+ "step": 1550
445
+ },
446
+ {
447
+ "epoch": 1.8884892086330936,
448
+ "grad_norm": 0.04525403305888176,
449
+ "learning_rate": 0.0018195940822850927,
450
+ "loss": 0.1976,
451
+ "step": 1575
452
+ },
453
+ {
454
+ "epoch": 1.9184652278177459,
455
+ "grad_norm": 0.045630406588315964,
456
+ "learning_rate": 0.0017360902618754664,
457
+ "loss": 0.2022,
458
+ "step": 1600
459
+ },
460
+ {
461
+ "epoch": 1.948441247002398,
462
+ "grad_norm": 0.046241626143455505,
463
+ "learning_rate": 0.0016535163211510203,
464
+ "loss": 0.1926,
465
+ "step": 1625
466
+ },
467
+ {
468
+ "epoch": 1.9784172661870505,
469
+ "grad_norm": 0.04715004190802574,
470
+ "learning_rate": 0.0015719727743821854,
471
+ "loss": 0.1947,
472
+ "step": 1650
473
+ },
474
+ {
475
+ "epoch": 2.0083932853717026,
476
+ "grad_norm": 0.046405646950006485,
477
+ "learning_rate": 0.0014915588815781152,
478
+ "loss": 0.1849,
479
+ "step": 1675
480
+ },
481
+ {
482
+ "epoch": 2.038369304556355,
483
+ "grad_norm": 0.04901168495416641,
484
+ "learning_rate": 0.0014123725276610638,
485
+ "loss": 0.1587,
486
+ "step": 1700
487
+ },
488
+ {
489
+ "epoch": 2.068345323741007,
490
+ "grad_norm": 0.05803445354104042,
491
+ "learning_rate": 0.0013345101033146085,
492
+ "loss": 0.1605,
493
+ "step": 1725
494
+ },
495
+ {
496
+ "epoch": 2.0983213429256593,
497
+ "grad_norm": 0.05447980388998985,
498
+ "learning_rate": 0.0012580663876507647,
499
+ "loss": 0.1601,
500
+ "step": 1750
501
+ },
502
+ {
503
+ "epoch": 2.128297362110312,
504
+ "grad_norm": 0.05418948829174042,
505
+ "learning_rate": 0.0011831344328387986,
506
+ "loss": 0.1577,
507
+ "step": 1775
508
+ },
509
+ {
510
+ "epoch": 2.158273381294964,
511
+ "grad_norm": 0.055272314697504044,
512
+ "learning_rate": 0.0011098054508361854,
513
+ "loss": 0.1596,
514
+ "step": 1800
515
+ },
516
+ {
517
+ "epoch": 2.1882494004796165,
518
+ "grad_norm": 0.04978896677494049,
519
+ "learning_rate": 0.0010381687023596014,
520
+ "loss": 0.1634,
521
+ "step": 1825
522
+ },
523
+ {
524
+ "epoch": 2.2182254196642686,
525
+ "grad_norm": 0.052053723484277725,
526
+ "learning_rate": 0.0009683113882310735,
527
+ "loss": 0.1565,
528
+ "step": 1850
529
+ },
530
+ {
531
+ "epoch": 2.2482014388489207,
532
+ "grad_norm": 0.04874909296631813,
533
+ "learning_rate": 0.0009003185432315822,
534
+ "loss": 0.1597,
535
+ "step": 1875
536
+ },
537
+ {
538
+ "epoch": 2.278177458033573,
539
+ "grad_norm": 0.04999493435025215,
540
+ "learning_rate": 0.0008342729325912946,
541
+ "loss": 0.1554,
542
+ "step": 1900
543
+ },
544
+ {
545
+ "epoch": 2.3081534772182253,
546
+ "grad_norm": 0.051304448395967484,
547
+ "learning_rate": 0.0007702549512424437,
548
+ "loss": 0.1617,
549
+ "step": 1925
550
+ },
551
+ {
552
+ "epoch": 2.338129496402878,
553
+ "grad_norm": 0.04773577302694321,
554
+ "learning_rate": 0.0007083425259574896,
555
+ "loss": 0.1563,
556
+ "step": 1950
557
+ },
558
+ {
559
+ "epoch": 2.36810551558753,
560
+ "grad_norm": 0.04622693732380867,
561
+ "learning_rate": 0.0006486110204916776,
562
+ "loss": 0.1582,
563
+ "step": 1975
564
+ },
565
+ {
566
+ "epoch": 2.3980815347721824,
567
+ "grad_norm": 0.05061614140868187,
568
+ "learning_rate": 0.000591133143845462,
569
+ "loss": 0.1544,
570
+ "step": 2000
571
+ },
572
+ {
573
+ "epoch": 2.4280575539568345,
574
+ "grad_norm": 0.05210672691464424,
575
+ "learning_rate": 0.0005359788617584769,
576
+ "loss": 0.1575,
577
+ "step": 2025
578
+ },
579
+ {
580
+ "epoch": 2.4580335731414866,
581
+ "grad_norm": 0.049017682671546936,
582
+ "learning_rate": 0.00048321531154276706,
583
+ "loss": 0.1578,
584
+ "step": 2050
585
+ },
586
+ {
587
+ "epoch": 2.488009592326139,
588
+ "grad_norm": 0.061639346182346344,
589
+ "learning_rate": 0.0004329067203589709,
590
+ "loss": 0.1544,
591
+ "step": 2075
592
+ },
593
+ {
594
+ "epoch": 2.5179856115107913,
595
+ "grad_norm": 0.05240131914615631,
596
+ "learning_rate": 0.00038511432703492083,
597
+ "loss": 0.1568,
598
+ "step": 2100
599
+ },
600
+ {
601
+ "epoch": 2.547961630695444,
602
+ "grad_norm": 0.05182984471321106,
603
+ "learning_rate": 0.0003398963075218309,
604
+ "loss": 0.1567,
605
+ "step": 2125
606
+ },
607
+ {
608
+ "epoch": 2.577937649880096,
609
+ "grad_norm": 0.04628787934780121,
610
+ "learning_rate": 0.0002973077040788205,
611
+ "loss": 0.1528,
612
+ "step": 2150
613
+ },
614
+ {
615
+ "epoch": 2.6079136690647484,
616
+ "grad_norm": 0.056303806602954865,
617
+ "learning_rate": 0.00025740035827196165,
618
+ "loss": 0.1515,
619
+ "step": 2175
620
+ },
621
+ {
622
+ "epoch": 2.6378896882494005,
623
+ "grad_norm": 0.05215095728635788,
624
+ "learning_rate": 0.00022022284786941544,
625
+ "loss": 0.1527,
626
+ "step": 2200
627
+ },
628
+ {
629
+ "epoch": 2.6678657074340526,
630
+ "grad_norm": 0.04626120626926422,
631
+ "learning_rate": 0.00018582042770947467,
632
+ "loss": 0.1541,
633
+ "step": 2225
634
+ },
635
+ {
636
+ "epoch": 2.697841726618705,
637
+ "grad_norm": 0.04964889958500862,
638
+ "learning_rate": 0.0001542349746134855,
639
+ "loss": 0.1574,
640
+ "step": 2250
641
+ },
642
+ {
643
+ "epoch": 2.7278177458033572,
644
+ "grad_norm": 0.05072702094912529,
645
+ "learning_rate": 0.00012550493641070665,
646
+ "loss": 0.1609,
647
+ "step": 2275
648
+ },
649
+ {
650
+ "epoch": 2.7577937649880093,
651
+ "grad_norm": 0.048793647438287735,
652
+ "learning_rate": 9.966528513716072e-05,
653
+ "loss": 0.151,
654
+ "step": 2300
655
+ },
656
+ {
657
+ "epoch": 2.787769784172662,
658
+ "grad_norm": 0.04410620033740997,
659
+ "learning_rate": 7.674747446543756e-05,
660
+ "loss": 0.149,
661
+ "step": 2325
662
+ },
663
+ {
664
+ "epoch": 2.8177458033573144,
665
+ "grad_norm": 0.04964963719248772,
666
+ "learning_rate": 5.677940141727761e-05,
667
+ "loss": 0.1524,
668
+ "step": 2350
669
+ },
670
+ {
671
+ "epoch": 2.8477218225419665,
672
+ "grad_norm": 0.052818212658166885,
673
+ "learning_rate": 3.9785372405537756e-05,
674
+ "loss": 0.149,
675
+ "step": 2375
676
+ },
677
+ {
678
+ "epoch": 2.8776978417266186,
679
+ "grad_norm": 0.07118818908929825,
680
+ "learning_rate": 2.5786073646871523e-05,
681
+ "loss": 0.1528,
682
+ "step": 2400
683
+ },
684
+ {
685
+ "epoch": 2.907673860911271,
686
+ "grad_norm": 0.04659281671047211,
687
+ "learning_rate": 1.479854598114977e-05,
688
+ "loss": 0.1544,
689
+ "step": 2425
690
+ },
691
+ {
692
+ "epoch": 2.937649880095923,
693
+ "grad_norm": 0.04493272304534912,
694
+ "learning_rate": 6.836164128259103e-06,
695
+ "loss": 0.1504,
696
+ "step": 2450
697
+ },
698
+ {
699
+ "epoch": 2.9676258992805753,
700
+ "grad_norm": 0.06299802660942078,
701
+ "learning_rate": 1.908620407542472e-06,
702
+ "loss": 0.1501,
703
+ "step": 2475
704
+ },
705
+ {
706
+ "epoch": 2.997601918465228,
707
+ "grad_norm": 0.053529493510723114,
708
+ "learning_rate": 2.191293968722974e-08,
709
+ "loss": 0.152,
710
+ "step": 2500
711
+ },
712
+ {
713
+ "epoch": 3.0,
714
+ "step": 2502,
715
+ "total_flos": 2.43882352705536e+18,
716
+ "train_loss": 0.215125925130219,
717
+ "train_runtime": 3297.3883,
718
+ "train_samples_per_second": 36.392,
719
+ "train_steps_per_second": 0.759
720
+ }
721
+ ],
722
+ "logging_steps": 25,
723
+ "max_steps": 2502,
724
+ "num_input_tokens_seen": 0,
725
+ "num_train_epochs": 3,
726
+ "save_steps": 0,
727
+ "stateful_callbacks": {
728
+ "TrainerControl": {
729
+ "args": {
730
+ "should_epoch_stop": false,
731
+ "should_evaluate": false,
732
+ "should_log": false,
733
+ "should_save": true,
734
+ "should_training_stop": true
735
+ },
736
+ "attributes": {}
737
+ }
738
+ },
739
+ "total_flos": 2.43882352705536e+18,
740
+ "train_batch_size": 48,
741
+ "trial_name": null,
742
+ "trial_params": null
743
+ }
nl_tasks/exps/run_ex32/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": false,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "v_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exps/run_ex32/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
nl_tasks/exps/run_ex32/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exps/run_ex32/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exps/run_ex32/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 512,
37
+ "pad_token": "<unk>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
nl_tasks/exps/run_ex32/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": true,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "v_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exps/run_ex32/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e8b473fff55419f14a36da83dfd8d8f05944a51103982531702ea9c3fdd5c0c
3
+ size 33602915
nl_tasks/exps/run_ex32/trainer_state.json ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2502,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02997601918465228,
14
+ "grad_norm": 0.22439797222614288,
15
+ "learning_rate": 9.56175298804781e-05,
16
+ "loss": 0.634,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.05995203836930456,
21
+ "grad_norm": 0.21105553209781647,
22
+ "learning_rate": 0.0001952191235059761,
23
+ "loss": 0.4028,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.08992805755395683,
28
+ "grad_norm": 0.18460212647914886,
29
+ "learning_rate": 0.0002948207171314741,
30
+ "loss": 0.346,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 0.11990407673860912,
35
+ "grad_norm": 0.19811777770519257,
36
+ "learning_rate": 0.0003944223107569721,
37
+ "loss": 0.3192,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 0.1498800959232614,
42
+ "grad_norm": 0.18307138979434967,
43
+ "learning_rate": 0.0004940239043824702,
44
+ "loss": 0.3131,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 0.17985611510791366,
49
+ "grad_norm": 0.19494092464447021,
50
+ "learning_rate": 0.0005936254980079682,
51
+ "loss": 0.3019,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 0.20983213429256595,
56
+ "grad_norm": 0.19441217184066772,
57
+ "learning_rate": 0.0006932270916334662,
58
+ "loss": 0.2973,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 0.23980815347721823,
63
+ "grad_norm": 0.1927807629108429,
64
+ "learning_rate": 0.0007928286852589641,
65
+ "loss": 0.3038,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 0.2697841726618705,
70
+ "grad_norm": 0.17632770538330078,
71
+ "learning_rate": 0.0008924302788844621,
72
+ "loss": 0.2981,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 0.2997601918465228,
77
+ "grad_norm": 0.19236312806606293,
78
+ "learning_rate": 0.00099203187250996,
79
+ "loss": 0.2966,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 0.32973621103117506,
84
+ "grad_norm": 0.21083885431289673,
85
+ "learning_rate": 0.0009997424229621528,
86
+ "loss": 0.2933,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 0.3597122302158273,
91
+ "grad_norm": 0.21011164784431458,
92
+ "learning_rate": 0.0009988784753724707,
93
+ "loss": 0.343,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 0.38968824940047964,
98
+ "grad_norm": 0.2327512800693512,
99
+ "learning_rate": 0.0009974072610646543,
100
+ "loss": 0.2838,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 0.4196642685851319,
105
+ "grad_norm": 0.25379207730293274,
106
+ "learning_rate": 0.0009953305708946503,
107
+ "loss": 0.2835,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 0.44964028776978415,
112
+ "grad_norm": 0.21607662737369537,
113
+ "learning_rate": 0.000992650932742266,
114
+ "loss": 0.2739,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 0.47961630695443647,
119
+ "grad_norm": 0.17558318376541138,
120
+ "learning_rate": 0.0009893716084340722,
121
+ "loss": 0.2751,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 0.5095923261390888,
126
+ "grad_norm": 0.17286434769630432,
127
+ "learning_rate": 0.0009854965897729,
128
+ "loss": 0.2812,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 0.539568345323741,
133
+ "grad_norm": 0.15521785616874695,
134
+ "learning_rate": 0.0009810305936787634,
135
+ "loss": 0.2631,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 0.5695443645083933,
140
+ "grad_norm": 0.19885142147541046,
141
+ "learning_rate": 0.0009759790564471232,
142
+ "loss": 0.2634,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 0.5995203836930456,
147
+ "grad_norm": 0.16864049434661865,
148
+ "learning_rate": 0.0009703481271314822,
149
+ "loss": 0.2686,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 0.6294964028776978,
154
+ "grad_norm": 0.18147310614585876,
155
+ "learning_rate": 0.0009641446600583632,
156
+ "loss": 0.2565,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 0.6594724220623501,
161
+ "grad_norm": 0.1791866272687912,
162
+ "learning_rate": 0.0009573762064837866,
163
+ "loss": 0.2525,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 0.6894484412470024,
168
+ "grad_norm": 0.16333305835723877,
169
+ "learning_rate": 0.0009500510054013988,
170
+ "loss": 0.2599,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 0.7194244604316546,
175
+ "grad_norm": 0.15634645521640778,
176
+ "learning_rate": 0.0009421779735134444,
177
+ "loss": 0.2551,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 0.749400479616307,
182
+ "grad_norm": 0.16058474779129028,
183
+ "learning_rate": 0.0009337666943767861,
184
+ "loss": 0.2518,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 0.7793764988009593,
189
+ "grad_norm": 0.15423771739006042,
190
+ "learning_rate": 0.0009248274067371884,
191
+ "loss": 0.2507,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 0.8093525179856115,
196
+ "grad_norm": 0.16649393737316132,
197
+ "learning_rate": 0.0009153709920660622,
198
+ "loss": 0.256,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 0.8393285371702638,
203
+ "grad_norm": 0.14274722337722778,
204
+ "learning_rate": 0.0009054089613148438,
205
+ "loss": 0.2444,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 0.8693045563549161,
210
+ "grad_norm": 0.14566783607006073,
211
+ "learning_rate": 0.0008949534409031304,
212
+ "loss": 0.2406,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 0.8992805755395683,
217
+ "grad_norm": 0.16103540360927582,
218
+ "learning_rate": 0.0008840171579576273,
219
+ "loss": 0.2476,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 0.9292565947242206,
224
+ "grad_norm": 0.13278396427631378,
225
+ "learning_rate": 0.0008726134248198781,
226
+ "loss": 0.2444,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 0.9592326139088729,
231
+ "grad_norm": 0.13953395187854767,
232
+ "learning_rate": 0.000860756122841633,
233
+ "loss": 0.2429,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 0.9892086330935251,
238
+ "grad_norm": 0.15767574310302734,
239
+ "learning_rate": 0.0008484596854875805,
240
+ "loss": 0.2382,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 1.0191846522781776,
245
+ "grad_norm": 0.17023757100105286,
246
+ "learning_rate": 0.0008357390807660139,
247
+ "loss": 0.2247,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 1.0491606714628297,
252
+ "grad_norm": 0.1432039439678192,
253
+ "learning_rate": 0.0008226097930088161,
254
+ "loss": 0.2148,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 1.079136690647482,
259
+ "grad_norm": 0.14797061681747437,
260
+ "learning_rate": 0.0008090878040229431,
261
+ "loss": 0.2162,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 1.1091127098321343,
266
+ "grad_norm": 0.15065963566303253,
267
+ "learning_rate": 0.0007951895736363477,
268
+ "loss": 0.2234,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 1.1390887290167866,
273
+ "grad_norm": 0.13475900888442993,
274
+ "learning_rate": 0.0007809320196620271,
275
+ "loss": 0.2137,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 1.169064748201439,
280
+ "grad_norm": 0.1358369141817093,
281
+ "learning_rate": 0.0007663324973045817,
282
+ "loss": 0.2167,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 1.1990407673860912,
287
+ "grad_norm": 0.15405559539794922,
288
+ "learning_rate": 0.000751408778034351,
289
+ "loss": 0.2193,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 1.2290167865707433,
294
+ "grad_norm": 0.15572060644626617,
295
+ "learning_rate": 0.0007361790279548476,
296
+ "loss": 0.2207,
297
+ "step": 1025
298
+ },
299
+ {
300
+ "epoch": 1.2589928057553956,
301
+ "grad_norm": 0.14885199069976807,
302
+ "learning_rate": 0.0007206617856898149,
303
+ "loss": 0.2122,
304
+ "step": 1050
305
+ },
306
+ {
307
+ "epoch": 1.288968824940048,
308
+ "grad_norm": 0.13450340926647186,
309
+ "learning_rate": 0.0007048759398168324,
310
+ "loss": 0.2053,
311
+ "step": 1075
312
+ },
313
+ {
314
+ "epoch": 1.3189448441247003,
315
+ "grad_norm": 0.12750637531280518,
316
+ "learning_rate": 0.0006888407058749331,
317
+ "loss": 0.2059,
318
+ "step": 1100
319
+ },
320
+ {
321
+ "epoch": 1.3489208633093526,
322
+ "grad_norm": 0.1344868540763855,
323
+ "learning_rate": 0.0006725756029742234,
324
+ "loss": 0.2108,
325
+ "step": 1125
326
+ },
327
+ {
328
+ "epoch": 1.3788968824940047,
329
+ "grad_norm": 0.12938323616981506,
330
+ "learning_rate": 0.0006561004300359761,
331
+ "loss": 0.2038,
332
+ "step": 1150
333
+ },
334
+ {
335
+ "epoch": 1.4088729016786572,
336
+ "grad_norm": 0.13532765209674835,
337
+ "learning_rate": 0.00063943524169212,
338
+ "loss": 0.2107,
339
+ "step": 1175
340
+ },
341
+ {
342
+ "epoch": 1.4388489208633093,
343
+ "grad_norm": 0.1328439861536026,
344
+ "learning_rate": 0.0006226003238734627,
345
+ "loss": 0.1973,
346
+ "step": 1200
347
+ },
348
+ {
349
+ "epoch": 1.4688249400479616,
350
+ "grad_norm": 0.14724420011043549,
351
+ "learning_rate": 0.0006056161691163601,
352
+ "loss": 0.2038,
353
+ "step": 1225
354
+ },
355
+ {
356
+ "epoch": 1.498800959232614,
357
+ "grad_norm": 0.1372281312942505,
358
+ "learning_rate": 0.000588503451617894,
359
+ "loss": 0.2063,
360
+ "step": 1250
361
+ },
362
+ {
363
+ "epoch": 1.5287769784172662,
364
+ "grad_norm": 0.13836322724819183,
365
+ "learning_rate": 0.0005712830020699192,
366
+ "loss": 0.2056,
367
+ "step": 1275
368
+ },
369
+ {
370
+ "epoch": 1.5587529976019185,
371
+ "grad_norm": 0.12976917624473572,
372
+ "learning_rate": 0.0005539757823026172,
373
+ "loss": 0.1973,
374
+ "step": 1300
375
+ },
376
+ {
377
+ "epoch": 1.5887290167865706,
378
+ "grad_norm": 0.1546931117773056,
379
+ "learning_rate": 0.0005366028597684172,
380
+ "loss": 0.2028,
381
+ "step": 1325
382
+ },
383
+ {
384
+ "epoch": 1.6187050359712232,
385
+ "grad_norm": 0.12072357535362244,
386
+ "learning_rate": 0.0005191853818973506,
387
+ "loss": 0.1979,
388
+ "step": 1350
389
+ },
390
+ {
391
+ "epoch": 1.6486810551558753,
392
+ "grad_norm": 0.13212819397449493,
393
+ "learning_rate": 0.0005017445503550471,
394
+ "loss": 0.2051,
395
+ "step": 1375
396
+ },
397
+ {
398
+ "epoch": 1.6786570743405276,
399
+ "grad_norm": 0.13972344994544983,
400
+ "learning_rate": 0.00048430159523471587,
401
+ "loss": 0.1999,
402
+ "step": 1400
403
+ },
404
+ {
405
+ "epoch": 1.70863309352518,
406
+ "grad_norm": 0.13032999634742737,
407
+ "learning_rate": 0.00046687774921452113,
408
+ "loss": 0.1975,
409
+ "step": 1425
410
+ },
411
+ {
412
+ "epoch": 1.738609112709832,
413
+ "grad_norm": 0.1394297480583191,
414
+ "learning_rate": 0.00044949422171181047,
415
+ "loss": 0.2031,
416
+ "step": 1450
417
+ },
418
+ {
419
+ "epoch": 1.7685851318944845,
420
+ "grad_norm": 0.12275266647338867,
421
+ "learning_rate": 0.0004321721730656589,
422
+ "loss": 0.1966,
423
+ "step": 1475
424
+ },
425
+ {
426
+ "epoch": 1.7985611510791366,
427
+ "grad_norm": 0.12649331986904144,
428
+ "learning_rate": 0.0004149326887791541,
429
+ "loss": 0.1983,
430
+ "step": 1500
431
+ },
432
+ {
433
+ "epoch": 1.828537170263789,
434
+ "grad_norm": 0.11251500993967056,
435
+ "learning_rate": 0.0003977967538527754,
436
+ "loss": 0.1932,
437
+ "step": 1525
438
+ },
439
+ {
440
+ "epoch": 1.8585131894484412,
441
+ "grad_norm": 0.13119769096374512,
442
+ "learning_rate": 0.0003807852272401132,
443
+ "loss": 0.1995,
444
+ "step": 1550
445
+ },
446
+ {
447
+ "epoch": 1.8884892086330936,
448
+ "grad_norm": 0.12021032720804214,
449
+ "learning_rate": 0.0003639188164570185,
450
+ "loss": 0.1985,
451
+ "step": 1575
452
+ },
453
+ {
454
+ "epoch": 1.9184652278177459,
455
+ "grad_norm": 0.12251219153404236,
456
+ "learning_rate": 0.0003472180523750933,
457
+ "loss": 0.2041,
458
+ "step": 1600
459
+ },
460
+ {
461
+ "epoch": 1.948441247002398,
462
+ "grad_norm": 0.11931514739990234,
463
+ "learning_rate": 0.0003307032642302041,
464
+ "loss": 0.1933,
465
+ "step": 1625
466
+ },
467
+ {
468
+ "epoch": 1.9784172661870505,
469
+ "grad_norm": 0.12238750606775284,
470
+ "learning_rate": 0.0003143945548764371,
471
+ "loss": 0.195,
472
+ "step": 1650
473
+ },
474
+ {
475
+ "epoch": 2.0083932853717026,
476
+ "grad_norm": 0.13976894319057465,
477
+ "learning_rate": 0.00029831177631562306,
478
+ "loss": 0.1858,
479
+ "step": 1675
480
+ },
481
+ {
482
+ "epoch": 2.038369304556355,
483
+ "grad_norm": 0.12677723169326782,
484
+ "learning_rate": 0.0002824745055322128,
485
+ "loss": 0.1608,
486
+ "step": 1700
487
+ },
488
+ {
489
+ "epoch": 2.068345323741007,
490
+ "grad_norm": 0.13730570673942566,
491
+ "learning_rate": 0.0002669020206629217,
492
+ "loss": 0.1632,
493
+ "step": 1725
494
+ },
495
+ {
496
+ "epoch": 2.0983213429256593,
497
+ "grad_norm": 0.1422010064125061,
498
+ "learning_rate": 0.00025161327753015297,
499
+ "loss": 0.1619,
500
+ "step": 1750
501
+ },
502
+ {
503
+ "epoch": 2.128297362110312,
504
+ "grad_norm": 0.14339914917945862,
505
+ "learning_rate": 0.00023662688656775972,
506
+ "loss": 0.1607,
507
+ "step": 1775
508
+ },
509
+ {
510
+ "epoch": 2.158273381294964,
511
+ "grad_norm": 0.14340265095233917,
512
+ "learning_rate": 0.00022196109016723708,
513
+ "loss": 0.1611,
514
+ "step": 1800
515
+ },
516
+ {
517
+ "epoch": 2.1882494004796165,
518
+ "grad_norm": 0.14029954373836517,
519
+ "learning_rate": 0.0002076337404719203,
520
+ "loss": 0.1657,
521
+ "step": 1825
522
+ },
523
+ {
524
+ "epoch": 2.2182254196642686,
525
+ "grad_norm": 0.13461889326572418,
526
+ "learning_rate": 0.00019366227764621468,
527
+ "loss": 0.1584,
528
+ "step": 1850
529
+ },
530
+ {
531
+ "epoch": 2.2482014388489207,
532
+ "grad_norm": 0.1285167932510376,
533
+ "learning_rate": 0.00018006370864631643,
534
+ "loss": 0.1622,
535
+ "step": 1875
536
+ },
537
+ {
538
+ "epoch": 2.278177458033573,
539
+ "grad_norm": 0.13294167816638947,
540
+ "learning_rate": 0.0001668545865182589,
541
+ "loss": 0.1577,
542
+ "step": 1900
543
+ },
544
+ {
545
+ "epoch": 2.3081534772182253,
546
+ "grad_norm": 0.1409018188714981,
547
+ "learning_rate": 0.00015405099024848874,
548
+ "loss": 0.1637,
549
+ "step": 1925
550
+ },
551
+ {
552
+ "epoch": 2.338129496402878,
553
+ "grad_norm": 0.13364772498607635,
554
+ "learning_rate": 0.00014166850519149794,
555
+ "loss": 0.1579,
556
+ "step": 1950
557
+ },
558
+ {
559
+ "epoch": 2.36810551558753,
560
+ "grad_norm": 0.12694032490253448,
561
+ "learning_rate": 0.0001297222040983355,
562
+ "loss": 0.1597,
563
+ "step": 1975
564
+ },
565
+ {
566
+ "epoch": 2.3980815347721824,
567
+ "grad_norm": 0.12603726983070374,
568
+ "learning_rate": 0.0001182266287690924,
569
+ "loss": 0.1569,
570
+ "step": 2000
571
+ },
572
+ {
573
+ "epoch": 2.4280575539568345,
574
+ "grad_norm": 0.12928339838981628,
575
+ "learning_rate": 0.00010719577235169537,
576
+ "loss": 0.1592,
577
+ "step": 2025
578
+ },
579
+ {
580
+ "epoch": 2.4580335731414866,
581
+ "grad_norm": 0.12346290051937103,
582
+ "learning_rate": 9.664306230855341e-05,
583
+ "loss": 0.1596,
584
+ "step": 2050
585
+ },
586
+ {
587
+ "epoch": 2.488009592326139,
588
+ "grad_norm": 0.1295640915632248,
589
+ "learning_rate": 8.658134407179418e-05,
590
+ "loss": 0.1561,
591
+ "step": 2075
592
+ },
593
+ {
594
+ "epoch": 2.5179856115107913,
595
+ "grad_norm": 0.13177761435508728,
596
+ "learning_rate": 7.702286540698416e-05,
597
+ "loss": 0.1597,
598
+ "step": 2100
599
+ },
600
+ {
601
+ "epoch": 2.547961630695444,
602
+ "grad_norm": 0.13077791035175323,
603
+ "learning_rate": 6.797926150436617e-05,
604
+ "loss": 0.1586,
605
+ "step": 2125
606
+ },
607
+ {
608
+ "epoch": 2.577937649880096,
609
+ "grad_norm": 0.117102712392807,
610
+ "learning_rate": 5.9461540815764105e-05,
611
+ "loss": 0.1551,
612
+ "step": 2150
613
+ },
614
+ {
615
+ "epoch": 2.6079136690647484,
616
+ "grad_norm": 0.14442621171474457,
617
+ "learning_rate": 5.1480071654392335e-05,
618
+ "loss": 0.1543,
619
+ "step": 2175
620
+ },
621
+ {
622
+ "epoch": 2.6378896882494005,
623
+ "grad_norm": 0.13530191779136658,
624
+ "learning_rate": 4.404456957388309e-05,
625
+ "loss": 0.1547,
626
+ "step": 2200
627
+ },
628
+ {
629
+ "epoch": 2.6678657074340526,
630
+ "grad_norm": 0.11663298308849335,
631
+ "learning_rate": 3.716408554189493e-05,
632
+ "loss": 0.1567,
633
+ "step": 2225
634
+ },
635
+ {
636
+ "epoch": 2.697841726618705,
637
+ "grad_norm": 0.13254733383655548,
638
+ "learning_rate": 3.08469949226971e-05,
639
+ "loss": 0.1605,
640
+ "step": 2250
641
+ },
642
+ {
643
+ "epoch": 2.7278177458033572,
644
+ "grad_norm": 0.13481509685516357,
645
+ "learning_rate": 2.510098728214133e-05,
646
+ "loss": 0.1638,
647
+ "step": 2275
648
+ },
649
+ {
650
+ "epoch": 2.7577937649880093,
651
+ "grad_norm": 0.12630033493041992,
652
+ "learning_rate": 1.9933057027432144e-05,
653
+ "loss": 0.1544,
654
+ "step": 2300
655
+ },
656
+ {
657
+ "epoch": 2.787769784172662,
658
+ "grad_norm": 0.11446399986743927,
659
+ "learning_rate": 1.5349494893087514e-05,
660
+ "loss": 0.1519,
661
+ "step": 2325
662
+ },
663
+ {
664
+ "epoch": 2.8177458033573144,
665
+ "grad_norm": 0.1315765082836151,
666
+ "learning_rate": 1.1355880283455521e-05,
667
+ "loss": 0.1559,
668
+ "step": 2350
669
+ },
670
+ {
671
+ "epoch": 2.8477218225419665,
672
+ "grad_norm": 0.13976338505744934,
673
+ "learning_rate": 7.95707448110755e-06,
674
+ "loss": 0.1513,
675
+ "step": 2375
676
+ },
677
+ {
678
+ "epoch": 2.8776978417266186,
679
+ "grad_norm": 0.15874944627285004,
680
+ "learning_rate": 5.157214729374305e-06,
681
+ "loss": 0.1558,
682
+ "step": 2400
683
+ },
684
+ {
685
+ "epoch": 2.907673860911271,
686
+ "grad_norm": 0.12649980187416077,
687
+ "learning_rate": 2.959709196229954e-06,
688
+ "loss": 0.1576,
689
+ "step": 2425
690
+ },
691
+ {
692
+ "epoch": 2.937649880095923,
693
+ "grad_norm": 0.121933713555336,
694
+ "learning_rate": 1.3672328256518206e-06,
695
+ "loss": 0.1541,
696
+ "step": 2450
697
+ },
698
+ {
699
+ "epoch": 2.9676258992805753,
700
+ "grad_norm": 0.14589641988277435,
701
+ "learning_rate": 3.8172408150849435e-07,
702
+ "loss": 0.1534,
703
+ "step": 2475
704
+ },
705
+ {
706
+ "epoch": 2.997601918465228,
707
+ "grad_norm": 0.14007411897182465,
708
+ "learning_rate": 4.382587937445947e-09,
709
+ "loss": 0.1536,
710
+ "step": 2500
711
+ },
712
+ {
713
+ "epoch": 3.0,
714
+ "step": 2502,
715
+ "total_flos": 2.43882352705536e+18,
716
+ "train_loss": 0.217716321051359,
717
+ "train_runtime": 3304.2612,
718
+ "train_samples_per_second": 36.317,
719
+ "train_steps_per_second": 0.757
720
+ }
721
+ ],
722
+ "logging_steps": 25,
723
+ "max_steps": 2502,
724
+ "num_input_tokens_seen": 0,
725
+ "num_train_epochs": 3,
726
+ "save_steps": 0,
727
+ "stateful_callbacks": {
728
+ "TrainerControl": {
729
+ "args": {
730
+ "should_epoch_stop": false,
731
+ "should_evaluate": false,
732
+ "should_log": false,
733
+ "should_save": true,
734
+ "should_training_stop": true
735
+ },
736
+ "attributes": {}
737
+ }
738
+ },
739
+ "total_flos": 2.43882352705536e+18,
740
+ "train_batch_size": 48,
741
+ "trial_name": null,
742
+ "trial_params": null
743
+ }
nl_tasks/exps/run_ex33/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": false,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "v_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exps/run_ex33/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": "<unk>",
17
+ "unk_token": {
18
+ "content": "<unk>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ }
24
+ }
nl_tasks/exps/run_ex33/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exps/run_ex33/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exps/run_ex33/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "bos_token": "<s>",
32
+ "clean_up_tokenization_spaces": false,
33
+ "eos_token": "</s>",
34
+ "extra_special_tokens": {},
35
+ "legacy": false,
36
+ "model_max_length": 512,
37
+ "pad_token": "<unk>",
38
+ "padding_side": "right",
39
+ "sp_model_kwargs": {},
40
+ "tokenizer_class": "LlamaTokenizer",
41
+ "unk_token": "<unk>",
42
+ "use_default_system_prompt": false
43
+ }
nl_tasks/exps/run_ex33/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": true,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "v_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exps/run_ex33/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:50b23d0f92322496640c91535debe14bd2599b7c55d1720ecbeff4fd370d1495
3
+ size 33602915
nl_tasks/exps/run_ex33/trainer_state.json ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2502,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02997601918465228,
14
+ "grad_norm": 0.24060821533203125,
15
+ "learning_rate": 0.0009561752988047809,
16
+ "loss": 0.4891,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.05995203836930456,
21
+ "grad_norm": 0.2504737377166748,
22
+ "learning_rate": 0.001952191235059761,
23
+ "loss": 0.3442,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.08992805755395683,
28
+ "grad_norm": 0.3224866986274719,
29
+ "learning_rate": 0.0029482071713147415,
30
+ "loss": 0.3373,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 0.11990407673860912,
35
+ "grad_norm": 1.9630478620529175,
36
+ "learning_rate": 0.003944223107569721,
37
+ "loss": 0.367,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 0.1498800959232614,
42
+ "grad_norm": 2.872677803039551,
43
+ "learning_rate": 0.004940239043824702,
44
+ "loss": 0.3491,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 0.17985611510791366,
49
+ "grad_norm": 0.3211333751678467,
50
+ "learning_rate": 0.005936254980079682,
51
+ "loss": 0.3578,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 0.20983213429256595,
56
+ "grad_norm": 0.14077529311180115,
57
+ "learning_rate": 0.006932270916334662,
58
+ "loss": 0.3252,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 0.23980815347721823,
63
+ "grad_norm": 0.1346043050289154,
64
+ "learning_rate": 0.007928286852589641,
65
+ "loss": 0.3208,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 0.2697841726618705,
70
+ "grad_norm": 0.12695597112178802,
71
+ "learning_rate": 0.008924302788844622,
72
+ "loss": 0.3113,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 0.2997601918465228,
77
+ "grad_norm": 0.09868916869163513,
78
+ "learning_rate": 0.009920318725099601,
79
+ "loss": 0.3067,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 0.32973621103117506,
84
+ "grad_norm": 0.10491207242012024,
85
+ "learning_rate": 0.009997424229621528,
86
+ "loss": 0.2987,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 0.3597122302158273,
91
+ "grad_norm": 0.08867417275905609,
92
+ "learning_rate": 0.009988784753724706,
93
+ "loss": 0.2875,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 0.38968824940047964,
98
+ "grad_norm": 0.07948382943868637,
99
+ "learning_rate": 0.009974072610646543,
100
+ "loss": 0.2814,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 0.4196642685851319,
105
+ "grad_norm": 0.1311594694852829,
106
+ "learning_rate": 0.009953305708946503,
107
+ "loss": 0.2815,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 0.44964028776978415,
112
+ "grad_norm": 0.09352317452430725,
113
+ "learning_rate": 0.00992650932742266,
114
+ "loss": 0.274,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 0.47961630695443647,
119
+ "grad_norm": 0.06186804547905922,
120
+ "learning_rate": 0.009893716084340722,
121
+ "loss": 0.2739,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 0.5095923261390888,
126
+ "grad_norm": 0.057951804250478745,
127
+ "learning_rate": 0.009854965897729,
128
+ "loss": 0.2802,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 0.539568345323741,
133
+ "grad_norm": 0.05402874946594238,
134
+ "learning_rate": 0.009810305936787634,
135
+ "loss": 0.2619,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 0.5695443645083933,
140
+ "grad_norm": 0.06987974047660828,
141
+ "learning_rate": 0.009759790564471233,
142
+ "loss": 0.2596,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 0.5995203836930456,
147
+ "grad_norm": 0.055585265159606934,
148
+ "learning_rate": 0.009703481271314823,
149
+ "loss": 0.2663,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 0.6294964028776978,
154
+ "grad_norm": 0.06418672204017639,
155
+ "learning_rate": 0.009641446600583633,
156
+ "loss": 0.2537,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 0.6594724220623501,
161
+ "grad_norm": 0.05453021079301834,
162
+ "learning_rate": 0.009573762064837866,
163
+ "loss": 0.2506,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 0.6894484412470024,
168
+ "grad_norm": 0.04687987267971039,
169
+ "learning_rate": 0.009500510054013988,
170
+ "loss": 0.2581,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 0.7194244604316546,
175
+ "grad_norm": 0.05430283397436142,
176
+ "learning_rate": 0.009421779735134445,
177
+ "loss": 0.2524,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 0.749400479616307,
182
+ "grad_norm": 0.04550522193312645,
183
+ "learning_rate": 0.009337666943767862,
184
+ "loss": 0.2492,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 0.7793764988009593,
189
+ "grad_norm": 0.04663613811135292,
190
+ "learning_rate": 0.009248274067371885,
191
+ "loss": 0.2476,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 0.8093525179856115,
196
+ "grad_norm": 0.05199963226914406,
197
+ "learning_rate": 0.009153709920660622,
198
+ "loss": 0.2548,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 0.8393285371702638,
203
+ "grad_norm": 0.043098509311676025,
204
+ "learning_rate": 0.009054089613148438,
205
+ "loss": 0.243,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 0.8693045563549161,
210
+ "grad_norm": 0.043702222406864166,
211
+ "learning_rate": 0.008949534409031304,
212
+ "loss": 0.2392,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 0.8992805755395683,
217
+ "grad_norm": 0.04471327364444733,
218
+ "learning_rate": 0.008840171579576273,
219
+ "loss": 0.2447,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 0.9292565947242206,
224
+ "grad_norm": 0.04158543795347214,
225
+ "learning_rate": 0.008726134248198781,
226
+ "loss": 0.2431,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 0.9592326139088729,
231
+ "grad_norm": 0.03955095633864403,
232
+ "learning_rate": 0.008607561228416329,
233
+ "loss": 0.243,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 0.9892086330935251,
238
+ "grad_norm": 0.039677463471889496,
239
+ "learning_rate": 0.008484596854875806,
240
+ "loss": 0.2379,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 1.0191846522781776,
245
+ "grad_norm": 0.04437502473592758,
246
+ "learning_rate": 0.00835739080766014,
247
+ "loss": 0.2221,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 1.0491606714628297,
252
+ "grad_norm": 0.04531238228082657,
253
+ "learning_rate": 0.008226097930088161,
254
+ "loss": 0.2113,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 1.079136690647482,
259
+ "grad_norm": 0.04641604423522949,
260
+ "learning_rate": 0.00809087804022943,
261
+ "loss": 0.2146,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 1.1091127098321343,
266
+ "grad_norm": 0.0421479269862175,
267
+ "learning_rate": 0.007951895736363478,
268
+ "loss": 0.2209,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 1.1390887290167866,
273
+ "grad_norm": 0.036580126732587814,
274
+ "learning_rate": 0.007809320196620271,
275
+ "loss": 0.2105,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 1.169064748201439,
280
+ "grad_norm": 0.03694167360663414,
281
+ "learning_rate": 0.0076633249730458175,
282
+ "loss": 0.2148,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 1.1990407673860912,
287
+ "grad_norm": 0.03922872990369797,
288
+ "learning_rate": 0.00751408778034351,
289
+ "loss": 0.2171,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 1.2290167865707433,
294
+ "grad_norm": 0.042761627584695816,
295
+ "learning_rate": 0.0073617902795484755,
296
+ "loss": 0.2194,
297
+ "step": 1025
298
+ },
299
+ {
300
+ "epoch": 1.2589928057553956,
301
+ "grad_norm": 0.03898875042796135,
302
+ "learning_rate": 0.007206617856898149,
303
+ "loss": 0.2102,
304
+ "step": 1050
305
+ },
306
+ {
307
+ "epoch": 1.288968824940048,
308
+ "grad_norm": 0.039882808923721313,
309
+ "learning_rate": 0.007048759398168324,
310
+ "loss": 0.2055,
311
+ "step": 1075
312
+ },
313
+ {
314
+ "epoch": 1.3189448441247003,
315
+ "grad_norm": 0.03663821145892143,
316
+ "learning_rate": 0.006888407058749331,
317
+ "loss": 0.2045,
318
+ "step": 1100
319
+ },
320
+ {
321
+ "epoch": 1.3489208633093526,
322
+ "grad_norm": 0.04057300463318825,
323
+ "learning_rate": 0.006725756029742234,
324
+ "loss": 0.2094,
325
+ "step": 1125
326
+ },
327
+ {
328
+ "epoch": 1.3788968824940047,
329
+ "grad_norm": 0.03922704979777336,
330
+ "learning_rate": 0.006561004300359761,
331
+ "loss": 0.2031,
332
+ "step": 1150
333
+ },
334
+ {
335
+ "epoch": 1.4088729016786572,
336
+ "grad_norm": 0.03589184582233429,
337
+ "learning_rate": 0.0063943524169212005,
338
+ "loss": 0.2103,
339
+ "step": 1175
340
+ },
341
+ {
342
+ "epoch": 1.4388489208633093,
343
+ "grad_norm": 0.03473867475986481,
344
+ "learning_rate": 0.0062260032387346275,
345
+ "loss": 0.1965,
346
+ "step": 1200
347
+ },
348
+ {
349
+ "epoch": 1.4688249400479616,
350
+ "grad_norm": 0.041303601115942,
351
+ "learning_rate": 0.006056161691163602,
352
+ "loss": 0.2037,
353
+ "step": 1225
354
+ },
355
+ {
356
+ "epoch": 1.498800959232614,
357
+ "grad_norm": 0.04580175131559372,
358
+ "learning_rate": 0.00588503451617894,
359
+ "loss": 0.2067,
360
+ "step": 1250
361
+ },
362
+ {
363
+ "epoch": 1.5287769784172662,
364
+ "grad_norm": 0.03737177699804306,
365
+ "learning_rate": 0.005712830020699193,
366
+ "loss": 0.2058,
367
+ "step": 1275
368
+ },
369
+ {
370
+ "epoch": 1.5587529976019185,
371
+ "grad_norm": 0.03482063487172127,
372
+ "learning_rate": 0.005539757823026172,
373
+ "loss": 0.1975,
374
+ "step": 1300
375
+ },
376
+ {
377
+ "epoch": 1.5887290167865706,
378
+ "grad_norm": 0.04050496220588684,
379
+ "learning_rate": 0.005366028597684173,
380
+ "loss": 0.2029,
381
+ "step": 1325
382
+ },
383
+ {
384
+ "epoch": 1.6187050359712232,
385
+ "grad_norm": 0.031009122729301453,
386
+ "learning_rate": 0.005191853818973505,
387
+ "loss": 0.1978,
388
+ "step": 1350
389
+ },
390
+ {
391
+ "epoch": 1.6486810551558753,
392
+ "grad_norm": 0.03680524602532387,
393
+ "learning_rate": 0.005017445503550471,
394
+ "loss": 0.2042,
395
+ "step": 1375
396
+ },
397
+ {
398
+ "epoch": 1.6786570743405276,
399
+ "grad_norm": 0.03796203434467316,
400
+ "learning_rate": 0.004843015952347159,
401
+ "loss": 0.2002,
402
+ "step": 1400
403
+ },
404
+ {
405
+ "epoch": 1.70863309352518,
406
+ "grad_norm": 0.03277565911412239,
407
+ "learning_rate": 0.0046687774921452116,
408
+ "loss": 0.1982,
409
+ "step": 1425
410
+ },
411
+ {
412
+ "epoch": 1.738609112709832,
413
+ "grad_norm": 0.03661928325891495,
414
+ "learning_rate": 0.004494942217118105,
415
+ "loss": 0.2034,
416
+ "step": 1450
417
+ },
418
+ {
419
+ "epoch": 1.7685851318944845,
420
+ "grad_norm": 0.02970374934375286,
421
+ "learning_rate": 0.00432172173065659,
422
+ "loss": 0.1973,
423
+ "step": 1475
424
+ },
425
+ {
426
+ "epoch": 1.7985611510791366,
427
+ "grad_norm": 0.03302035480737686,
428
+ "learning_rate": 0.004149326887791542,
429
+ "loss": 0.1989,
430
+ "step": 1500
431
+ },
432
+ {
433
+ "epoch": 1.828537170263789,
434
+ "grad_norm": 0.03104039840400219,
435
+ "learning_rate": 0.003977967538527754,
436
+ "loss": 0.194,
437
+ "step": 1525
438
+ },
439
+ {
440
+ "epoch": 1.8585131894484412,
441
+ "grad_norm": 0.037008076906204224,
442
+ "learning_rate": 0.003807852272401132,
443
+ "loss": 0.2002,
444
+ "step": 1550
445
+ },
446
+ {
447
+ "epoch": 1.8884892086330936,
448
+ "grad_norm": 0.030725648626685143,
449
+ "learning_rate": 0.0036391881645701854,
450
+ "loss": 0.1992,
451
+ "step": 1575
452
+ },
453
+ {
454
+ "epoch": 1.9184652278177459,
455
+ "grad_norm": 0.03207903727889061,
456
+ "learning_rate": 0.003472180523750933,
457
+ "loss": 0.2046,
458
+ "step": 1600
459
+ },
460
+ {
461
+ "epoch": 1.948441247002398,
462
+ "grad_norm": 0.0310438871383667,
463
+ "learning_rate": 0.0033070326423020407,
464
+ "loss": 0.1941,
465
+ "step": 1625
466
+ },
467
+ {
468
+ "epoch": 1.9784172661870505,
469
+ "grad_norm": 0.032687630504369736,
470
+ "learning_rate": 0.0031439455487643707,
471
+ "loss": 0.1962,
472
+ "step": 1650
473
+ },
474
+ {
475
+ "epoch": 2.0083932853717026,
476
+ "grad_norm": 0.0336097776889801,
477
+ "learning_rate": 0.0029831177631562305,
478
+ "loss": 0.1852,
479
+ "step": 1675
480
+ },
481
+ {
482
+ "epoch": 2.038369304556355,
483
+ "grad_norm": 0.034838490188121796,
484
+ "learning_rate": 0.0028247450553221276,
485
+ "loss": 0.1593,
486
+ "step": 1700
487
+ },
488
+ {
489
+ "epoch": 2.068345323741007,
490
+ "grad_norm": 0.03779308870434761,
491
+ "learning_rate": 0.002669020206629217,
492
+ "loss": 0.1617,
493
+ "step": 1725
494
+ },
495
+ {
496
+ "epoch": 2.0983213429256593,
497
+ "grad_norm": 0.03595089539885521,
498
+ "learning_rate": 0.0025161327753015295,
499
+ "loss": 0.1602,
500
+ "step": 1750
501
+ },
502
+ {
503
+ "epoch": 2.128297362110312,
504
+ "grad_norm": 0.03848210349678993,
505
+ "learning_rate": 0.0023662688656775972,
506
+ "loss": 0.159,
507
+ "step": 1775
508
+ },
509
+ {
510
+ "epoch": 2.158273381294964,
511
+ "grad_norm": 0.04832014814019203,
512
+ "learning_rate": 0.0022196109016723708,
513
+ "loss": 0.1593,
514
+ "step": 1800
515
+ },
516
+ {
517
+ "epoch": 2.1882494004796165,
518
+ "grad_norm": 0.03901856020092964,
519
+ "learning_rate": 0.0020763374047192027,
520
+ "loss": 0.1641,
521
+ "step": 1825
522
+ },
523
+ {
524
+ "epoch": 2.2182254196642686,
525
+ "grad_norm": 0.03556321561336517,
526
+ "learning_rate": 0.001936622776462147,
527
+ "loss": 0.1573,
528
+ "step": 1850
529
+ },
530
+ {
531
+ "epoch": 2.2482014388489207,
532
+ "grad_norm": 0.033188410103321075,
533
+ "learning_rate": 0.0018006370864631644,
534
+ "loss": 0.1611,
535
+ "step": 1875
536
+ },
537
+ {
538
+ "epoch": 2.278177458033573,
539
+ "grad_norm": 0.0357639417052269,
540
+ "learning_rate": 0.0016685458651825892,
541
+ "loss": 0.1565,
542
+ "step": 1900
543
+ },
544
+ {
545
+ "epoch": 2.3081534772182253,
546
+ "grad_norm": 0.03594391047954559,
547
+ "learning_rate": 0.0015405099024848874,
548
+ "loss": 0.1629,
549
+ "step": 1925
550
+ },
551
+ {
552
+ "epoch": 2.338129496402878,
553
+ "grad_norm": 0.03396276384592056,
554
+ "learning_rate": 0.0014166850519149793,
555
+ "loss": 0.1566,
556
+ "step": 1950
557
+ },
558
+ {
559
+ "epoch": 2.36810551558753,
560
+ "grad_norm": 0.03651060536503792,
561
+ "learning_rate": 0.0012972220409833552,
562
+ "loss": 0.1587,
563
+ "step": 1975
564
+ },
565
+ {
566
+ "epoch": 2.3980815347721824,
567
+ "grad_norm": 0.03662113845348358,
568
+ "learning_rate": 0.001182266287690924,
569
+ "loss": 0.155,
570
+ "step": 2000
571
+ },
572
+ {
573
+ "epoch": 2.4280575539568345,
574
+ "grad_norm": 0.03555059805512428,
575
+ "learning_rate": 0.0010719577235169537,
576
+ "loss": 0.1587,
577
+ "step": 2025
578
+ },
579
+ {
580
+ "epoch": 2.4580335731414866,
581
+ "grad_norm": 0.03360700234770775,
582
+ "learning_rate": 0.0009664306230855341,
583
+ "loss": 0.1585,
584
+ "step": 2050
585
+ },
586
+ {
587
+ "epoch": 2.488009592326139,
588
+ "grad_norm": 0.033780504018068314,
589
+ "learning_rate": 0.0008658134407179418,
590
+ "loss": 0.1553,
591
+ "step": 2075
592
+ },
593
+ {
594
+ "epoch": 2.5179856115107913,
595
+ "grad_norm": 0.03432834520936012,
596
+ "learning_rate": 0.0007702286540698417,
597
+ "loss": 0.158,
598
+ "step": 2100
599
+ },
600
+ {
601
+ "epoch": 2.547961630695444,
602
+ "grad_norm": 0.035472046583890915,
603
+ "learning_rate": 0.0006797926150436618,
604
+ "loss": 0.1573,
605
+ "step": 2125
606
+ },
607
+ {
608
+ "epoch": 2.577937649880096,
609
+ "grad_norm": 0.03242143243551254,
610
+ "learning_rate": 0.000594615408157641,
611
+ "loss": 0.1537,
612
+ "step": 2150
613
+ },
614
+ {
615
+ "epoch": 2.6079136690647484,
616
+ "grad_norm": 0.03848033398389816,
617
+ "learning_rate": 0.0005148007165439233,
618
+ "loss": 0.1532,
619
+ "step": 2175
620
+ },
621
+ {
622
+ "epoch": 2.6378896882494005,
623
+ "grad_norm": 0.03433902934193611,
624
+ "learning_rate": 0.0004404456957388309,
625
+ "loss": 0.1529,
626
+ "step": 2200
627
+ },
628
+ {
629
+ "epoch": 2.6678657074340526,
630
+ "grad_norm": 0.031391434371471405,
631
+ "learning_rate": 0.00037164085541894934,
632
+ "loss": 0.1546,
633
+ "step": 2225
634
+ },
635
+ {
636
+ "epoch": 2.697841726618705,
637
+ "grad_norm": 0.04350714385509491,
638
+ "learning_rate": 0.000308469949226971,
639
+ "loss": 0.1584,
640
+ "step": 2250
641
+ },
642
+ {
643
+ "epoch": 2.7278177458033572,
644
+ "grad_norm": 0.03302815929055214,
645
+ "learning_rate": 0.0002510098728214133,
646
+ "loss": 0.1614,
647
+ "step": 2275
648
+ },
649
+ {
650
+ "epoch": 2.7577937649880093,
651
+ "grad_norm": 0.03371904045343399,
652
+ "learning_rate": 0.00019933057027432145,
653
+ "loss": 0.1517,
654
+ "step": 2300
655
+ },
656
+ {
657
+ "epoch": 2.787769784172662,
658
+ "grad_norm": 0.031399570405483246,
659
+ "learning_rate": 0.00015349494893087513,
660
+ "loss": 0.1496,
661
+ "step": 2325
662
+ },
663
+ {
664
+ "epoch": 2.8177458033573144,
665
+ "grad_norm": 0.03590654581785202,
666
+ "learning_rate": 0.00011355880283455522,
667
+ "loss": 0.1535,
668
+ "step": 2350
669
+ },
670
+ {
671
+ "epoch": 2.8477218225419665,
672
+ "grad_norm": 0.03726603463292122,
673
+ "learning_rate": 7.957074481107551e-05,
674
+ "loss": 0.1491,
675
+ "step": 2375
676
+ },
677
+ {
678
+ "epoch": 2.8776978417266186,
679
+ "grad_norm": 0.041672345250844955,
680
+ "learning_rate": 5.1572147293743046e-05,
681
+ "loss": 0.1537,
682
+ "step": 2400
683
+ },
684
+ {
685
+ "epoch": 2.907673860911271,
686
+ "grad_norm": 0.03395906835794449,
687
+ "learning_rate": 2.959709196229954e-05,
688
+ "loss": 0.1544,
689
+ "step": 2425
690
+ },
691
+ {
692
+ "epoch": 2.937649880095923,
693
+ "grad_norm": 0.030766665935516357,
694
+ "learning_rate": 1.3672328256518207e-05,
695
+ "loss": 0.1513,
696
+ "step": 2450
697
+ },
698
+ {
699
+ "epoch": 2.9676258992805753,
700
+ "grad_norm": 0.03894634544849396,
701
+ "learning_rate": 3.817240815084944e-06,
702
+ "loss": 0.1504,
703
+ "step": 2475
704
+ },
705
+ {
706
+ "epoch": 2.997601918465228,
707
+ "grad_norm": 0.03591805323958397,
708
+ "learning_rate": 4.382587937445948e-08,
709
+ "loss": 0.1516,
710
+ "step": 2500
711
+ },
712
+ {
713
+ "epoch": 3.0,
714
+ "step": 2502,
715
+ "total_flos": 2.43882352705536e+18,
716
+ "train_loss": 0.21604387271556733,
717
+ "train_runtime": 3302.176,
718
+ "train_samples_per_second": 36.34,
719
+ "train_steps_per_second": 0.758
720
+ }
721
+ ],
722
+ "logging_steps": 25,
723
+ "max_steps": 2502,
724
+ "num_input_tokens_seen": 0,
725
+ "num_train_epochs": 3,
726
+ "save_steps": 0,
727
+ "stateful_callbacks": {
728
+ "TrainerControl": {
729
+ "args": {
730
+ "should_epoch_stop": false,
731
+ "should_evaluate": false,
732
+ "should_log": false,
733
+ "should_save": true,
734
+ "should_training_stop": true
735
+ },
736
+ "attributes": {}
737
+ }
738
+ },
739
+ "total_flos": 2.43882352705536e+18,
740
+ "train_batch_size": 48,
741
+ "trial_name": null,
742
+ "trial_params": null
743
+ }
nl_tasks/exps/run_ex34/gsm8k.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ gsm8k length==== 1319, gsm8k acc %====, 52.388172858225936
nl_tasks/exps/run_ex34/math.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ math length==== 5000, math acc %====, 7.84
nl_tasks/exps/run_ex34/trainer_state.json ADDED
@@ -0,0 +1,743 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 2502,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.02997601918465228,
14
+ "grad_norm": 0.2532191574573517,
15
+ "learning_rate": 0.0019123505976095618,
16
+ "loss": 0.4625,
17
+ "step": 25
18
+ },
19
+ {
20
+ "epoch": 0.05995203836930456,
21
+ "grad_norm": 5.414860725402832,
22
+ "learning_rate": 0.003904382470119522,
23
+ "loss": 0.478,
24
+ "step": 50
25
+ },
26
+ {
27
+ "epoch": 0.08992805755395683,
28
+ "grad_norm": 0.6449404954910278,
29
+ "learning_rate": 0.005896414342629483,
30
+ "loss": 0.6016,
31
+ "step": 75
32
+ },
33
+ {
34
+ "epoch": 0.11990407673860912,
35
+ "grad_norm": 1.4888554811477661,
36
+ "learning_rate": 0.007888446215139441,
37
+ "loss": 0.4044,
38
+ "step": 100
39
+ },
40
+ {
41
+ "epoch": 0.1498800959232614,
42
+ "grad_norm": 0.23530787229537964,
43
+ "learning_rate": 0.009880478087649403,
44
+ "loss": 0.3748,
45
+ "step": 125
46
+ },
47
+ {
48
+ "epoch": 0.17985611510791366,
49
+ "grad_norm": 0.10670146346092224,
50
+ "learning_rate": 0.011872509960159363,
51
+ "loss": 0.3298,
52
+ "step": 150
53
+ },
54
+ {
55
+ "epoch": 0.20983213429256595,
56
+ "grad_norm": 0.1089276447892189,
57
+ "learning_rate": 0.013864541832669323,
58
+ "loss": 0.3172,
59
+ "step": 175
60
+ },
61
+ {
62
+ "epoch": 0.23980815347721823,
63
+ "grad_norm": 0.09628577530384064,
64
+ "learning_rate": 0.015856573705179282,
65
+ "loss": 0.3173,
66
+ "step": 200
67
+ },
68
+ {
69
+ "epoch": 0.2697841726618705,
70
+ "grad_norm": 0.07994027435779572,
71
+ "learning_rate": 0.017848605577689244,
72
+ "loss": 0.3081,
73
+ "step": 225
74
+ },
75
+ {
76
+ "epoch": 0.2997601918465228,
77
+ "grad_norm": 0.06300719082355499,
78
+ "learning_rate": 0.019840637450199202,
79
+ "loss": 0.3046,
80
+ "step": 250
81
+ },
82
+ {
83
+ "epoch": 0.32973621103117506,
84
+ "grad_norm": 0.06899631768465042,
85
+ "learning_rate": 0.019994848459243056,
86
+ "loss": 0.2986,
87
+ "step": 275
88
+ },
89
+ {
90
+ "epoch": 0.3597122302158273,
91
+ "grad_norm": 0.08354010432958603,
92
+ "learning_rate": 0.019977569507449413,
93
+ "loss": 0.2875,
94
+ "step": 300
95
+ },
96
+ {
97
+ "epoch": 0.38968824940047964,
98
+ "grad_norm": 0.0717054083943367,
99
+ "learning_rate": 0.019948145221293085,
100
+ "loss": 0.2824,
101
+ "step": 325
102
+ },
103
+ {
104
+ "epoch": 0.4196642685851319,
105
+ "grad_norm": 0.04974915832281113,
106
+ "learning_rate": 0.019906611417893006,
107
+ "loss": 0.282,
108
+ "step": 350
109
+ },
110
+ {
111
+ "epoch": 0.44964028776978415,
112
+ "grad_norm": 0.0749383196234703,
113
+ "learning_rate": 0.01985301865484532,
114
+ "loss": 0.2738,
115
+ "step": 375
116
+ },
117
+ {
118
+ "epoch": 0.47961630695443647,
119
+ "grad_norm": 0.04245521500706673,
120
+ "learning_rate": 0.019787432168681444,
121
+ "loss": 0.2727,
122
+ "step": 400
123
+ },
124
+ {
125
+ "epoch": 0.5095923261390888,
126
+ "grad_norm": 0.04584546759724617,
127
+ "learning_rate": 0.019709931795458,
128
+ "loss": 0.2818,
129
+ "step": 425
130
+ },
131
+ {
132
+ "epoch": 0.539568345323741,
133
+ "grad_norm": 0.043414946645498276,
134
+ "learning_rate": 0.019620611873575267,
135
+ "loss": 0.2613,
136
+ "step": 450
137
+ },
138
+ {
139
+ "epoch": 0.5695443645083933,
140
+ "grad_norm": 0.058648571372032166,
141
+ "learning_rate": 0.019519581128942465,
142
+ "loss": 0.2609,
143
+ "step": 475
144
+ },
145
+ {
146
+ "epoch": 0.5995203836930456,
147
+ "grad_norm": 0.04752543196082115,
148
+ "learning_rate": 0.019406962542629646,
149
+ "loss": 0.2677,
150
+ "step": 500
151
+ },
152
+ {
153
+ "epoch": 0.6294964028776978,
154
+ "grad_norm": 0.048361025750637054,
155
+ "learning_rate": 0.019282893201167266,
156
+ "loss": 0.2548,
157
+ "step": 525
158
+ },
159
+ {
160
+ "epoch": 0.6594724220623501,
161
+ "grad_norm": 0.05573540925979614,
162
+ "learning_rate": 0.01914752412967573,
163
+ "loss": 0.2517,
164
+ "step": 550
165
+ },
166
+ {
167
+ "epoch": 0.6894484412470024,
168
+ "grad_norm": 0.03862696886062622,
169
+ "learning_rate": 0.019001020108027976,
170
+ "loss": 0.2583,
171
+ "step": 575
172
+ },
173
+ {
174
+ "epoch": 0.7194244604316546,
175
+ "grad_norm": 0.0389208160340786,
176
+ "learning_rate": 0.01884355947026889,
177
+ "loss": 0.253,
178
+ "step": 600
179
+ },
180
+ {
181
+ "epoch": 0.749400479616307,
182
+ "grad_norm": 0.03260328620672226,
183
+ "learning_rate": 0.018675333887535724,
184
+ "loss": 0.2507,
185
+ "step": 625
186
+ },
187
+ {
188
+ "epoch": 0.7793764988009593,
189
+ "grad_norm": 0.038773685693740845,
190
+ "learning_rate": 0.01849654813474377,
191
+ "loss": 0.2481,
192
+ "step": 650
193
+ },
194
+ {
195
+ "epoch": 0.8093525179856115,
196
+ "grad_norm": 0.03589491918683052,
197
+ "learning_rate": 0.018307419841321244,
198
+ "loss": 0.2558,
199
+ "step": 675
200
+ },
201
+ {
202
+ "epoch": 0.8393285371702638,
203
+ "grad_norm": 0.040207505226135254,
204
+ "learning_rate": 0.018108179226296876,
205
+ "loss": 0.2432,
206
+ "step": 700
207
+ },
208
+ {
209
+ "epoch": 0.8693045563549161,
210
+ "grad_norm": 0.03414730727672577,
211
+ "learning_rate": 0.017899068818062608,
212
+ "loss": 0.2397,
213
+ "step": 725
214
+ },
215
+ {
216
+ "epoch": 0.8992805755395683,
217
+ "grad_norm": 0.0384533517062664,
218
+ "learning_rate": 0.017680343159152546,
219
+ "loss": 0.2469,
220
+ "step": 750
221
+ },
222
+ {
223
+ "epoch": 0.9292565947242206,
224
+ "grad_norm": 0.029624082148075104,
225
+ "learning_rate": 0.017452268496397562,
226
+ "loss": 0.2448,
227
+ "step": 775
228
+ },
229
+ {
230
+ "epoch": 0.9592326139088729,
231
+ "grad_norm": 0.028060954064130783,
232
+ "learning_rate": 0.017215122456832658,
233
+ "loss": 0.243,
234
+ "step": 800
235
+ },
236
+ {
237
+ "epoch": 0.9892086330935251,
238
+ "grad_norm": 0.033197712153196335,
239
+ "learning_rate": 0.016969193709751612,
240
+ "loss": 0.2393,
241
+ "step": 825
242
+ },
243
+ {
244
+ "epoch": 1.0191846522781776,
245
+ "grad_norm": 0.03793744370341301,
246
+ "learning_rate": 0.01671478161532028,
247
+ "loss": 0.2231,
248
+ "step": 850
249
+ },
250
+ {
251
+ "epoch": 1.0491606714628297,
252
+ "grad_norm": 0.03373177349567413,
253
+ "learning_rate": 0.016452195860176322,
254
+ "loss": 0.2136,
255
+ "step": 875
256
+ },
257
+ {
258
+ "epoch": 1.079136690647482,
259
+ "grad_norm": 0.029341645538806915,
260
+ "learning_rate": 0.01618175608045886,
261
+ "loss": 0.2158,
262
+ "step": 900
263
+ },
264
+ {
265
+ "epoch": 1.1091127098321343,
266
+ "grad_norm": 0.03401786461472511,
267
+ "learning_rate": 0.015903791472726955,
268
+ "loss": 0.2223,
269
+ "step": 925
270
+ },
271
+ {
272
+ "epoch": 1.1390887290167866,
273
+ "grad_norm": 0.02708265371620655,
274
+ "learning_rate": 0.015618640393240542,
275
+ "loss": 0.2121,
276
+ "step": 950
277
+ },
278
+ {
279
+ "epoch": 1.169064748201439,
280
+ "grad_norm": 0.029649930074810982,
281
+ "learning_rate": 0.015326649946091635,
282
+ "loss": 0.2168,
283
+ "step": 975
284
+ },
285
+ {
286
+ "epoch": 1.1990407673860912,
287
+ "grad_norm": 0.033625248819589615,
288
+ "learning_rate": 0.01502817556068702,
289
+ "loss": 0.2184,
290
+ "step": 1000
291
+ },
292
+ {
293
+ "epoch": 1.2290167865707433,
294
+ "grad_norm": 0.036567322909832,
295
+ "learning_rate": 0.014723580559096951,
296
+ "loss": 0.2205,
297
+ "step": 1025
298
+ },
299
+ {
300
+ "epoch": 1.2589928057553956,
301
+ "grad_norm": 0.029653819277882576,
302
+ "learning_rate": 0.014413235713796298,
303
+ "loss": 0.2122,
304
+ "step": 1050
305
+ },
306
+ {
307
+ "epoch": 1.288968824940048,
308
+ "grad_norm": 0.0300185214728117,
309
+ "learning_rate": 0.014097518796336648,
310
+ "loss": 0.2058,
311
+ "step": 1075
312
+ },
313
+ {
314
+ "epoch": 1.3189448441247003,
315
+ "grad_norm": 0.027810001745820045,
316
+ "learning_rate": 0.013776814117498662,
317
+ "loss": 0.207,
318
+ "step": 1100
319
+ },
320
+ {
321
+ "epoch": 1.3489208633093526,
322
+ "grad_norm": 0.027427321299910545,
323
+ "learning_rate": 0.013451512059484468,
324
+ "loss": 0.2108,
325
+ "step": 1125
326
+ },
327
+ {
328
+ "epoch": 1.3788968824940047,
329
+ "grad_norm": 0.028375081717967987,
330
+ "learning_rate": 0.013122008600719522,
331
+ "loss": 0.2048,
332
+ "step": 1150
333
+ },
334
+ {
335
+ "epoch": 1.4088729016786572,
336
+ "grad_norm": 0.029488051310181618,
337
+ "learning_rate": 0.012788704833842401,
338
+ "loss": 0.2121,
339
+ "step": 1175
340
+ },
341
+ {
342
+ "epoch": 1.4388489208633093,
343
+ "grad_norm": 0.029031606391072273,
344
+ "learning_rate": 0.012452006477469255,
345
+ "loss": 0.1975,
346
+ "step": 1200
347
+ },
348
+ {
349
+ "epoch": 1.4688249400479616,
350
+ "grad_norm": 0.029272671788930893,
351
+ "learning_rate": 0.012112323382327204,
352
+ "loss": 0.2055,
353
+ "step": 1225
354
+ },
355
+ {
356
+ "epoch": 1.498800959232614,
357
+ "grad_norm": 0.027180878445506096,
358
+ "learning_rate": 0.01177006903235788,
359
+ "loss": 0.2082,
360
+ "step": 1250
361
+ },
362
+ {
363
+ "epoch": 1.5287769784172662,
364
+ "grad_norm": 0.02914930321276188,
365
+ "learning_rate": 0.011425660041398385,
366
+ "loss": 0.2067,
367
+ "step": 1275
368
+ },
369
+ {
370
+ "epoch": 1.5587529976019185,
371
+ "grad_norm": 0.024756262078881264,
372
+ "learning_rate": 0.011079515646052343,
373
+ "loss": 0.1988,
374
+ "step": 1300
375
+ },
376
+ {
377
+ "epoch": 1.5887290167865706,
378
+ "grad_norm": 0.0281531922519207,
379
+ "learning_rate": 0.010732057195368346,
380
+ "loss": 0.2043,
381
+ "step": 1325
382
+ },
383
+ {
384
+ "epoch": 1.6187050359712232,
385
+ "grad_norm": 0.02760651335120201,
386
+ "learning_rate": 0.01038370763794701,
387
+ "loss": 0.2008,
388
+ "step": 1350
389
+ },
390
+ {
391
+ "epoch": 1.6486810551558753,
392
+ "grad_norm": 0.026089100167155266,
393
+ "learning_rate": 0.010034891007100942,
394
+ "loss": 0.2065,
395
+ "step": 1375
396
+ },
397
+ {
398
+ "epoch": 1.6786570743405276,
399
+ "grad_norm": 0.030709104612469673,
400
+ "learning_rate": 0.009686031904694317,
401
+ "loss": 0.2022,
402
+ "step": 1400
403
+ },
404
+ {
405
+ "epoch": 1.70863309352518,
406
+ "grad_norm": 0.028219345957040787,
407
+ "learning_rate": 0.009337554984290423,
408
+ "loss": 0.1998,
409
+ "step": 1425
410
+ },
411
+ {
412
+ "epoch": 1.738609112709832,
413
+ "grad_norm": 0.02828747034072876,
414
+ "learning_rate": 0.00898988443423621,
415
+ "loss": 0.2048,
416
+ "step": 1450
417
+ },
418
+ {
419
+ "epoch": 1.7685851318944845,
420
+ "grad_norm": 0.02299325354397297,
421
+ "learning_rate": 0.00864344346131318,
422
+ "loss": 0.1983,
423
+ "step": 1475
424
+ },
425
+ {
426
+ "epoch": 1.7985611510791366,
427
+ "grad_norm": 0.02619764395058155,
428
+ "learning_rate": 0.008298653775583083,
429
+ "loss": 0.2013,
430
+ "step": 1500
431
+ },
432
+ {
433
+ "epoch": 1.828537170263789,
434
+ "grad_norm": 0.021840449422597885,
435
+ "learning_rate": 0.007955935077055509,
436
+ "loss": 0.1956,
437
+ "step": 1525
438
+ },
439
+ {
440
+ "epoch": 1.8585131894484412,
441
+ "grad_norm": 0.02665482647716999,
442
+ "learning_rate": 0.007615704544802264,
443
+ "loss": 0.2022,
444
+ "step": 1550
445
+ },
446
+ {
447
+ "epoch": 1.8884892086330936,
448
+ "grad_norm": 0.023482663556933403,
449
+ "learning_rate": 0.007278376329140371,
450
+ "loss": 0.2017,
451
+ "step": 1575
452
+ },
453
+ {
454
+ "epoch": 1.9184652278177459,
455
+ "grad_norm": 0.02401842176914215,
456
+ "learning_rate": 0.006944361047501866,
457
+ "loss": 0.2059,
458
+ "step": 1600
459
+ },
460
+ {
461
+ "epoch": 1.948441247002398,
462
+ "grad_norm": 0.022366248071193695,
463
+ "learning_rate": 0.006614065284604081,
464
+ "loss": 0.1954,
465
+ "step": 1625
466
+ },
467
+ {
468
+ "epoch": 1.9784172661870505,
469
+ "grad_norm": 0.023342736065387726,
470
+ "learning_rate": 0.0062878910975287415,
471
+ "loss": 0.1973,
472
+ "step": 1650
473
+ },
474
+ {
475
+ "epoch": 2.0083932853717026,
476
+ "grad_norm": 0.02544998750090599,
477
+ "learning_rate": 0.005966235526312461,
478
+ "loss": 0.1869,
479
+ "step": 1675
480
+ },
481
+ {
482
+ "epoch": 2.038369304556355,
483
+ "grad_norm": 0.027767734602093697,
484
+ "learning_rate": 0.005649490110644255,
485
+ "loss": 0.1598,
486
+ "step": 1700
487
+ },
488
+ {
489
+ "epoch": 2.068345323741007,
490
+ "grad_norm": 0.0325852669775486,
491
+ "learning_rate": 0.005338040413258434,
492
+ "loss": 0.1628,
493
+ "step": 1725
494
+ },
495
+ {
496
+ "epoch": 2.0983213429256593,
497
+ "grad_norm": 0.027936723083257675,
498
+ "learning_rate": 0.005032265550603059,
499
+ "loss": 0.161,
500
+ "step": 1750
501
+ },
502
+ {
503
+ "epoch": 2.128297362110312,
504
+ "grad_norm": 0.027292126789689064,
505
+ "learning_rate": 0.0047325377313551945,
506
+ "loss": 0.1598,
507
+ "step": 1775
508
+ },
509
+ {
510
+ "epoch": 2.158273381294964,
511
+ "grad_norm": 0.030712289735674858,
512
+ "learning_rate": 0.0044392218033447416,
513
+ "loss": 0.1614,
514
+ "step": 1800
515
+ },
516
+ {
517
+ "epoch": 2.1882494004796165,
518
+ "grad_norm": 0.02516656182706356,
519
+ "learning_rate": 0.0041526748094384055,
520
+ "loss": 0.165,
521
+ "step": 1825
522
+ },
523
+ {
524
+ "epoch": 2.2182254196642686,
525
+ "grad_norm": 0.028255818411707878,
526
+ "learning_rate": 0.003873245552924294,
527
+ "loss": 0.1584,
528
+ "step": 1850
529
+ },
530
+ {
531
+ "epoch": 2.2482014388489207,
532
+ "grad_norm": 0.028185885399580002,
533
+ "learning_rate": 0.003601274172926329,
534
+ "loss": 0.1619,
535
+ "step": 1875
536
+ },
537
+ {
538
+ "epoch": 2.278177458033573,
539
+ "grad_norm": 0.029240386560559273,
540
+ "learning_rate": 0.0033370917303651784,
541
+ "loss": 0.1575,
542
+ "step": 1900
543
+ },
544
+ {
545
+ "epoch": 2.3081534772182253,
546
+ "grad_norm": 0.026783913373947144,
547
+ "learning_rate": 0.003081019804969775,
548
+ "loss": 0.1636,
549
+ "step": 1925
550
+ },
551
+ {
552
+ "epoch": 2.338129496402878,
553
+ "grad_norm": 0.026911884546279907,
554
+ "learning_rate": 0.0028333701038299585,
555
+ "loss": 0.1583,
556
+ "step": 1950
557
+ },
558
+ {
559
+ "epoch": 2.36810551558753,
560
+ "grad_norm": 0.02731228433549404,
561
+ "learning_rate": 0.0025944440819667103,
562
+ "loss": 0.1596,
563
+ "step": 1975
564
+ },
565
+ {
566
+ "epoch": 2.3980815347721824,
567
+ "grad_norm": 0.026634665206074715,
568
+ "learning_rate": 0.002364532575381848,
569
+ "loss": 0.1559,
570
+ "step": 2000
571
+ },
572
+ {
573
+ "epoch": 2.4280575539568345,
574
+ "grad_norm": 0.02517741546034813,
575
+ "learning_rate": 0.0021439154470339074,
576
+ "loss": 0.1598,
577
+ "step": 2025
578
+ },
579
+ {
580
+ "epoch": 2.4580335731414866,
581
+ "grad_norm": 0.028662823140621185,
582
+ "learning_rate": 0.0019328612461710682,
583
+ "loss": 0.1592,
584
+ "step": 2050
585
+ },
586
+ {
587
+ "epoch": 2.488009592326139,
588
+ "grad_norm": 0.024708494544029236,
589
+ "learning_rate": 0.0017316268814358837,
590
+ "loss": 0.1558,
591
+ "step": 2075
592
+ },
593
+ {
594
+ "epoch": 2.5179856115107913,
595
+ "grad_norm": 0.023627523332834244,
596
+ "learning_rate": 0.0015404573081396833,
597
+ "loss": 0.1581,
598
+ "step": 2100
599
+ },
600
+ {
601
+ "epoch": 2.547961630695444,
602
+ "grad_norm": 0.02505665458738804,
603
+ "learning_rate": 0.0013595852300873235,
604
+ "loss": 0.1578,
605
+ "step": 2125
606
+ },
607
+ {
608
+ "epoch": 2.577937649880096,
609
+ "grad_norm": 0.02366657927632332,
610
+ "learning_rate": 0.001189230816315282,
611
+ "loss": 0.1536,
612
+ "step": 2150
613
+ },
614
+ {
615
+ "epoch": 2.6079136690647484,
616
+ "grad_norm": 0.028983445838093758,
617
+ "learning_rate": 0.0010296014330878466,
618
+ "loss": 0.1528,
619
+ "step": 2175
620
+ },
621
+ {
622
+ "epoch": 2.6378896882494005,
623
+ "grad_norm": 0.02606895938515663,
624
+ "learning_rate": 0.0008808913914776618,
625
+ "loss": 0.1542,
626
+ "step": 2200
627
+ },
628
+ {
629
+ "epoch": 2.6678657074340526,
630
+ "grad_norm": 0.021730564534664154,
631
+ "learning_rate": 0.0007432817108378987,
632
+ "loss": 0.1558,
633
+ "step": 2225
634
+ },
635
+ {
636
+ "epoch": 2.697841726618705,
637
+ "grad_norm": 0.0258767269551754,
638
+ "learning_rate": 0.000616939898453942,
639
+ "loss": 0.1587,
640
+ "step": 2250
641
+ },
642
+ {
643
+ "epoch": 2.7278177458033572,
644
+ "grad_norm": 0.024898972362279892,
645
+ "learning_rate": 0.0005020197456428266,
646
+ "loss": 0.1613,
647
+ "step": 2275
648
+ },
649
+ {
650
+ "epoch": 2.7577937649880093,
651
+ "grad_norm": 0.026595328003168106,
652
+ "learning_rate": 0.0003986611405486429,
653
+ "loss": 0.1518,
654
+ "step": 2300
655
+ },
656
+ {
657
+ "epoch": 2.787769784172662,
658
+ "grad_norm": 0.021675392985343933,
659
+ "learning_rate": 0.00030698989786175025,
660
+ "loss": 0.1495,
661
+ "step": 2325
662
+ },
663
+ {
664
+ "epoch": 2.8177458033573144,
665
+ "grad_norm": 0.026996396481990814,
666
+ "learning_rate": 0.00022711760566911045,
667
+ "loss": 0.1539,
668
+ "step": 2350
669
+ },
670
+ {
671
+ "epoch": 2.8477218225419665,
672
+ "grad_norm": 0.02844955585896969,
673
+ "learning_rate": 0.00015914148962215102,
674
+ "loss": 0.1501,
675
+ "step": 2375
676
+ },
677
+ {
678
+ "epoch": 2.8776978417266186,
679
+ "grad_norm": 0.035385046154260635,
680
+ "learning_rate": 0.00010314429458748609,
681
+ "loss": 0.1533,
682
+ "step": 2400
683
+ },
684
+ {
685
+ "epoch": 2.907673860911271,
686
+ "grad_norm": 0.0226058941334486,
687
+ "learning_rate": 5.919418392459908e-05,
688
+ "loss": 0.1553,
689
+ "step": 2425
690
+ },
691
+ {
692
+ "epoch": 2.937649880095923,
693
+ "grad_norm": 0.021318677812814713,
694
+ "learning_rate": 2.7344656513036413e-05,
695
+ "loss": 0.1513,
696
+ "step": 2450
697
+ },
698
+ {
699
+ "epoch": 2.9676258992805753,
700
+ "grad_norm": 0.03294537961483002,
701
+ "learning_rate": 7.634481630169888e-06,
702
+ "loss": 0.1507,
703
+ "step": 2475
704
+ },
705
+ {
706
+ "epoch": 2.997601918465228,
707
+ "grad_norm": 0.027222590520977974,
708
+ "learning_rate": 8.765175874891896e-08,
709
+ "loss": 0.1521,
710
+ "step": 2500
711
+ },
712
+ {
713
+ "epoch": 3.0,
714
+ "step": 2502,
715
+ "total_flos": 2.43882352705536e+18,
716
+ "train_loss": 0.22085073801110403,
717
+ "train_runtime": 3314.1845,
718
+ "train_samples_per_second": 36.208,
719
+ "train_steps_per_second": 0.755
720
+ }
721
+ ],
722
+ "logging_steps": 25,
723
+ "max_steps": 2502,
724
+ "num_input_tokens_seen": 0,
725
+ "num_train_epochs": 3,
726
+ "save_steps": 0,
727
+ "stateful_callbacks": {
728
+ "TrainerControl": {
729
+ "args": {
730
+ "should_epoch_stop": false,
731
+ "should_evaluate": false,
732
+ "should_log": false,
733
+ "should_save": true,
734
+ "should_training_stop": true
735
+ },
736
+ "attributes": {}
737
+ }
738
+ },
739
+ "total_flos": 2.43882352705536e+18,
740
+ "train_batch_size": 48,
741
+ "trial_name": null,
742
+ "trial_params": null
743
+ }