nvan15 commited on
Commit
b3ccc92
·
verified ·
1 Parent(s): 4e49694

Batch upload part 15

Browse files
Files changed (50) hide show
  1. nl_tasks/exp395/run_ex03/ft/adapter_config.json +18 -0
  2. nl_tasks/exp395/run_ex03/ft/tokenizer.json +0 -0
  3. nl_tasks/exp395/run_ex03/ft/tokenizer.model +3 -0
  4. nl_tasks/exp395/run_ex03/ft2/adapter_config.json +18 -0
  5. nl_tasks/exp395/run_ex03/ft2/adapter_model.bin +3 -0
  6. nl_tasks/exp395/run_ex04/ft/adapter_config.json +18 -0
  7. nl_tasks/exp395/run_ex04/ft/added_tokens.json +3 -0
  8. nl_tasks/exp395/run_ex04/ft/special_tokens_map.json +30 -0
  9. nl_tasks/exp395/run_ex04/ft/tokenizer.json +0 -0
  10. nl_tasks/exp395/run_ex04/ft/tokenizer.model +3 -0
  11. nl_tasks/exp395/run_ex04/ft/tokenizer_config.json +51 -0
  12. nl_tasks/exp395/run_ex04/ft2/adapter_config.json +18 -0
  13. nl_tasks/exp395/run_ex04/ft2/adapter_model.bin +3 -0
  14. nl_tasks/exp395/run_ex04/trainer_state.json +308 -0
  15. nl_tasks/exp395/run_ex05/ft/adapter_config.json +18 -0
  16. nl_tasks/exp395/run_ex05/ft/added_tokens.json +3 -0
  17. nl_tasks/exp395/run_ex05/ft/special_tokens_map.json +30 -0
  18. nl_tasks/exp395/run_ex05/ft/tokenizer.json +0 -0
  19. nl_tasks/exp395/run_ex05/ft/tokenizer.model +3 -0
  20. nl_tasks/exp395/run_ex05/ft/tokenizer_config.json +51 -0
  21. nl_tasks/exp395/run_ex05/ft2/adapter_config.json +18 -0
  22. nl_tasks/exp395/run_ex05/ft2/adapter_model.bin +3 -0
  23. nl_tasks/exp395/run_ex05/trainer_state.json +509 -0
  24. nl_tasks/exp395/run_ex06/ft/adapter_config.json +18 -0
  25. nl_tasks/exp395/run_ex06/ft/added_tokens.json +3 -0
  26. nl_tasks/exp395/run_ex06/ft/special_tokens_map.json +30 -0
  27. nl_tasks/exp395/run_ex06/ft/tokenizer.json +0 -0
  28. nl_tasks/exp395/run_ex06/ft/tokenizer.model +3 -0
  29. nl_tasks/exp395/run_ex06/ft/tokenizer_config.json +51 -0
  30. nl_tasks/exp395/run_ex06/ft2/adapter_config.json +18 -0
  31. nl_tasks/exp395/run_ex06/ft2/adapter_model.bin +3 -0
  32. nl_tasks/exp395/run_ex06/trainer_state.json +509 -0
  33. nl_tasks/exp395/run_ex07/ft/adapter_config.json +18 -0
  34. nl_tasks/exp395/run_ex07/ft/added_tokens.json +3 -0
  35. nl_tasks/exp395/run_ex07/ft/special_tokens_map.json +30 -0
  36. nl_tasks/exp395/run_ex07/ft/tokenizer.json +0 -0
  37. nl_tasks/exp395/run_ex07/ft/tokenizer.model +3 -0
  38. nl_tasks/exp395/run_ex07/ft/tokenizer_config.json +51 -0
  39. nl_tasks/exp395/run_ex07/ft2/adapter_config.json +18 -0
  40. nl_tasks/exp395/run_ex07/ft2/adapter_model.bin +3 -0
  41. nl_tasks/exp395/run_ex07/trainer_state.json +509 -0
  42. nl_tasks/exp395/run_ex08/ft/adapter_config.json +18 -0
  43. nl_tasks/exp395/run_ex08/ft/added_tokens.json +3 -0
  44. nl_tasks/exp395/run_ex08/ft/special_tokens_map.json +30 -0
  45. nl_tasks/exp395/run_ex08/ft/tokenizer.json +0 -0
  46. nl_tasks/exp395/run_ex08/ft/tokenizer.model +3 -0
  47. nl_tasks/exp395/run_ex08/ft/tokenizer_config.json +51 -0
  48. nl_tasks/exp395/run_ex08/ft2/adapter_config.json +18 -0
  49. nl_tasks/exp395/run_ex08/ft2/adapter_model.bin +3 -0
  50. nl_tasks/exp395/run_ex08/trainer_state.json +509 -0
nl_tasks/exp395/run_ex03/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": false,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "v_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exp395/run_ex03/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exp395/run_ex03/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exp395/run_ex03/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": true,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "v_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exp395/run_ex03/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:503674d6e9d395776c6c79b8d13ca3ea16bd92520277eed1c79858595d63572f
3
+ size 33602915
nl_tasks/exp395/run_ex04/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": false,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "v_proj",
14
+ "q_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exp395/run_ex04/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[PAD]": 32000
3
+ }
nl_tasks/exp395/run_ex04/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
nl_tasks/exp395/run_ex04/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exp395/run_ex04/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exp395/run_ex04/ft/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "</s>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "legacy": false,
44
+ "model_max_length": 512,
45
+ "pad_token": "[PAD]",
46
+ "padding_side": "right",
47
+ "sp_model_kwargs": {},
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "</s>",
50
+ "use_default_system_prompt": false
51
+ }
nl_tasks/exp395/run_ex04/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": true,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "v_proj",
14
+ "q_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exp395/run_ex04/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37d6b88a9707e400bbde1c970e5d72b090db0d40ad030b4323c4c3941c0192e9
3
+ size 33602915
nl_tasks/exp395/run_ex04/trainer_state.json ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 2.0,
6
+ "eval_steps": 1000,
7
+ "global_step": 6234,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.06416426050689766,
14
+ "grad_norm": 0.21872752904891968,
15
+ "learning_rate": 0.0009988020657471077,
16
+ "loss": 0.3629,
17
+ "step": 200
18
+ },
19
+ {
20
+ "epoch": 0.12832852101379533,
21
+ "grad_norm": 0.21329320967197418,
22
+ "learning_rate": 0.0009927029382680798,
23
+ "loss": 0.2964,
24
+ "step": 400
25
+ },
26
+ {
27
+ "epoch": 0.19249278152069296,
28
+ "grad_norm": 1.0523426532745361,
29
+ "learning_rate": 0.0009815004229628667,
30
+ "loss": 0.2791,
31
+ "step": 600
32
+ },
33
+ {
34
+ "epoch": 0.25665704202759065,
35
+ "grad_norm": 0.2172887623310089,
36
+ "learning_rate": 0.0009653105548208928,
37
+ "loss": 0.2617,
38
+ "step": 800
39
+ },
40
+ {
41
+ "epoch": 0.3208213025344883,
42
+ "grad_norm": 0.18885284662246704,
43
+ "learning_rate": 0.0009443010275366055,
44
+ "loss": 0.2506,
45
+ "step": 1000
46
+ },
47
+ {
48
+ "epoch": 0.3208213025344883,
49
+ "eval_loss": 0.2593642771244049,
50
+ "eval_runtime": 5.2735,
51
+ "eval_samples_per_second": 48.924,
52
+ "eval_steps_per_second": 0.948,
53
+ "step": 1000
54
+ },
55
+ {
56
+ "epoch": 0.3849855630413859,
57
+ "grad_norm": 0.15659989416599274,
58
+ "learning_rate": 0.0009186894565481294,
59
+ "loss": 0.2423,
60
+ "step": 1200
61
+ },
62
+ {
63
+ "epoch": 0.4491498235482836,
64
+ "grad_norm": 0.14614759385585785,
65
+ "learning_rate": 0.0008887411249894593,
66
+ "loss": 0.2359,
67
+ "step": 1400
68
+ },
69
+ {
70
+ "epoch": 0.5133140840551813,
71
+ "grad_norm": 0.13668683171272278,
72
+ "learning_rate": 0.0008547662359034838,
73
+ "loss": 0.2374,
74
+ "step": 1600
75
+ },
76
+ {
77
+ "epoch": 0.5774783445620789,
78
+ "grad_norm": 0.14054378867149353,
79
+ "learning_rate": 0.0008171166991772578,
80
+ "loss": 0.2273,
81
+ "step": 1800
82
+ },
83
+ {
84
+ "epoch": 0.6416426050689766,
85
+ "grad_norm": 0.11869219690561295,
86
+ "learning_rate": 0.0007761824864802529,
87
+ "loss": 0.2222,
88
+ "step": 2000
89
+ },
90
+ {
91
+ "epoch": 0.6416426050689766,
92
+ "eval_loss": 0.23084010183811188,
93
+ "eval_runtime": 5.0454,
94
+ "eval_samples_per_second": 51.136,
95
+ "eval_steps_per_second": 0.991,
96
+ "step": 2000
97
+ },
98
+ {
99
+ "epoch": 0.7058068655758742,
100
+ "grad_norm": 0.1382647603750229,
101
+ "learning_rate": 0.0007323875919609245,
102
+ "loss": 0.224,
103
+ "step": 2200
104
+ },
105
+ {
106
+ "epoch": 0.7699711260827719,
107
+ "grad_norm": 0.1192302256822586,
108
+ "learning_rate": 0.0006861856405404567,
109
+ "loss": 0.2161,
110
+ "step": 2400
111
+ },
112
+ {
113
+ "epoch": 0.8341353865896696,
114
+ "grad_norm": 0.12821270525455475,
115
+ "learning_rate": 0.0006380551892927205,
116
+ "loss": 0.2164,
117
+ "step": 2600
118
+ },
119
+ {
120
+ "epoch": 0.8982996470965672,
121
+ "grad_norm": 0.1226765364408493,
122
+ "learning_rate": 0.0005884947705784723,
123
+ "loss": 0.2119,
124
+ "step": 2800
125
+ },
126
+ {
127
+ "epoch": 0.9624639076034649,
128
+ "grad_norm": 0.11759891360998154,
129
+ "learning_rate": 0.0005380177282767159,
130
+ "loss": 0.2127,
131
+ "step": 3000
132
+ },
133
+ {
134
+ "epoch": 0.9624639076034649,
135
+ "eval_loss": 0.21615009009838104,
136
+ "eval_runtime": 5.0486,
137
+ "eval_samples_per_second": 51.103,
138
+ "eval_steps_per_second": 0.99,
139
+ "step": 3000
140
+ },
141
+ {
142
+ "epoch": 1.0266281681103626,
143
+ "grad_norm": 0.11819776147603989,
144
+ "learning_rate": 0.0004871469005992345,
145
+ "loss": 0.1981,
146
+ "step": 3200
147
+ },
148
+ {
149
+ "epoch": 1.0907924286172601,
150
+ "grad_norm": 0.11975960433483124,
151
+ "learning_rate": 0.00043640920456338925,
152
+ "loss": 0.1804,
153
+ "step": 3400
154
+ },
155
+ {
156
+ "epoch": 1.1549566891241578,
157
+ "grad_norm": 0.160000741481781,
158
+ "learning_rate": 0.00038633017821688954,
159
+ "loss": 0.1847,
160
+ "step": 3600
161
+ },
162
+ {
163
+ "epoch": 1.2191209496310556,
164
+ "grad_norm": 0.11288543790578842,
165
+ "learning_rate": 0.00033742853714584963,
166
+ "loss": 0.1821,
167
+ "step": 3800
168
+ },
169
+ {
170
+ "epoch": 1.283285210137953,
171
+ "grad_norm": 0.11609069257974625,
172
+ "learning_rate": 0.0002902108016494981,
173
+ "loss": 0.1803,
174
+ "step": 4000
175
+ },
176
+ {
177
+ "epoch": 1.283285210137953,
178
+ "eval_loss": 0.20613692700862885,
179
+ "eval_runtime": 5.0502,
180
+ "eval_samples_per_second": 51.087,
181
+ "eval_steps_per_second": 0.99,
182
+ "step": 4000
183
+ },
184
+ {
185
+ "epoch": 1.3474494706448508,
186
+ "grad_norm": 0.12499521672725677,
187
+ "learning_rate": 0.0002451660502329462,
188
+ "loss": 0.1827,
189
+ "step": 4200
190
+ },
191
+ {
192
+ "epoch": 1.4116137311517485,
193
+ "grad_norm": 0.13932673633098602,
194
+ "learning_rate": 0.0002027608537610257,
195
+ "loss": 0.1758,
196
+ "step": 4400
197
+ },
198
+ {
199
+ "epoch": 1.4757779916586462,
200
+ "grad_norm": 0.10319329053163528,
201
+ "learning_rate": 0.00016343444274492946,
202
+ "loss": 0.1796,
203
+ "step": 4600
204
+ },
205
+ {
206
+ "epoch": 1.539942252165544,
207
+ "grad_norm": 0.09759525954723358,
208
+ "learning_rate": 0.00012759415781861554,
209
+ "loss": 0.176,
210
+ "step": 4800
211
+ },
212
+ {
213
+ "epoch": 1.6041065126724414,
214
+ "grad_norm": 0.10206837952136993,
215
+ "learning_rate": 9.561123052866844e-05,
216
+ "loss": 0.1755,
217
+ "step": 5000
218
+ },
219
+ {
220
+ "epoch": 1.6041065126724414,
221
+ "eval_loss": 0.19889786839485168,
222
+ "eval_runtime": 5.0526,
223
+ "eval_samples_per_second": 51.063,
224
+ "eval_steps_per_second": 0.99,
225
+ "step": 5000
226
+ },
227
+ {
228
+ "epoch": 1.6682707731793391,
229
+ "grad_norm": 0.0990043506026268,
230
+ "learning_rate": 6.781693813994666e-05,
231
+ "loss": 0.1747,
232
+ "step": 5200
233
+ },
234
+ {
235
+ "epoch": 1.7324350336862366,
236
+ "grad_norm": 0.11463545262813568,
237
+ "learning_rate": 4.449917228531264e-05,
238
+ "loss": 0.1741,
239
+ "step": 5400
240
+ },
241
+ {
242
+ "epoch": 1.7965992941931344,
243
+ "grad_norm": 0.1105906218290329,
244
+ "learning_rate": 2.5899457001165806e-05,
245
+ "loss": 0.1745,
246
+ "step": 5600
247
+ },
248
+ {
249
+ "epoch": 1.860763554700032,
250
+ "grad_norm": 0.09413338452577591,
251
+ "learning_rate": 1.2210447035791717e-05,
252
+ "loss": 0.174,
253
+ "step": 5800
254
+ },
255
+ {
256
+ "epoch": 1.9249278152069298,
257
+ "grad_norm": 0.11482080072164536,
258
+ "learning_rate": 3.5739323429003765e-06,
259
+ "loss": 0.1699,
260
+ "step": 6000
261
+ },
262
+ {
263
+ "epoch": 1.9249278152069298,
264
+ "eval_loss": 0.19647078216075897,
265
+ "eval_runtime": 5.0578,
266
+ "eval_samples_per_second": 51.011,
267
+ "eval_steps_per_second": 0.989,
268
+ "step": 6000
269
+ },
270
+ {
271
+ "epoch": 1.9890920757138275,
272
+ "grad_norm": 0.11400256305932999,
273
+ "learning_rate": 7.936942969133387e-08,
274
+ "loss": 0.172,
275
+ "step": 6200
276
+ },
277
+ {
278
+ "epoch": 2.0,
279
+ "step": 6234,
280
+ "total_flos": 4.0542214473488794e+18,
281
+ "train_loss": 0.21111015283721052,
282
+ "train_runtime": 5747.5761,
283
+ "train_samples_per_second": 34.708,
284
+ "train_steps_per_second": 1.085
285
+ }
286
+ ],
287
+ "logging_steps": 200,
288
+ "max_steps": 6234,
289
+ "num_input_tokens_seen": 0,
290
+ "num_train_epochs": 2,
291
+ "save_steps": 0,
292
+ "stateful_callbacks": {
293
+ "TrainerControl": {
294
+ "args": {
295
+ "should_epoch_stop": false,
296
+ "should_evaluate": false,
297
+ "should_log": false,
298
+ "should_save": true,
299
+ "should_training_stop": true
300
+ },
301
+ "attributes": {}
302
+ }
303
+ },
304
+ "total_flos": 4.0542214473488794e+18,
305
+ "train_batch_size": 32,
306
+ "trial_name": null,
307
+ "trial_params": null
308
+ }
nl_tasks/exp395/run_ex05/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": false,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "v_proj",
14
+ "q_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exp395/run_ex05/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[PAD]": 32000
3
+ }
nl_tasks/exp395/run_ex05/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
nl_tasks/exp395/run_ex05/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exp395/run_ex05/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exp395/run_ex05/ft/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "</s>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "legacy": false,
44
+ "model_max_length": 512,
45
+ "pad_token": "[PAD]",
46
+ "padding_side": "right",
47
+ "sp_model_kwargs": {},
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "</s>",
50
+ "use_default_system_prompt": false
51
+ }
nl_tasks/exp395/run_ex05/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": true,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "v_proj",
14
+ "q_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exp395/run_ex05/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:946d5729b3750ab24b3fb2901f0a03fbdd1387b84bb653831d0180eabbdd6639
3
+ size 33602915
nl_tasks/exp395/run_ex05/trainer_state.json ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 9375,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.064,
14
+ "grad_norm": 0.19703811407089233,
15
+ "learning_rate": 0.000999684221114305,
16
+ "loss": 0.3657,
17
+ "step": 200
18
+ },
19
+ {
20
+ "epoch": 0.128,
21
+ "grad_norm": 0.2437756508588791,
22
+ "learning_rate": 0.0009973376564462874,
23
+ "loss": 0.2968,
24
+ "step": 400
25
+ },
26
+ {
27
+ "epoch": 0.16,
28
+ "eval_loss": 0.28636598587036133,
29
+ "eval_runtime": 19.6604,
30
+ "eval_samples_per_second": 51.423,
31
+ "eval_steps_per_second": 0.814,
32
+ "step": 500
33
+ },
34
+ {
35
+ "epoch": 0.192,
36
+ "grad_norm": 0.19999830424785614,
37
+ "learning_rate": 0.0009927125570277145,
38
+ "loss": 0.2773,
39
+ "step": 600
40
+ },
41
+ {
42
+ "epoch": 0.256,
43
+ "grad_norm": 0.20367808640003204,
44
+ "learning_rate": 0.0009858301125867587,
45
+ "loss": 0.2633,
46
+ "step": 800
47
+ },
48
+ {
49
+ "epoch": 0.32,
50
+ "grad_norm": 0.18598036468029022,
51
+ "learning_rate": 0.000976721854797373,
52
+ "loss": 0.2516,
53
+ "step": 1000
54
+ },
55
+ {
56
+ "epoch": 0.32,
57
+ "eval_loss": 0.254931777715683,
58
+ "eval_runtime": 19.5103,
59
+ "eval_samples_per_second": 51.819,
60
+ "eval_steps_per_second": 0.82,
61
+ "step": 1000
62
+ },
63
+ {
64
+ "epoch": 0.384,
65
+ "grad_norm": 0.1677393764257431,
66
+ "learning_rate": 0.0009654295128180494,
67
+ "loss": 0.2477,
68
+ "step": 1200
69
+ },
70
+ {
71
+ "epoch": 0.448,
72
+ "grad_norm": 0.15132741630077362,
73
+ "learning_rate": 0.0009520048221111679,
74
+ "loss": 0.2402,
75
+ "step": 1400
76
+ },
77
+ {
78
+ "epoch": 0.48,
79
+ "eval_loss": 0.2429439127445221,
80
+ "eval_runtime": 19.4737,
81
+ "eval_samples_per_second": 51.916,
82
+ "eval_steps_per_second": 0.822,
83
+ "step": 1500
84
+ },
85
+ {
86
+ "epoch": 0.512,
87
+ "grad_norm": 0.15494993329048157,
88
+ "learning_rate": 0.0009365092874188177,
89
+ "loss": 0.2342,
90
+ "step": 1600
91
+ },
92
+ {
93
+ "epoch": 0.576,
94
+ "grad_norm": 0.13975529372692108,
95
+ "learning_rate": 0.0009190139009810141,
96
+ "loss": 0.2298,
97
+ "step": 1800
98
+ },
99
+ {
100
+ "epoch": 0.64,
101
+ "grad_norm": 0.12113900482654572,
102
+ "learning_rate": 0.0008995988172872798,
103
+ "loss": 0.2279,
104
+ "step": 2000
105
+ },
106
+ {
107
+ "epoch": 0.64,
108
+ "eval_loss": 0.2299424409866333,
109
+ "eval_runtime": 19.5012,
110
+ "eval_samples_per_second": 51.843,
111
+ "eval_steps_per_second": 0.82,
112
+ "step": 2000
113
+ },
114
+ {
115
+ "epoch": 0.704,
116
+ "grad_norm": 0.10823472589254379,
117
+ "learning_rate": 0.0008783529858517077,
118
+ "loss": 0.2234,
119
+ "step": 2200
120
+ },
121
+ {
122
+ "epoch": 0.768,
123
+ "grad_norm": 0.10321851819753647,
124
+ "learning_rate": 0.0008553737436939324,
125
+ "loss": 0.2215,
126
+ "step": 2400
127
+ },
128
+ {
129
+ "epoch": 0.8,
130
+ "eval_loss": 0.2244570553302765,
131
+ "eval_runtime": 19.4699,
132
+ "eval_samples_per_second": 51.926,
133
+ "eval_steps_per_second": 0.822,
134
+ "step": 2500
135
+ },
136
+ {
137
+ "epoch": 0.832,
138
+ "grad_norm": 0.12440883368253708,
139
+ "learning_rate": 0.0008307663693930426,
140
+ "loss": 0.219,
141
+ "step": 2600
142
+ },
143
+ {
144
+ "epoch": 0.896,
145
+ "grad_norm": 0.10718903690576553,
146
+ "learning_rate": 0.000804643600757522,
147
+ "loss": 0.2182,
148
+ "step": 2800
149
+ },
150
+ {
151
+ "epoch": 0.96,
152
+ "grad_norm": 0.10340970754623413,
153
+ "learning_rate": 0.0007771251183209993,
154
+ "loss": 0.2114,
155
+ "step": 3000
156
+ },
157
+ {
158
+ "epoch": 0.96,
159
+ "eval_loss": 0.220004603266716,
160
+ "eval_runtime": 19.5042,
161
+ "eval_samples_per_second": 51.835,
162
+ "eval_steps_per_second": 0.82,
163
+ "step": 3000
164
+ },
165
+ {
166
+ "epoch": 1.024,
167
+ "grad_norm": 0.17328748106956482,
168
+ "learning_rate": 0.0007483369970301455,
169
+ "loss": 0.2057,
170
+ "step": 3200
171
+ },
172
+ {
173
+ "epoch": 1.088,
174
+ "grad_norm": 0.09701967239379883,
175
+ "learning_rate": 0.0007184111286368001,
176
+ "loss": 0.189,
177
+ "step": 3400
178
+ },
179
+ {
180
+ "epoch": 1.12,
181
+ "eval_loss": 0.21541637182235718,
182
+ "eval_runtime": 19.5442,
183
+ "eval_samples_per_second": 51.729,
184
+ "eval_steps_per_second": 0.819,
185
+ "step": 3500
186
+ },
187
+ {
188
+ "epoch": 1.152,
189
+ "grad_norm": 0.10150015354156494,
190
+ "learning_rate": 0.0006874846174406093,
191
+ "loss": 0.1891,
192
+ "step": 3600
193
+ },
194
+ {
195
+ "epoch": 1.216,
196
+ "grad_norm": 0.10405285656452179,
197
+ "learning_rate": 0.0006556991521505633,
198
+ "loss": 0.1891,
199
+ "step": 3800
200
+ },
201
+ {
202
+ "epoch": 1.28,
203
+ "grad_norm": 0.09167412668466568,
204
+ "learning_rate": 0.0006232003567432241,
205
+ "loss": 0.1869,
206
+ "step": 4000
207
+ },
208
+ {
209
+ "epoch": 1.28,
210
+ "eval_loss": 0.21192973852157593,
211
+ "eval_runtime": 19.5118,
212
+ "eval_samples_per_second": 51.815,
213
+ "eval_steps_per_second": 0.82,
214
+ "step": 4000
215
+ },
216
+ {
217
+ "epoch": 1.3439999999999999,
218
+ "grad_norm": 0.09688983112573624,
219
+ "learning_rate": 0.0005901371232916676,
220
+ "loss": 0.1887,
221
+ "step": 4200
222
+ },
223
+ {
224
+ "epoch": 1.408,
225
+ "grad_norm": 0.11787360906600952,
226
+ "learning_rate": 0.0005566609298217553,
227
+ "loss": 0.187,
228
+ "step": 4400
229
+ },
230
+ {
231
+ "epoch": 1.44,
232
+ "eval_loss": 0.2085772007703781,
233
+ "eval_runtime": 27.1615,
234
+ "eval_samples_per_second": 37.222,
235
+ "eval_steps_per_second": 0.589,
236
+ "step": 4500
237
+ },
238
+ {
239
+ "epoch": 1.472,
240
+ "grad_norm": 0.09200826287269592,
241
+ "learning_rate": 0.0005229251463209568,
242
+ "loss": 0.1875,
243
+ "step": 4600
244
+ },
245
+ {
246
+ "epoch": 1.536,
247
+ "grad_norm": 0.0889899879693985,
248
+ "learning_rate": 0.0004890843320792184,
249
+ "loss": 0.1845,
250
+ "step": 4800
251
+ },
252
+ {
253
+ "epoch": 1.6,
254
+ "grad_norm": 0.08855367451906204,
255
+ "learning_rate": 0.0004552935275810832,
256
+ "loss": 0.1824,
257
+ "step": 5000
258
+ },
259
+ {
260
+ "epoch": 1.6,
261
+ "eval_loss": 0.20444034039974213,
262
+ "eval_runtime": 22.73,
263
+ "eval_samples_per_second": 44.479,
264
+ "eval_steps_per_second": 0.704,
265
+ "step": 5000
266
+ },
267
+ {
268
+ "epoch": 1.6640000000000001,
269
+ "grad_norm": 0.09017802029848099,
270
+ "learning_rate": 0.0004217075441932357,
271
+ "loss": 0.1849,
272
+ "step": 5200
273
+ },
274
+ {
275
+ "epoch": 1.728,
276
+ "grad_norm": 0.08517798036336899,
277
+ "learning_rate": 0.00038848025490174253,
278
+ "loss": 0.1846,
279
+ "step": 5400
280
+ },
281
+ {
282
+ "epoch": 1.76,
283
+ "eval_loss": 0.20072534680366516,
284
+ "eval_runtime": 22.6827,
285
+ "eval_samples_per_second": 44.571,
286
+ "eval_steps_per_second": 0.705,
287
+ "step": 5500
288
+ },
289
+ {
290
+ "epoch": 1.792,
291
+ "grad_norm": 0.08856978267431259,
292
+ "learning_rate": 0.00035576388934845007,
293
+ "loss": 0.1821,
294
+ "step": 5600
295
+ },
296
+ {
297
+ "epoch": 1.8559999999999999,
298
+ "grad_norm": 0.0986369326710701,
299
+ "learning_rate": 0.00032370833639630423,
300
+ "loss": 0.179,
301
+ "step": 5800
302
+ },
303
+ {
304
+ "epoch": 1.92,
305
+ "grad_norm": 0.08553028106689453,
306
+ "learning_rate": 0.00029246045741886697,
307
+ "loss": 0.1797,
308
+ "step": 6000
309
+ },
310
+ {
311
+ "epoch": 1.92,
312
+ "eval_loss": 0.1978333443403244,
313
+ "eval_runtime": 22.7156,
314
+ "eval_samples_per_second": 44.507,
315
+ "eval_steps_per_second": 0.704,
316
+ "step": 6000
317
+ },
318
+ {
319
+ "epoch": 1.984,
320
+ "grad_norm": 0.08506551384925842,
321
+ "learning_rate": 0.00026216341346016616,
322
+ "loss": 0.1796,
323
+ "step": 6200
324
+ },
325
+ {
326
+ "epoch": 2.048,
327
+ "grad_norm": 0.09524673223495483,
328
+ "learning_rate": 0.00023295600934747395,
329
+ "loss": 0.1607,
330
+ "step": 6400
331
+ },
332
+ {
333
+ "epoch": 2.08,
334
+ "eval_loss": 0.1975262463092804,
335
+ "eval_runtime": 19.4961,
336
+ "eval_samples_per_second": 51.857,
337
+ "eval_steps_per_second": 0.821,
338
+ "step": 6500
339
+ },
340
+ {
341
+ "epoch": 2.112,
342
+ "grad_norm": 0.0900162011384964,
343
+ "learning_rate": 0.0002049720577619374,
344
+ "loss": 0.1537,
345
+ "step": 6600
346
+ },
347
+ {
348
+ "epoch": 2.176,
349
+ "grad_norm": 0.08766793459653854,
350
+ "learning_rate": 0.00017833976618054676,
351
+ "loss": 0.1544,
352
+ "step": 6800
353
+ },
354
+ {
355
+ "epoch": 2.24,
356
+ "grad_norm": 0.09870422631502151,
357
+ "learning_rate": 0.0001531811494981501,
358
+ "loss": 0.1522,
359
+ "step": 7000
360
+ },
361
+ {
362
+ "epoch": 2.24,
363
+ "eval_loss": 0.19707150757312775,
364
+ "eval_runtime": 19.4869,
365
+ "eval_samples_per_second": 51.881,
366
+ "eval_steps_per_second": 0.821,
367
+ "step": 7000
368
+ },
369
+ {
370
+ "epoch": 2.304,
371
+ "grad_norm": 0.09113412350416183,
372
+ "learning_rate": 0.0001296114710205592,
373
+ "loss": 0.1539,
374
+ "step": 7200
375
+ },
376
+ {
377
+ "epoch": 2.368,
378
+ "grad_norm": 0.09298386424779892,
379
+ "learning_rate": 0.00010773871438982197,
380
+ "loss": 0.1565,
381
+ "step": 7400
382
+ },
383
+ {
384
+ "epoch": 2.4,
385
+ "eval_loss": 0.19512927532196045,
386
+ "eval_runtime": 19.4898,
387
+ "eval_samples_per_second": 51.873,
388
+ "eval_steps_per_second": 0.821,
389
+ "step": 7500
390
+ },
391
+ {
392
+ "epoch": 2.432,
393
+ "grad_norm": 0.09513990581035614,
394
+ "learning_rate": 8.766308886101404e-05,
395
+ "loss": 0.1551,
396
+ "step": 7600
397
+ },
398
+ {
399
+ "epoch": 2.496,
400
+ "grad_norm": 0.10141433775424957,
401
+ "learning_rate": 6.947657019710795e-05,
402
+ "loss": 0.1553,
403
+ "step": 7800
404
+ },
405
+ {
406
+ "epoch": 2.56,
407
+ "grad_norm": 0.09975577890872955,
408
+ "learning_rate": 5.3262479285291874e-05,
409
+ "loss": 0.1533,
410
+ "step": 8000
411
+ },
412
+ {
413
+ "epoch": 2.56,
414
+ "eval_loss": 0.19368411600589752,
415
+ "eval_runtime": 19.4935,
416
+ "eval_samples_per_second": 51.864,
417
+ "eval_steps_per_second": 0.821,
418
+ "step": 8000
419
+ },
420
+ {
421
+ "epoch": 2.624,
422
+ "grad_norm": 0.09386882930994034,
423
+ "learning_rate": 3.90951004052949e-05,
424
+ "loss": 0.1517,
425
+ "step": 8200
426
+ },
427
+ {
428
+ "epoch": 2.6879999999999997,
429
+ "grad_norm": 0.10441362112760544,
430
+ "learning_rate": 2.703934089860627e-05,
431
+ "loss": 0.1534,
432
+ "step": 8400
433
+ },
434
+ {
435
+ "epoch": 2.7199999999999998,
436
+ "eval_loss": 0.19298546016216278,
437
+ "eval_runtime": 19.4753,
438
+ "eval_samples_per_second": 51.912,
439
+ "eval_steps_per_second": 0.822,
440
+ "step": 8500
441
+ },
442
+ {
443
+ "epoch": 2.752,
444
+ "grad_norm": 0.09755131602287292,
445
+ "learning_rate": 1.715043379780301e-05,
446
+ "loss": 0.1507,
447
+ "step": 8600
448
+ },
449
+ {
450
+ "epoch": 2.816,
451
+ "grad_norm": 0.1058456152677536,
452
+ "learning_rate": 9.473684778379677e-06,
453
+ "loss": 0.1533,
454
+ "step": 8800
455
+ },
456
+ {
457
+ "epoch": 2.88,
458
+ "grad_norm": 0.08434511721134186,
459
+ "learning_rate": 4.044264592410641e-06,
460
+ "loss": 0.1521,
461
+ "step": 9000
462
+ },
463
+ {
464
+ "epoch": 2.88,
465
+ "eval_loss": 0.1928529292345047,
466
+ "eval_runtime": 19.5071,
467
+ "eval_samples_per_second": 51.827,
468
+ "eval_steps_per_second": 0.82,
469
+ "step": 9000
470
+ },
471
+ {
472
+ "epoch": 2.944,
473
+ "grad_norm": 0.10123102366924286,
474
+ "learning_rate": 8.87047935002272e-07,
475
+ "loss": 0.1495,
476
+ "step": 9200
477
+ },
478
+ {
479
+ "epoch": 3.0,
480
+ "step": 9375,
481
+ "total_flos": 6.097062592512e+18,
482
+ "train_loss": 0.19507358459472657,
483
+ "train_runtime": 9183.2625,
484
+ "train_samples_per_second": 32.668,
485
+ "train_steps_per_second": 1.021
486
+ }
487
+ ],
488
+ "logging_steps": 200,
489
+ "max_steps": 9375,
490
+ "num_input_tokens_seen": 0,
491
+ "num_train_epochs": 3,
492
+ "save_steps": 0,
493
+ "stateful_callbacks": {
494
+ "TrainerControl": {
495
+ "args": {
496
+ "should_epoch_stop": false,
497
+ "should_evaluate": false,
498
+ "should_log": false,
499
+ "should_save": true,
500
+ "should_training_stop": true
501
+ },
502
+ "attributes": {}
503
+ }
504
+ },
505
+ "total_flos": 6.097062592512e+18,
506
+ "train_batch_size": 32,
507
+ "trial_name": null,
508
+ "trial_params": null
509
+ }
nl_tasks/exp395/run_ex06/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": false,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "v_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exp395/run_ex06/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[PAD]": 32000
3
+ }
nl_tasks/exp395/run_ex06/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
nl_tasks/exp395/run_ex06/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exp395/run_ex06/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exp395/run_ex06/ft/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "</s>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "legacy": false,
44
+ "model_max_length": 512,
45
+ "pad_token": "[PAD]",
46
+ "padding_side": "right",
47
+ "sp_model_kwargs": {},
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "</s>",
50
+ "use_default_system_prompt": false
51
+ }
nl_tasks/exp395/run_ex06/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": true,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "q_proj",
14
+ "v_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exp395/run_ex06/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:80f122d4725a9564791500cc72f337b41d525cdf466f35d47626f6b9a8b013af
3
+ size 33602915
nl_tasks/exp395/run_ex06/trainer_state.json ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 9375,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.064,
14
+ "grad_norm": 0.2426149696111679,
15
+ "learning_rate": 0.00199936844222861,
16
+ "loss": 0.3762,
17
+ "step": 200
18
+ },
19
+ {
20
+ "epoch": 0.128,
21
+ "grad_norm": 0.23630200326442719,
22
+ "learning_rate": 0.0019946753128925747,
23
+ "loss": 0.3053,
24
+ "step": 400
25
+ },
26
+ {
27
+ "epoch": 0.16,
28
+ "eval_loss": 0.2808550000190735,
29
+ "eval_runtime": 19.7751,
30
+ "eval_samples_per_second": 51.125,
31
+ "eval_steps_per_second": 0.809,
32
+ "step": 500
33
+ },
34
+ {
35
+ "epoch": 0.192,
36
+ "grad_norm": 0.140605628490448,
37
+ "learning_rate": 0.001985425114055429,
38
+ "loss": 0.2723,
39
+ "step": 600
40
+ },
41
+ {
42
+ "epoch": 0.256,
43
+ "grad_norm": 0.12724126875400543,
44
+ "learning_rate": 0.0019716602251735175,
45
+ "loss": 0.2563,
46
+ "step": 800
47
+ },
48
+ {
49
+ "epoch": 0.32,
50
+ "grad_norm": 0.09434176236391068,
51
+ "learning_rate": 0.001953443709594746,
52
+ "loss": 0.2481,
53
+ "step": 1000
54
+ },
55
+ {
56
+ "epoch": 0.32,
57
+ "eval_loss": 0.2517923414707184,
58
+ "eval_runtime": 19.4998,
59
+ "eval_samples_per_second": 51.847,
60
+ "eval_steps_per_second": 0.821,
61
+ "step": 1000
62
+ },
63
+ {
64
+ "epoch": 0.384,
65
+ "grad_norm": 0.09533161669969559,
66
+ "learning_rate": 0.0019308590256360988,
67
+ "loss": 0.2449,
68
+ "step": 1200
69
+ },
70
+ {
71
+ "epoch": 0.448,
72
+ "grad_norm": 0.09580417722463608,
73
+ "learning_rate": 0.0019040096442223358,
74
+ "loss": 0.2388,
75
+ "step": 1400
76
+ },
77
+ {
78
+ "epoch": 0.48,
79
+ "eval_loss": 0.24148738384246826,
80
+ "eval_runtime": 19.5462,
81
+ "eval_samples_per_second": 51.724,
82
+ "eval_steps_per_second": 0.819,
83
+ "step": 1500
84
+ },
85
+ {
86
+ "epoch": 0.512,
87
+ "grad_norm": 0.08012712001800537,
88
+ "learning_rate": 0.0018730185748376353,
89
+ "loss": 0.2334,
90
+ "step": 1600
91
+ },
92
+ {
93
+ "epoch": 0.576,
94
+ "grad_norm": 0.08019606024026871,
95
+ "learning_rate": 0.0018380278019620281,
96
+ "loss": 0.2292,
97
+ "step": 1800
98
+ },
99
+ {
100
+ "epoch": 0.64,
101
+ "grad_norm": 0.0736006572842598,
102
+ "learning_rate": 0.0017991976345745596,
103
+ "loss": 0.2276,
104
+ "step": 2000
105
+ },
106
+ {
107
+ "epoch": 0.64,
108
+ "eval_loss": 0.22908221185207367,
109
+ "eval_runtime": 19.5153,
110
+ "eval_samples_per_second": 51.805,
111
+ "eval_steps_per_second": 0.82,
112
+ "step": 2000
113
+ },
114
+ {
115
+ "epoch": 0.704,
116
+ "grad_norm": 0.06510240584611893,
117
+ "learning_rate": 0.0017567059717034153,
118
+ "loss": 0.2239,
119
+ "step": 2200
120
+ },
121
+ {
122
+ "epoch": 0.768,
123
+ "grad_norm": 0.06003573164343834,
124
+ "learning_rate": 0.0017107474873878648,
125
+ "loss": 0.2219,
126
+ "step": 2400
127
+ },
128
+ {
129
+ "epoch": 0.8,
130
+ "eval_loss": 0.22465121746063232,
131
+ "eval_runtime": 19.4747,
132
+ "eval_samples_per_second": 51.914,
133
+ "eval_steps_per_second": 0.822,
134
+ "step": 2500
135
+ },
136
+ {
137
+ "epoch": 0.832,
138
+ "grad_norm": 0.07121224701404572,
139
+ "learning_rate": 0.0016615327387860851,
140
+ "loss": 0.2197,
141
+ "step": 2600
142
+ },
143
+ {
144
+ "epoch": 0.896,
145
+ "grad_norm": 0.06296424567699432,
146
+ "learning_rate": 0.001609287201515044,
147
+ "loss": 0.2191,
148
+ "step": 2800
149
+ },
150
+ {
151
+ "epoch": 0.96,
152
+ "grad_norm": 0.05874966084957123,
153
+ "learning_rate": 0.0015542502366419986,
154
+ "loss": 0.2121,
155
+ "step": 3000
156
+ },
157
+ {
158
+ "epoch": 0.96,
159
+ "eval_loss": 0.2207345813512802,
160
+ "eval_runtime": 19.5619,
161
+ "eval_samples_per_second": 51.682,
162
+ "eval_steps_per_second": 0.818,
163
+ "step": 3000
164
+ },
165
+ {
166
+ "epoch": 1.024,
167
+ "grad_norm": 0.0780828595161438,
168
+ "learning_rate": 0.001496673994060291,
169
+ "loss": 0.2069,
170
+ "step": 3200
171
+ },
172
+ {
173
+ "epoch": 1.088,
174
+ "grad_norm": 0.06145294010639191,
175
+ "learning_rate": 0.0014368222572736001,
176
+ "loss": 0.1902,
177
+ "step": 3400
178
+ },
179
+ {
180
+ "epoch": 1.12,
181
+ "eval_loss": 0.21642106771469116,
182
+ "eval_runtime": 19.4895,
183
+ "eval_samples_per_second": 51.874,
184
+ "eval_steps_per_second": 0.821,
185
+ "step": 3500
186
+ },
187
+ {
188
+ "epoch": 1.152,
189
+ "grad_norm": 0.0599561408162117,
190
+ "learning_rate": 0.0013749692348812186,
191
+ "loss": 0.1907,
192
+ "step": 3600
193
+ },
194
+ {
195
+ "epoch": 1.216,
196
+ "grad_norm": 0.060904331505298615,
197
+ "learning_rate": 0.0013113983043011267,
198
+ "loss": 0.1905,
199
+ "step": 3800
200
+ },
201
+ {
202
+ "epoch": 1.28,
203
+ "grad_norm": 0.05701998248696327,
204
+ "learning_rate": 0.0012464007134864482,
205
+ "loss": 0.1885,
206
+ "step": 4000
207
+ },
208
+ {
209
+ "epoch": 1.28,
210
+ "eval_loss": 0.2138749361038208,
211
+ "eval_runtime": 19.5319,
212
+ "eval_samples_per_second": 51.761,
213
+ "eval_steps_per_second": 0.819,
214
+ "step": 4000
215
+ },
216
+ {
217
+ "epoch": 1.3439999999999999,
218
+ "grad_norm": 0.051733970642089844,
219
+ "learning_rate": 0.0011802742465833352,
220
+ "loss": 0.1902,
221
+ "step": 4200
222
+ },
223
+ {
224
+ "epoch": 1.408,
225
+ "grad_norm": 0.07325556129217148,
226
+ "learning_rate": 0.0011133218596435107,
227
+ "loss": 0.1884,
228
+ "step": 4400
229
+ },
230
+ {
231
+ "epoch": 1.44,
232
+ "eval_loss": 0.20981121063232422,
233
+ "eval_runtime": 19.5236,
234
+ "eval_samples_per_second": 51.784,
235
+ "eval_steps_per_second": 0.82,
236
+ "step": 4500
237
+ },
238
+ {
239
+ "epoch": 1.472,
240
+ "grad_norm": 0.05644873157143593,
241
+ "learning_rate": 0.0010458502926419136,
242
+ "loss": 0.189,
243
+ "step": 4600
244
+ },
245
+ {
246
+ "epoch": 1.536,
247
+ "grad_norm": 0.07849732786417007,
248
+ "learning_rate": 0.0009781686641584368,
249
+ "loss": 0.1864,
250
+ "step": 4800
251
+ },
252
+ {
253
+ "epoch": 1.6,
254
+ "grad_norm": 0.05307694151997566,
255
+ "learning_rate": 0.0009105870551621664,
256
+ "loss": 0.1842,
257
+ "step": 5000
258
+ },
259
+ {
260
+ "epoch": 1.6,
261
+ "eval_loss": 0.20523199439048767,
262
+ "eval_runtime": 19.534,
263
+ "eval_samples_per_second": 51.756,
264
+ "eval_steps_per_second": 0.819,
265
+ "step": 5000
266
+ },
267
+ {
268
+ "epoch": 1.6640000000000001,
269
+ "grad_norm": 0.04950845614075661,
270
+ "learning_rate": 0.0008434150883864713,
271
+ "loss": 0.1866,
272
+ "step": 5200
273
+ },
274
+ {
275
+ "epoch": 1.728,
276
+ "grad_norm": 0.05008252337574959,
277
+ "learning_rate": 0.0007769605098034851,
278
+ "loss": 0.1859,
279
+ "step": 5400
280
+ },
281
+ {
282
+ "epoch": 1.76,
283
+ "eval_loss": 0.20152263343334198,
284
+ "eval_runtime": 19.4908,
285
+ "eval_samples_per_second": 51.871,
286
+ "eval_steps_per_second": 0.821,
287
+ "step": 5500
288
+ },
289
+ {
290
+ "epoch": 1.792,
291
+ "grad_norm": 0.04900854080915451,
292
+ "learning_rate": 0.0007115277786969001,
293
+ "loss": 0.1831,
294
+ "step": 5600
295
+ },
296
+ {
297
+ "epoch": 1.8559999999999999,
298
+ "grad_norm": 0.058961208909749985,
299
+ "learning_rate": 0.0006474166727926085,
300
+ "loss": 0.1805,
301
+ "step": 5800
302
+ },
303
+ {
304
+ "epoch": 1.92,
305
+ "grad_norm": 0.05107206851243973,
306
+ "learning_rate": 0.0005849209148377339,
307
+ "loss": 0.181,
308
+ "step": 6000
309
+ },
310
+ {
311
+ "epoch": 1.92,
312
+ "eval_loss": 0.19884061813354492,
313
+ "eval_runtime": 19.5072,
314
+ "eval_samples_per_second": 51.827,
315
+ "eval_steps_per_second": 0.82,
316
+ "step": 6000
317
+ },
318
+ {
319
+ "epoch": 1.984,
320
+ "grad_norm": 0.048386573791503906,
321
+ "learning_rate": 0.0005243268269203323,
322
+ "loss": 0.1806,
323
+ "step": 6200
324
+ },
325
+ {
326
+ "epoch": 2.048,
327
+ "grad_norm": 0.055718015879392624,
328
+ "learning_rate": 0.0004659120186949479,
329
+ "loss": 0.1617,
330
+ "step": 6400
331
+ },
332
+ {
333
+ "epoch": 2.08,
334
+ "eval_loss": 0.19804580509662628,
335
+ "eval_runtime": 19.5582,
336
+ "eval_samples_per_second": 51.692,
337
+ "eval_steps_per_second": 0.818,
338
+ "step": 6500
339
+ },
340
+ {
341
+ "epoch": 2.112,
342
+ "grad_norm": 0.05408688262104988,
343
+ "learning_rate": 0.0004099441155238748,
344
+ "loss": 0.1543,
345
+ "step": 6600
346
+ },
347
+ {
348
+ "epoch": 2.176,
349
+ "grad_norm": 0.05257072672247887,
350
+ "learning_rate": 0.0003566795323610935,
351
+ "loss": 0.1549,
352
+ "step": 6800
353
+ },
354
+ {
355
+ "epoch": 2.24,
356
+ "grad_norm": 0.05060563609004021,
357
+ "learning_rate": 0.0003063622989963002,
358
+ "loss": 0.1528,
359
+ "step": 7000
360
+ },
361
+ {
362
+ "epoch": 2.24,
363
+ "eval_loss": 0.19734542071819305,
364
+ "eval_runtime": 19.5386,
365
+ "eval_samples_per_second": 51.744,
366
+ "eval_steps_per_second": 0.819,
367
+ "step": 7000
368
+ },
369
+ {
370
+ "epoch": 2.304,
371
+ "grad_norm": 0.05511431023478508,
372
+ "learning_rate": 0.0002592229420411184,
373
+ "loss": 0.1543,
374
+ "step": 7200
375
+ },
376
+ {
377
+ "epoch": 2.368,
378
+ "grad_norm": 0.07401052117347717,
379
+ "learning_rate": 0.00021547742877964395,
380
+ "loss": 0.1567,
381
+ "step": 7400
382
+ },
383
+ {
384
+ "epoch": 2.4,
385
+ "eval_loss": 0.1949290782213211,
386
+ "eval_runtime": 19.4974,
387
+ "eval_samples_per_second": 51.853,
388
+ "eval_steps_per_second": 0.821,
389
+ "step": 7500
390
+ },
391
+ {
392
+ "epoch": 2.432,
393
+ "grad_norm": 0.05771145224571228,
394
+ "learning_rate": 0.00017532617772202807,
395
+ "loss": 0.1557,
396
+ "step": 7600
397
+ },
398
+ {
399
+ "epoch": 2.496,
400
+ "grad_norm": 0.05626155808568001,
401
+ "learning_rate": 0.0001389531403942159,
402
+ "loss": 0.1559,
403
+ "step": 7800
404
+ },
405
+ {
406
+ "epoch": 2.56,
407
+ "grad_norm": 0.05681011080741882,
408
+ "learning_rate": 0.00010652495857058375,
409
+ "loss": 0.1536,
410
+ "step": 8000
411
+ },
412
+ {
413
+ "epoch": 2.56,
414
+ "eval_loss": 0.19341105222702026,
415
+ "eval_runtime": 33.6538,
416
+ "eval_samples_per_second": 30.041,
417
+ "eval_steps_per_second": 0.475,
418
+ "step": 8000
419
+ },
420
+ {
421
+ "epoch": 2.624,
422
+ "grad_norm": 0.05382503569126129,
423
+ "learning_rate": 7.81902008105898e-05,
424
+ "loss": 0.1516,
425
+ "step": 8200
426
+ },
427
+ {
428
+ "epoch": 2.6879999999999997,
429
+ "grad_norm": 0.06113835424184799,
430
+ "learning_rate": 5.407868179721254e-05,
431
+ "loss": 0.1535,
432
+ "step": 8400
433
+ },
434
+ {
435
+ "epoch": 2.7199999999999998,
436
+ "eval_loss": 0.1927892565727234,
437
+ "eval_runtime": 19.4973,
438
+ "eval_samples_per_second": 51.853,
439
+ "eval_steps_per_second": 0.821,
440
+ "step": 8500
441
+ },
442
+ {
443
+ "epoch": 2.752,
444
+ "grad_norm": 0.055167559534311295,
445
+ "learning_rate": 3.430086759560602e-05,
446
+ "loss": 0.1506,
447
+ "step": 8600
448
+ },
449
+ {
450
+ "epoch": 2.816,
451
+ "grad_norm": 0.06194595992565155,
452
+ "learning_rate": 1.8947369556759353e-05,
453
+ "loss": 0.1537,
454
+ "step": 8800
455
+ },
456
+ {
457
+ "epoch": 2.88,
458
+ "grad_norm": 0.0478057824075222,
459
+ "learning_rate": 8.088529184821281e-06,
460
+ "loss": 0.1522,
461
+ "step": 9000
462
+ },
463
+ {
464
+ "epoch": 2.88,
465
+ "eval_loss": 0.1926267296075821,
466
+ "eval_runtime": 25.1004,
467
+ "eval_samples_per_second": 40.278,
468
+ "eval_steps_per_second": 0.637,
469
+ "step": 9000
470
+ },
471
+ {
472
+ "epoch": 2.944,
473
+ "grad_norm": 0.06033524498343468,
474
+ "learning_rate": 1.774095870004544e-06,
475
+ "loss": 0.1493,
476
+ "step": 9200
477
+ },
478
+ {
479
+ "epoch": 3.0,
480
+ "step": 9375,
481
+ "total_flos": 6.097062592512e+18,
482
+ "train_loss": 0.19569317911783854,
483
+ "train_runtime": 9108.3132,
484
+ "train_samples_per_second": 32.937,
485
+ "train_steps_per_second": 1.029
486
+ }
487
+ ],
488
+ "logging_steps": 200,
489
+ "max_steps": 9375,
490
+ "num_input_tokens_seen": 0,
491
+ "num_train_epochs": 3,
492
+ "save_steps": 0,
493
+ "stateful_callbacks": {
494
+ "TrainerControl": {
495
+ "args": {
496
+ "should_epoch_stop": false,
497
+ "should_evaluate": false,
498
+ "should_log": false,
499
+ "should_save": true,
500
+ "should_training_stop": true
501
+ },
502
+ "attributes": {}
503
+ }
504
+ },
505
+ "total_flos": 6.097062592512e+18,
506
+ "train_batch_size": 32,
507
+ "trial_name": null,
508
+ "trial_params": null
509
+ }
nl_tasks/exp395/run_ex07/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": false,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "v_proj",
14
+ "q_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exp395/run_ex07/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[PAD]": 32000
3
+ }
nl_tasks/exp395/run_ex07/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
nl_tasks/exp395/run_ex07/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exp395/run_ex07/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exp395/run_ex07/ft/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "</s>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "legacy": false,
44
+ "model_max_length": 512,
45
+ "pad_token": "[PAD]",
46
+ "padding_side": "right",
47
+ "sp_model_kwargs": {},
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "</s>",
50
+ "use_default_system_prompt": false
51
+ }
nl_tasks/exp395/run_ex07/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": true,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "v_proj",
14
+ "q_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exp395/run_ex07/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfc792c0299154bc4d052cdf373ee5d1328a23a53804f4af7642a0da515cc656
3
+ size 33602915
nl_tasks/exp395/run_ex07/trainer_state.json ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 9375,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.064,
14
+ "grad_norm": 0.17820023000240326,
15
+ "learning_rate": 0.004998421105571525,
16
+ "loss": 0.4046,
17
+ "step": 200
18
+ },
19
+ {
20
+ "epoch": 0.128,
21
+ "grad_norm": 0.11274894326925278,
22
+ "learning_rate": 0.0049866882822314365,
23
+ "loss": 0.294,
24
+ "step": 400
25
+ },
26
+ {
27
+ "epoch": 0.16,
28
+ "eval_loss": 0.27918341755867004,
29
+ "eval_runtime": 28.1591,
30
+ "eval_samples_per_second": 35.903,
31
+ "eval_steps_per_second": 0.568,
32
+ "step": 500
33
+ },
34
+ {
35
+ "epoch": 0.192,
36
+ "grad_norm": 0.07643634080886841,
37
+ "learning_rate": 0.004963562785138573,
38
+ "loss": 0.2697,
39
+ "step": 600
40
+ },
41
+ {
42
+ "epoch": 0.256,
43
+ "grad_norm": 0.07283908873796463,
44
+ "learning_rate": 0.004929150562933794,
45
+ "loss": 0.2571,
46
+ "step": 800
47
+ },
48
+ {
49
+ "epoch": 0.32,
50
+ "grad_norm": 0.06430114805698395,
51
+ "learning_rate": 0.004883609273986864,
52
+ "loss": 0.2491,
53
+ "step": 1000
54
+ },
55
+ {
56
+ "epoch": 0.32,
57
+ "eval_loss": 0.2539050281047821,
58
+ "eval_runtime": 22.7763,
59
+ "eval_samples_per_second": 44.388,
60
+ "eval_steps_per_second": 0.702,
61
+ "step": 1000
62
+ },
63
+ {
64
+ "epoch": 0.384,
65
+ "grad_norm": 0.0629628598690033,
66
+ "learning_rate": 0.004827147564090247,
67
+ "loss": 0.2473,
68
+ "step": 1200
69
+ },
70
+ {
71
+ "epoch": 0.448,
72
+ "grad_norm": 0.05335867032408714,
73
+ "learning_rate": 0.0047600241105558395,
74
+ "loss": 0.2409,
75
+ "step": 1400
76
+ },
77
+ {
78
+ "epoch": 0.48,
79
+ "eval_loss": 0.2420750856399536,
80
+ "eval_runtime": 22.8179,
81
+ "eval_samples_per_second": 44.307,
82
+ "eval_steps_per_second": 0.701,
83
+ "step": 1500
84
+ },
85
+ {
86
+ "epoch": 0.512,
87
+ "grad_norm": 0.04761116951704025,
88
+ "learning_rate": 0.0046825464370940885,
89
+ "loss": 0.2349,
90
+ "step": 1600
91
+ },
92
+ {
93
+ "epoch": 0.576,
94
+ "grad_norm": 0.043634187430143356,
95
+ "learning_rate": 0.00459506950490507,
96
+ "loss": 0.2322,
97
+ "step": 1800
98
+ },
99
+ {
100
+ "epoch": 0.64,
101
+ "grad_norm": 0.04088456556200981,
102
+ "learning_rate": 0.004497994086436399,
103
+ "loss": 0.2302,
104
+ "step": 2000
105
+ },
106
+ {
107
+ "epoch": 0.64,
108
+ "eval_loss": 0.23234312236309052,
109
+ "eval_runtime": 22.7709,
110
+ "eval_samples_per_second": 44.399,
111
+ "eval_steps_per_second": 0.703,
112
+ "step": 2000
113
+ },
114
+ {
115
+ "epoch": 0.704,
116
+ "grad_norm": 0.03686574473977089,
117
+ "learning_rate": 0.004391764929258538,
118
+ "loss": 0.2264,
119
+ "step": 2200
120
+ },
121
+ {
122
+ "epoch": 0.768,
123
+ "grad_norm": 0.033889204263687134,
124
+ "learning_rate": 0.004276868718469662,
125
+ "loss": 0.2246,
126
+ "step": 2400
127
+ },
128
+ {
129
+ "epoch": 0.8,
130
+ "eval_loss": 0.22624295949935913,
131
+ "eval_runtime": 19.4841,
132
+ "eval_samples_per_second": 51.888,
133
+ "eval_steps_per_second": 0.821,
134
+ "step": 2500
135
+ },
136
+ {
137
+ "epoch": 0.832,
138
+ "grad_norm": 0.04181356728076935,
139
+ "learning_rate": 0.0041538318469652126,
140
+ "loss": 0.2222,
141
+ "step": 2600
142
+ },
143
+ {
144
+ "epoch": 0.896,
145
+ "grad_norm": 0.03593447804450989,
146
+ "learning_rate": 0.00402321800378761,
147
+ "loss": 0.2219,
148
+ "step": 2800
149
+ },
150
+ {
151
+ "epoch": 0.96,
152
+ "grad_norm": 0.031546611338853836,
153
+ "learning_rate": 0.0038856255916049965,
154
+ "loss": 0.2149,
155
+ "step": 3000
156
+ },
157
+ {
158
+ "epoch": 0.96,
159
+ "eval_loss": 0.22304701805114746,
160
+ "eval_runtime": 27.9954,
161
+ "eval_samples_per_second": 36.113,
162
+ "eval_steps_per_second": 0.572,
163
+ "step": 3000
164
+ },
165
+ {
166
+ "epoch": 1.024,
167
+ "grad_norm": 0.037926312536001205,
168
+ "learning_rate": 0.0037416849851507274,
169
+ "loss": 0.2094,
170
+ "step": 3200
171
+ },
172
+ {
173
+ "epoch": 1.088,
174
+ "grad_norm": 0.030403969809412956,
175
+ "learning_rate": 0.0035920556431840002,
176
+ "loss": 0.1939,
177
+ "step": 3400
178
+ },
179
+ {
180
+ "epoch": 1.12,
181
+ "eval_loss": 0.21854709088802338,
182
+ "eval_runtime": 19.4946,
183
+ "eval_samples_per_second": 51.86,
184
+ "eval_steps_per_second": 0.821,
185
+ "step": 3500
186
+ },
187
+ {
188
+ "epoch": 1.152,
189
+ "grad_norm": 0.03306721895933151,
190
+ "learning_rate": 0.0034374230872030466,
191
+ "loss": 0.1944,
192
+ "step": 3600
193
+ },
194
+ {
195
+ "epoch": 1.216,
196
+ "grad_norm": 0.03182404488325119,
197
+ "learning_rate": 0.0032784957607528164,
198
+ "loss": 0.1948,
199
+ "step": 3800
200
+ },
201
+ {
202
+ "epoch": 1.28,
203
+ "grad_norm": 0.025607936084270477,
204
+ "learning_rate": 0.003116001783716121,
205
+ "loss": 0.1929,
206
+ "step": 4000
207
+ },
208
+ {
209
+ "epoch": 1.28,
210
+ "eval_loss": 0.2149907499551773,
211
+ "eval_runtime": 25.792,
212
+ "eval_samples_per_second": 39.198,
213
+ "eval_steps_per_second": 0.62,
214
+ "step": 4000
215
+ },
216
+ {
217
+ "epoch": 1.3439999999999999,
218
+ "grad_norm": 0.028963735327124596,
219
+ "learning_rate": 0.0029506856164583377,
220
+ "loss": 0.1944,
221
+ "step": 4200
222
+ },
223
+ {
224
+ "epoch": 1.408,
225
+ "grad_norm": 0.03618050366640091,
226
+ "learning_rate": 0.0027833046491087766,
227
+ "loss": 0.1928,
228
+ "step": 4400
229
+ },
230
+ {
231
+ "epoch": 1.44,
232
+ "eval_loss": 0.21210040152072906,
233
+ "eval_runtime": 19.4863,
234
+ "eval_samples_per_second": 51.882,
235
+ "eval_steps_per_second": 0.821,
236
+ "step": 4500
237
+ },
238
+ {
239
+ "epoch": 1.472,
240
+ "grad_norm": 0.03128841519355774,
241
+ "learning_rate": 0.002614625731604784,
242
+ "loss": 0.1935,
243
+ "step": 4600
244
+ },
245
+ {
246
+ "epoch": 1.536,
247
+ "grad_norm": 0.026794707402586937,
248
+ "learning_rate": 0.002445421660396092,
249
+ "loss": 0.1905,
250
+ "step": 4800
251
+ },
252
+ {
253
+ "epoch": 1.6,
254
+ "grad_norm": 0.028422104194760323,
255
+ "learning_rate": 0.0022764676379054163,
256
+ "loss": 0.1878,
257
+ "step": 5000
258
+ },
259
+ {
260
+ "epoch": 1.6,
261
+ "eval_loss": 0.20722173154354095,
262
+ "eval_runtime": 19.4745,
263
+ "eval_samples_per_second": 51.914,
264
+ "eval_steps_per_second": 0.822,
265
+ "step": 5000
266
+ },
267
+ {
268
+ "epoch": 1.6640000000000001,
269
+ "grad_norm": 0.02493426389992237,
270
+ "learning_rate": 0.0021085377209661784,
271
+ "loss": 0.1904,
272
+ "step": 5200
273
+ },
274
+ {
275
+ "epoch": 1.728,
276
+ "grad_norm": 0.026702167466282845,
277
+ "learning_rate": 0.0019424012745087127,
278
+ "loss": 0.19,
279
+ "step": 5400
280
+ },
281
+ {
282
+ "epoch": 1.76,
283
+ "eval_loss": 0.20289908349514008,
284
+ "eval_runtime": 26.7848,
285
+ "eval_samples_per_second": 37.745,
286
+ "eval_steps_per_second": 0.597,
287
+ "step": 5500
288
+ },
289
+ {
290
+ "epoch": 1.792,
291
+ "grad_norm": 0.026274586096405983,
292
+ "learning_rate": 0.0017788194467422502,
293
+ "loss": 0.187,
294
+ "step": 5600
295
+ },
296
+ {
297
+ "epoch": 1.8559999999999999,
298
+ "grad_norm": 0.02791573852300644,
299
+ "learning_rate": 0.001618541681981521,
300
+ "loss": 0.1837,
301
+ "step": 5800
302
+ },
303
+ {
304
+ "epoch": 1.92,
305
+ "grad_norm": 0.024751191958785057,
306
+ "learning_rate": 0.0014623022870943348,
307
+ "loss": 0.1841,
308
+ "step": 6000
309
+ },
310
+ {
311
+ "epoch": 1.92,
312
+ "eval_loss": 0.19980530440807343,
313
+ "eval_runtime": 19.4905,
314
+ "eval_samples_per_second": 51.871,
315
+ "eval_steps_per_second": 0.821,
316
+ "step": 6000
317
+ },
318
+ {
319
+ "epoch": 1.984,
320
+ "grad_norm": 0.027469471096992493,
321
+ "learning_rate": 0.0013108170673008306,
322
+ "loss": 0.1838,
323
+ "step": 6200
324
+ },
325
+ {
326
+ "epoch": 2.048,
327
+ "grad_norm": 0.028063790872693062,
328
+ "learning_rate": 0.0011647800467373699,
329
+ "loss": 0.1639,
330
+ "step": 6400
331
+ },
332
+ {
333
+ "epoch": 2.08,
334
+ "eval_loss": 0.19846861064434052,
335
+ "eval_runtime": 28.4577,
336
+ "eval_samples_per_second": 35.526,
337
+ "eval_steps_per_second": 0.562,
338
+ "step": 6500
339
+ },
340
+ {
341
+ "epoch": 2.112,
342
+ "grad_norm": 0.02692621760070324,
343
+ "learning_rate": 0.001024860288809687,
344
+ "loss": 0.1568,
345
+ "step": 6600
346
+ },
347
+ {
348
+ "epoch": 2.176,
349
+ "grad_norm": 0.02786995656788349,
350
+ "learning_rate": 0.0008916988309027338,
351
+ "loss": 0.1571,
352
+ "step": 6800
353
+ },
354
+ {
355
+ "epoch": 2.24,
356
+ "grad_norm": 0.02808111160993576,
357
+ "learning_rate": 0.0007659057474907505,
358
+ "loss": 0.155,
359
+ "step": 7000
360
+ },
361
+ {
362
+ "epoch": 2.24,
363
+ "eval_loss": 0.1971956193447113,
364
+ "eval_runtime": 19.5072,
365
+ "eval_samples_per_second": 51.827,
366
+ "eval_steps_per_second": 0.82,
367
+ "step": 7000
368
+ },
369
+ {
370
+ "epoch": 2.304,
371
+ "grad_norm": 0.02999687008559704,
372
+ "learning_rate": 0.000648057355102796,
373
+ "loss": 0.1566,
374
+ "step": 7200
375
+ },
376
+ {
377
+ "epoch": 2.368,
378
+ "grad_norm": 0.027220070362091064,
379
+ "learning_rate": 0.0005386935719491098,
380
+ "loss": 0.1587,
381
+ "step": 7400
382
+ },
383
+ {
384
+ "epoch": 2.4,
385
+ "eval_loss": 0.19491882622241974,
386
+ "eval_runtime": 26.4649,
387
+ "eval_samples_per_second": 38.202,
388
+ "eval_steps_per_second": 0.605,
389
+ "step": 7500
390
+ },
391
+ {
392
+ "epoch": 2.432,
393
+ "grad_norm": 0.030793363228440285,
394
+ "learning_rate": 0.0004383154443050702,
395
+ "loss": 0.1573,
396
+ "step": 7600
397
+ },
398
+ {
399
+ "epoch": 2.496,
400
+ "grad_norm": 0.03137922286987305,
401
+ "learning_rate": 0.00034738285098553977,
402
+ "loss": 0.1573,
403
+ "step": 7800
404
+ },
405
+ {
406
+ "epoch": 2.56,
407
+ "grad_norm": 0.0298843365162611,
408
+ "learning_rate": 0.00026631239642645936,
409
+ "loss": 0.155,
410
+ "step": 8000
411
+ },
412
+ {
413
+ "epoch": 2.56,
414
+ "eval_loss": 0.1927516758441925,
415
+ "eval_runtime": 26.2373,
416
+ "eval_samples_per_second": 38.533,
417
+ "eval_steps_per_second": 0.61,
418
+ "step": 8000
419
+ },
420
+ {
421
+ "epoch": 2.624,
422
+ "grad_norm": 0.0275055430829525,
423
+ "learning_rate": 0.0001954755020264745,
424
+ "loss": 0.1531,
425
+ "step": 8200
426
+ },
427
+ {
428
+ "epoch": 2.6879999999999997,
429
+ "grad_norm": 0.02978612296283245,
430
+ "learning_rate": 0.00013519670449303135,
431
+ "loss": 0.1549,
432
+ "step": 8400
433
+ },
434
+ {
435
+ "epoch": 2.7199999999999998,
436
+ "eval_loss": 0.19178353250026703,
437
+ "eval_runtime": 26.5213,
438
+ "eval_samples_per_second": 38.12,
439
+ "eval_steps_per_second": 0.603,
440
+ "step": 8500
441
+ },
442
+ {
443
+ "epoch": 2.752,
444
+ "grad_norm": 0.028374744579195976,
445
+ "learning_rate": 8.575216898901506e-05,
446
+ "loss": 0.1517,
447
+ "step": 8600
448
+ },
449
+ {
450
+ "epoch": 2.816,
451
+ "grad_norm": 0.03000253438949585,
452
+ "learning_rate": 4.736842389189838e-05,
453
+ "loss": 0.1544,
454
+ "step": 8800
455
+ },
456
+ {
457
+ "epoch": 2.88,
458
+ "grad_norm": 0.025454407557845116,
459
+ "learning_rate": 2.02213229620532e-05,
460
+ "loss": 0.1533,
461
+ "step": 9000
462
+ },
463
+ {
464
+ "epoch": 2.88,
465
+ "eval_loss": 0.19136634469032288,
466
+ "eval_runtime": 19.5667,
467
+ "eval_samples_per_second": 51.67,
468
+ "eval_steps_per_second": 0.818,
469
+ "step": 9000
470
+ },
471
+ {
472
+ "epoch": 2.944,
473
+ "grad_norm": 0.030465198680758476,
474
+ "learning_rate": 4.43523967501136e-06,
475
+ "loss": 0.1502,
476
+ "step": 9200
477
+ },
478
+ {
479
+ "epoch": 3.0,
480
+ "step": 9375,
481
+ "total_flos": 6.097062592512e+18,
482
+ "train_loss": 0.1983985675048828,
483
+ "train_runtime": 10216.7969,
484
+ "train_samples_per_second": 29.363,
485
+ "train_steps_per_second": 0.918
486
+ }
487
+ ],
488
+ "logging_steps": 200,
489
+ "max_steps": 9375,
490
+ "num_input_tokens_seen": 0,
491
+ "num_train_epochs": 3,
492
+ "save_steps": 0,
493
+ "stateful_callbacks": {
494
+ "TrainerControl": {
495
+ "args": {
496
+ "should_epoch_stop": false,
497
+ "should_evaluate": false,
498
+ "should_log": false,
499
+ "should_save": true,
500
+ "should_training_stop": true
501
+ },
502
+ "attributes": {}
503
+ }
504
+ },
505
+ "total_flos": 6.097062592512e+18,
506
+ "train_batch_size": 32,
507
+ "trial_name": null,
508
+ "trial_params": null
509
+ }
nl_tasks/exp395/run_ex08/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": false,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "v_proj",
14
+ "q_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exp395/run_ex08/ft/added_tokens.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "[PAD]": 32000
3
+ }
nl_tasks/exp395/run_ex08/ft/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "</s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "</s>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "</s>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
nl_tasks/exp395/run_ex08/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exp395/run_ex08/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
3
+ size 499723
nl_tasks/exp395/run_ex08/ft/tokenizer_config.json ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": true,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": null,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<unk>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<s>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "</s>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ },
30
+ "32000": {
31
+ "content": "[PAD]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false,
36
+ "special": true
37
+ }
38
+ },
39
+ "bos_token": "</s>",
40
+ "clean_up_tokenization_spaces": false,
41
+ "eos_token": "</s>",
42
+ "extra_special_tokens": {},
43
+ "legacy": false,
44
+ "model_max_length": 512,
45
+ "pad_token": "[PAD]",
46
+ "padding_side": "right",
47
+ "sp_model_kwargs": {},
48
+ "tokenizer_class": "LlamaTokenizer",
49
+ "unk_token": "</s>",
50
+ "use_default_system_prompt": false
51
+ }
nl_tasks/exp395/run_ex08/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "T": 1.0,
3
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
4
+ "bias": "none",
5
+ "inference_mode": true,
6
+ "layers_to_transform": null,
7
+ "modules_to_save": null,
8
+ "num_rotations": 1,
9
+ "peft_type": "ROTATION",
10
+ "r": 16,
11
+ "revision": null,
12
+ "target_modules": [
13
+ "v_proj",
14
+ "q_proj"
15
+ ],
16
+ "target_modules_to_skip": null,
17
+ "task_type": "CAUSAL_LM"
18
+ }
nl_tasks/exp395/run_ex08/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:343a959084dc5ab62addd82639a1a2f247ced9be6fcdfe7507654726d199f794
3
+ size 33602915
nl_tasks/exp395/run_ex08/trainer_state.json ADDED
@@ -0,0 +1,509 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "best_global_step": null,
3
+ "best_metric": null,
4
+ "best_model_checkpoint": null,
5
+ "epoch": 3.0,
6
+ "eval_steps": 500,
7
+ "global_step": 9375,
8
+ "is_hyper_param_search": false,
9
+ "is_local_process_zero": true,
10
+ "is_world_process_zero": true,
11
+ "log_history": [
12
+ {
13
+ "epoch": 0.064,
14
+ "grad_norm": 0.3168371915817261,
15
+ "learning_rate": 9.99684221114305e-05,
16
+ "loss": 0.4089,
17
+ "step": 200
18
+ },
19
+ {
20
+ "epoch": 0.128,
21
+ "grad_norm": 0.36081254482269287,
22
+ "learning_rate": 9.973376564462874e-05,
23
+ "loss": 0.3014,
24
+ "step": 400
25
+ },
26
+ {
27
+ "epoch": 0.16,
28
+ "eval_loss": 0.2922876477241516,
29
+ "eval_runtime": 19.7057,
30
+ "eval_samples_per_second": 51.305,
31
+ "eval_steps_per_second": 0.812,
32
+ "step": 500
33
+ },
34
+ {
35
+ "epoch": 0.192,
36
+ "grad_norm": 0.3251103460788727,
37
+ "learning_rate": 9.927125570277145e-05,
38
+ "loss": 0.282,
39
+ "step": 600
40
+ },
41
+ {
42
+ "epoch": 0.256,
43
+ "grad_norm": 0.3563052713871002,
44
+ "learning_rate": 9.858301125867588e-05,
45
+ "loss": 0.2695,
46
+ "step": 800
47
+ },
48
+ {
49
+ "epoch": 0.32,
50
+ "grad_norm": 0.3129023015499115,
51
+ "learning_rate": 9.767218547973729e-05,
52
+ "loss": 0.2624,
53
+ "step": 1000
54
+ },
55
+ {
56
+ "epoch": 0.32,
57
+ "eval_loss": 0.26842427253723145,
58
+ "eval_runtime": 19.4871,
59
+ "eval_samples_per_second": 51.881,
60
+ "eval_steps_per_second": 0.821,
61
+ "step": 1000
62
+ },
63
+ {
64
+ "epoch": 0.384,
65
+ "grad_norm": 0.34633564949035645,
66
+ "learning_rate": 9.654295128180494e-05,
67
+ "loss": 0.2603,
68
+ "step": 1200
69
+ },
70
+ {
71
+ "epoch": 0.448,
72
+ "grad_norm": 0.32980990409851074,
73
+ "learning_rate": 9.520048221111679e-05,
74
+ "loss": 0.2544,
75
+ "step": 1400
76
+ },
77
+ {
78
+ "epoch": 0.48,
79
+ "eval_loss": 0.2564426064491272,
80
+ "eval_runtime": 19.5049,
81
+ "eval_samples_per_second": 51.833,
82
+ "eval_steps_per_second": 0.82,
83
+ "step": 1500
84
+ },
85
+ {
86
+ "epoch": 0.512,
87
+ "grad_norm": 0.32643768191337585,
88
+ "learning_rate": 9.365092874188177e-05,
89
+ "loss": 0.2489,
90
+ "step": 1600
91
+ },
92
+ {
93
+ "epoch": 0.576,
94
+ "grad_norm": 0.31819966435432434,
95
+ "learning_rate": 9.190139009810142e-05,
96
+ "loss": 0.2444,
97
+ "step": 1800
98
+ },
99
+ {
100
+ "epoch": 0.64,
101
+ "grad_norm": 0.3185544013977051,
102
+ "learning_rate": 8.995988172872798e-05,
103
+ "loss": 0.2429,
104
+ "step": 2000
105
+ },
106
+ {
107
+ "epoch": 0.64,
108
+ "eval_loss": 0.2463328242301941,
109
+ "eval_runtime": 19.5247,
110
+ "eval_samples_per_second": 51.781,
111
+ "eval_steps_per_second": 0.819,
112
+ "step": 2000
113
+ },
114
+ {
115
+ "epoch": 0.704,
116
+ "grad_norm": 0.30173778533935547,
117
+ "learning_rate": 8.783529858517076e-05,
118
+ "loss": 0.2388,
119
+ "step": 2200
120
+ },
121
+ {
122
+ "epoch": 0.768,
123
+ "grad_norm": 0.2898229658603668,
124
+ "learning_rate": 8.553737436939324e-05,
125
+ "loss": 0.2366,
126
+ "step": 2400
127
+ },
128
+ {
129
+ "epoch": 0.8,
130
+ "eval_loss": 0.24057233333587646,
131
+ "eval_runtime": 19.4831,
132
+ "eval_samples_per_second": 51.891,
133
+ "eval_steps_per_second": 0.821,
134
+ "step": 2500
135
+ },
136
+ {
137
+ "epoch": 0.832,
138
+ "grad_norm": 0.34637656807899475,
139
+ "learning_rate": 8.307663693930425e-05,
140
+ "loss": 0.234,
141
+ "step": 2600
142
+ },
143
+ {
144
+ "epoch": 0.896,
145
+ "grad_norm": 0.3339170217514038,
146
+ "learning_rate": 8.046436007575221e-05,
147
+ "loss": 0.2336,
148
+ "step": 2800
149
+ },
150
+ {
151
+ "epoch": 0.96,
152
+ "grad_norm": 0.2930004596710205,
153
+ "learning_rate": 7.771251183209993e-05,
154
+ "loss": 0.2265,
155
+ "step": 3000
156
+ },
157
+ {
158
+ "epoch": 0.96,
159
+ "eval_loss": 0.23496957123279572,
160
+ "eval_runtime": 19.4688,
161
+ "eval_samples_per_second": 51.929,
162
+ "eval_steps_per_second": 0.822,
163
+ "step": 3000
164
+ },
165
+ {
166
+ "epoch": 1.024,
167
+ "grad_norm": 0.3596031665802002,
168
+ "learning_rate": 7.483369970301454e-05,
169
+ "loss": 0.2246,
170
+ "step": 3200
171
+ },
172
+ {
173
+ "epoch": 1.088,
174
+ "grad_norm": 0.32522472739219666,
175
+ "learning_rate": 7.184111286368001e-05,
176
+ "loss": 0.2124,
177
+ "step": 3400
178
+ },
179
+ {
180
+ "epoch": 1.12,
181
+ "eval_loss": 0.23045767843723297,
182
+ "eval_runtime": 19.488,
183
+ "eval_samples_per_second": 51.878,
184
+ "eval_steps_per_second": 0.821,
185
+ "step": 3500
186
+ },
187
+ {
188
+ "epoch": 1.152,
189
+ "grad_norm": 0.331019788980484,
190
+ "learning_rate": 6.874846174406093e-05,
191
+ "loss": 0.2122,
192
+ "step": 3600
193
+ },
194
+ {
195
+ "epoch": 1.216,
196
+ "grad_norm": 0.3516997694969177,
197
+ "learning_rate": 6.556991521505634e-05,
198
+ "loss": 0.2112,
199
+ "step": 3800
200
+ },
201
+ {
202
+ "epoch": 1.28,
203
+ "grad_norm": 0.31302034854888916,
204
+ "learning_rate": 6.232003567432242e-05,
205
+ "loss": 0.2077,
206
+ "step": 4000
207
+ },
208
+ {
209
+ "epoch": 1.28,
210
+ "eval_loss": 0.22687973082065582,
211
+ "eval_runtime": 19.5334,
212
+ "eval_samples_per_second": 51.758,
213
+ "eval_steps_per_second": 0.819,
214
+ "step": 4000
215
+ },
216
+ {
217
+ "epoch": 1.3439999999999999,
218
+ "grad_norm": 0.333069771528244,
219
+ "learning_rate": 5.901371232916676e-05,
220
+ "loss": 0.2102,
221
+ "step": 4200
222
+ },
223
+ {
224
+ "epoch": 1.408,
225
+ "grad_norm": 0.4020955562591553,
226
+ "learning_rate": 5.566609298217553e-05,
227
+ "loss": 0.208,
228
+ "step": 4400
229
+ },
230
+ {
231
+ "epoch": 1.44,
232
+ "eval_loss": 0.22374407947063446,
233
+ "eval_runtime": 19.772,
234
+ "eval_samples_per_second": 51.133,
235
+ "eval_steps_per_second": 0.809,
236
+ "step": 4500
237
+ },
238
+ {
239
+ "epoch": 1.472,
240
+ "grad_norm": 0.3483867645263672,
241
+ "learning_rate": 5.229251463209568e-05,
242
+ "loss": 0.2087,
243
+ "step": 4600
244
+ },
245
+ {
246
+ "epoch": 1.536,
247
+ "grad_norm": 0.3387017250061035,
248
+ "learning_rate": 4.8908433207921846e-05,
249
+ "loss": 0.2056,
250
+ "step": 4800
251
+ },
252
+ {
253
+ "epoch": 1.6,
254
+ "grad_norm": 0.3310563266277313,
255
+ "learning_rate": 4.5529352758108325e-05,
256
+ "loss": 0.2032,
257
+ "step": 5000
258
+ },
259
+ {
260
+ "epoch": 1.6,
261
+ "eval_loss": 0.22034206986427307,
262
+ "eval_runtime": 19.5875,
263
+ "eval_samples_per_second": 51.615,
264
+ "eval_steps_per_second": 0.817,
265
+ "step": 5000
266
+ },
267
+ {
268
+ "epoch": 1.6640000000000001,
269
+ "grad_norm": 0.3283143937587738,
270
+ "learning_rate": 4.2170754419323566e-05,
271
+ "loss": 0.2065,
272
+ "step": 5200
273
+ },
274
+ {
275
+ "epoch": 1.728,
276
+ "grad_norm": 0.32001566886901855,
277
+ "learning_rate": 3.884802549017425e-05,
278
+ "loss": 0.206,
279
+ "step": 5400
280
+ },
281
+ {
282
+ "epoch": 1.76,
283
+ "eval_loss": 0.21794956922531128,
284
+ "eval_runtime": 19.574,
285
+ "eval_samples_per_second": 51.65,
286
+ "eval_steps_per_second": 0.817,
287
+ "step": 5500
288
+ },
289
+ {
290
+ "epoch": 1.792,
291
+ "grad_norm": 0.3461301624774933,
292
+ "learning_rate": 3.557638893484501e-05,
293
+ "loss": 0.2039,
294
+ "step": 5600
295
+ },
296
+ {
297
+ "epoch": 1.8559999999999999,
298
+ "grad_norm": 0.358914315700531,
299
+ "learning_rate": 3.237083363963042e-05,
300
+ "loss": 0.2007,
301
+ "step": 5800
302
+ },
303
+ {
304
+ "epoch": 1.92,
305
+ "grad_norm": 0.3654290437698364,
306
+ "learning_rate": 2.9246045741886696e-05,
307
+ "loss": 0.2017,
308
+ "step": 6000
309
+ },
310
+ {
311
+ "epoch": 1.92,
312
+ "eval_loss": 0.21639443933963776,
313
+ "eval_runtime": 19.5656,
314
+ "eval_samples_per_second": 51.672,
315
+ "eval_steps_per_second": 0.818,
316
+ "step": 6000
317
+ },
318
+ {
319
+ "epoch": 1.984,
320
+ "grad_norm": 0.3528446853160858,
321
+ "learning_rate": 2.6216341346016615e-05,
322
+ "loss": 0.2018,
323
+ "step": 6200
324
+ },
325
+ {
326
+ "epoch": 2.048,
327
+ "grad_norm": 0.36889269948005676,
328
+ "learning_rate": 2.3295600934747398e-05,
329
+ "loss": 0.1913,
330
+ "step": 6400
331
+ },
332
+ {
333
+ "epoch": 2.08,
334
+ "eval_loss": 0.21491438150405884,
335
+ "eval_runtime": 19.5386,
336
+ "eval_samples_per_second": 51.744,
337
+ "eval_steps_per_second": 0.819,
338
+ "step": 6500
339
+ },
340
+ {
341
+ "epoch": 2.112,
342
+ "grad_norm": 0.35566315054893494,
343
+ "learning_rate": 2.049720577619374e-05,
344
+ "loss": 0.1873,
345
+ "step": 6600
346
+ },
347
+ {
348
+ "epoch": 2.176,
349
+ "grad_norm": 0.3847609758377075,
350
+ "learning_rate": 1.7833976618054676e-05,
351
+ "loss": 0.1876,
352
+ "step": 6800
353
+ },
354
+ {
355
+ "epoch": 2.24,
356
+ "grad_norm": 0.3727908730506897,
357
+ "learning_rate": 1.531811494981501e-05,
358
+ "loss": 0.185,
359
+ "step": 7000
360
+ },
361
+ {
362
+ "epoch": 2.24,
363
+ "eval_loss": 0.21415850520133972,
364
+ "eval_runtime": 19.5534,
365
+ "eval_samples_per_second": 51.705,
366
+ "eval_steps_per_second": 0.818,
367
+ "step": 7000
368
+ },
369
+ {
370
+ "epoch": 2.304,
371
+ "grad_norm": 0.3717604875564575,
372
+ "learning_rate": 1.296114710205592e-05,
373
+ "loss": 0.1874,
374
+ "step": 7200
375
+ },
376
+ {
377
+ "epoch": 2.368,
378
+ "grad_norm": 0.35382431745529175,
379
+ "learning_rate": 1.0773871438982197e-05,
380
+ "loss": 0.1906,
381
+ "step": 7400
382
+ },
383
+ {
384
+ "epoch": 2.4,
385
+ "eval_loss": 0.2131689041852951,
386
+ "eval_runtime": 19.5504,
387
+ "eval_samples_per_second": 51.712,
388
+ "eval_steps_per_second": 0.818,
389
+ "step": 7500
390
+ },
391
+ {
392
+ "epoch": 2.432,
393
+ "grad_norm": 0.39091038703918457,
394
+ "learning_rate": 8.766308886101405e-06,
395
+ "loss": 0.1895,
396
+ "step": 7600
397
+ },
398
+ {
399
+ "epoch": 2.496,
400
+ "grad_norm": 0.39373019337654114,
401
+ "learning_rate": 6.947657019710796e-06,
402
+ "loss": 0.1895,
403
+ "step": 7800
404
+ },
405
+ {
406
+ "epoch": 2.56,
407
+ "grad_norm": 0.3592659533023834,
408
+ "learning_rate": 5.326247928529187e-06,
409
+ "loss": 0.1878,
410
+ "step": 8000
411
+ },
412
+ {
413
+ "epoch": 2.56,
414
+ "eval_loss": 0.21238671243190765,
415
+ "eval_runtime": 19.5697,
416
+ "eval_samples_per_second": 51.662,
417
+ "eval_steps_per_second": 0.818,
418
+ "step": 8000
419
+ },
420
+ {
421
+ "epoch": 2.624,
422
+ "grad_norm": 0.36308494210243225,
423
+ "learning_rate": 3.90951004052949e-06,
424
+ "loss": 0.1859,
425
+ "step": 8200
426
+ },
427
+ {
428
+ "epoch": 2.6879999999999997,
429
+ "grad_norm": 0.3989940583705902,
430
+ "learning_rate": 2.703934089860627e-06,
431
+ "loss": 0.1878,
432
+ "step": 8400
433
+ },
434
+ {
435
+ "epoch": 2.7199999999999998,
436
+ "eval_loss": 0.2119983732700348,
437
+ "eval_runtime": 19.585,
438
+ "eval_samples_per_second": 51.621,
439
+ "eval_steps_per_second": 0.817,
440
+ "step": 8500
441
+ },
442
+ {
443
+ "epoch": 2.752,
444
+ "grad_norm": 0.3984169661998749,
445
+ "learning_rate": 1.7150433797803012e-06,
446
+ "loss": 0.1847,
447
+ "step": 8600
448
+ },
449
+ {
450
+ "epoch": 2.816,
451
+ "grad_norm": 0.4087405204772949,
452
+ "learning_rate": 9.473684778379677e-07,
453
+ "loss": 0.1881,
454
+ "step": 8800
455
+ },
456
+ {
457
+ "epoch": 2.88,
458
+ "grad_norm": 0.35457876324653625,
459
+ "learning_rate": 4.0442645924106404e-07,
460
+ "loss": 0.1872,
461
+ "step": 9000
462
+ },
463
+ {
464
+ "epoch": 2.88,
465
+ "eval_loss": 0.21190518140792847,
466
+ "eval_runtime": 19.5621,
467
+ "eval_samples_per_second": 51.682,
468
+ "eval_steps_per_second": 0.818,
469
+ "step": 9000
470
+ },
471
+ {
472
+ "epoch": 2.944,
473
+ "grad_norm": 0.3945753574371338,
474
+ "learning_rate": 8.87047935002272e-08,
475
+ "loss": 0.1838,
476
+ "step": 9200
477
+ },
478
+ {
479
+ "epoch": 3.0,
480
+ "step": 9375,
481
+ "total_flos": 6.097062592512e+18,
482
+ "train_loss": 0.21853865112304688,
483
+ "train_runtime": 8852.7007,
484
+ "train_samples_per_second": 33.888,
485
+ "train_steps_per_second": 1.059
486
+ }
487
+ ],
488
+ "logging_steps": 200,
489
+ "max_steps": 9375,
490
+ "num_input_tokens_seen": 0,
491
+ "num_train_epochs": 3,
492
+ "save_steps": 0,
493
+ "stateful_callbacks": {
494
+ "TrainerControl": {
495
+ "args": {
496
+ "should_epoch_stop": false,
497
+ "should_evaluate": false,
498
+ "should_log": false,
499
+ "should_save": true,
500
+ "should_training_stop": true
501
+ },
502
+ "attributes": {}
503
+ }
504
+ },
505
+ "total_flos": 6.097062592512e+18,
506
+ "train_batch_size": 32,
507
+ "trial_name": null,
508
+ "trial_params": null
509
+ }