nvan15 committed (verified)
Commit 16e46c5 · 1 Parent(s): 9cf4652

Batch upload part 18

Files changed (50)
  1. nl_tasks/exp100/run_ex08/ft/adapter_config.json +18 -0
  2. nl_tasks/exp100/run_ex08/ft/tokenizer.json +0 -0
  3. nl_tasks/exp100/run_ex08/ft/tokenizer.model +3 -0
  4. nl_tasks/exp100/run_ex08/ft2/adapter_config.json +18 -0
  5. nl_tasks/exp100/run_ex08/ft2/adapter_model.bin +3 -0
  6. nl_tasks/exp100/run_ex09/ft/adapter_config.json +18 -0
  7. nl_tasks/exp100/run_ex09/ft/special_tokens_map.json +24 -0
  8. nl_tasks/exp100/run_ex09/ft/tokenizer.json +0 -0
  9. nl_tasks/exp100/run_ex09/ft/tokenizer.model +3 -0
  10. nl_tasks/exp100/run_ex09/ft/tokenizer_config.json +43 -0
  11. nl_tasks/exp100/run_ex09/ft2/adapter_config.json +18 -0
  12. nl_tasks/exp100/run_ex09/ft2/adapter_model.bin +3 -0
  13. nl_tasks/exp100/run_ex09/trainer_state.json +260 -0
  14. nl_tasks/exp100/run_ex10/ft/adapter_config.json +18 -0
  15. nl_tasks/exp100/run_ex10/ft/special_tokens_map.json +24 -0
  16. nl_tasks/exp100/run_ex10/ft/tokenizer.json +0 -0
  17. nl_tasks/exp100/run_ex10/ft/tokenizer.model +3 -0
  18. nl_tasks/exp100/run_ex10/ft/tokenizer_config.json +43 -0
  19. nl_tasks/exp100/run_ex10/ft2/adapter_config.json +18 -0
  20. nl_tasks/exp100/run_ex10/ft2/adapter_model.bin +3 -0
  21. nl_tasks/exp100/run_ex10/trainer_state.json +183 -0
  22. nl_tasks/exp100/run_ex11/ft/adapter_config.json +18 -0
  23. nl_tasks/exp100/run_ex11/ft/special_tokens_map.json +24 -0
  24. nl_tasks/exp100/run_ex11/ft/tokenizer.json +0 -0
  25. nl_tasks/exp100/run_ex11/ft/tokenizer.model +3 -0
  26. nl_tasks/exp100/run_ex11/ft/tokenizer_config.json +43 -0
  27. nl_tasks/exp100/run_ex11/ft2/adapter_config.json +18 -0
  28. nl_tasks/exp100/run_ex11/ft2/adapter_model.bin +3 -0
  29. nl_tasks/exp100/run_ex11/trainer_state.json +183 -0
  30. nl_tasks/exp100/run_ex12/ft/adapter_config.json +18 -0
  31. nl_tasks/exp100/run_ex12/ft/special_tokens_map.json +24 -0
  32. nl_tasks/exp100/run_ex12/ft/tokenizer.json +0 -0
  33. nl_tasks/exp100/run_ex12/ft/tokenizer.model +3 -0
  34. nl_tasks/exp100/run_ex12/ft/tokenizer_config.json +43 -0
  35. nl_tasks/exp100/run_ex12/ft2/adapter_config.json +18 -0
  36. nl_tasks/exp100/run_ex12/ft2/adapter_model.bin +3 -0
  37. nl_tasks/exp100/run_ex12/trainer_state.json +260 -0
  38. nl_tasks/expsBOFT/seed43/trainer_state.json +218 -0
  39. nl_tasks/expsBOFT/seed44/ft/special_tokens_map.json +24 -0
  40. nl_tasks/expsBOFT/seed44/ft/tokenizer.json +0 -0
  41. nl_tasks/expsBOFT/seed44/ft/tokenizer.model +3 -0
  42. nl_tasks/expsBOFT/seed44/ft/tokenizer_config.json +43 -0
  43. nl_tasks/expsBOFT/seed44/ft2/README.md +205 -0
  44. nl_tasks/expsBOFT/seed44/ft2/adapter_config.json +27 -0
  45. nl_tasks/expsBOFT/seed44/ft2/adapter_model.safetensors +3 -0
  46. nl_tasks/expsBOFT/seed44/trainer_state.json +218 -0
  47. nl_tasks/inference/MATH_infer.py +132 -0
  48. nl_tasks/inference/grader.py +141 -0
  49. nl_tasks/inference/gsm8k_infer.py +157 -0
  50. nl_tasks/inference/util.py +253 -0
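
For orientation: each `ft`/`ft2` directory below holds a PEFT adapter (an `adapter_config.json` plus weights) trained on top of `meta-llama/Llama-2-7b-hf`, together with tokenizer files and `trainer_state.json` training logs. A minimal loading sketch follows, using the standard PEFT API; note that the `exp100` adapters declare the custom `"peft_type": "ROTATION"`, which is not in upstream PEFT, so loading those would additionally require the fork that defines it. The path used is one example directory from this upload.

```python
import torch
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

base_id = "meta-llama/Llama-2-7b-hf"
base = AutoModelForCausalLM.from_pretrained(base_id, torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(base_id)

# The BOFT adapter uses a peft_type that upstream PEFT understands;
# the ROTATION adapters would need the custom PEFT fork installed.
model = PeftModel.from_pretrained(base, "nl_tasks/expsBOFT/seed44/ft2")
model.eval()
```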
nl_tasks/exp100/run_ex08/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": false,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 1,
+ "peft_type": "ROTATION",
+ "r": 16,
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/exp100/run_ex08/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exp100/run_ex08/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
nl_tasks/exp100/run_ex08/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": true,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 1,
+ "peft_type": "ROTATION",
+ "r": 16,
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/exp100/run_ex08/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:bc7597ccf1305d10c57f61a9c789f5d7a5cc15cc8e54fdc0806057df1fe03a3b
+ size 33602915
nl_tasks/exp100/run_ex09/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": false,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 1,
+ "peft_type": "ROTATION",
+ "r": 16,
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/exp100/run_ex09/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<unk>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
nl_tasks/exp100/run_ex09/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exp100/run_ex09/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
nl_tasks/exp100/run_ex09/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 512,
+ "pad_token": "<unk>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
nl_tasks/exp100/run_ex09/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": true,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 1,
+ "peft_type": "ROTATION",
+ "r": 16,
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/exp100/run_ex09/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:e041e22247e003d3fa1f62f968d3096e9383222bfc93bfd5deee072308dba1e8
+ size 33602915
nl_tasks/exp100/run_ex09/trainer_state.json ADDED
@@ -0,0 +1,260 @@
+ {
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.0,
+ "eval_steps": 500,
+ "global_step": 6252,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.09596928982725528,
+ "grad_norm": 0.06508654356002808,
+ "learning_rate": 0.019976180419211866,
+ "loss": 0.5532,
+ "step": 200
+ },
+ {
+ "epoch": 0.19193857965451055,
+ "grad_norm": 0.0456949919462204,
+ "learning_rate": 0.01985490438184627,
+ "loss": 0.283,
+ "step": 400
+ },
+ {
+ "epoch": 0.28790786948176583,
+ "grad_norm": 0.0334312878549099,
+ "learning_rate": 0.019632144212142762,
+ "loss": 0.2612,
+ "step": 600
+ },
+ {
+ "epoch": 0.3838771593090211,
+ "grad_norm": 0.04754582419991493,
+ "learning_rate": 0.01931019385651278,
+ "loss": 0.2547,
+ "step": 800
+ },
+ {
+ "epoch": 0.4798464491362764,
+ "grad_norm": 0.02298681065440178,
+ "learning_rate": 0.018892368705063736,
+ "loss": 0.242,
+ "step": 1000
+ },
+ {
+ "epoch": 0.5758157389635317,
+ "grad_norm": 0.021144121885299683,
+ "learning_rate": 0.018382971450274496,
+ "loss": 0.2388,
+ "step": 1200
+ },
+ {
+ "epoch": 0.6717850287907869,
+ "grad_norm": 0.023693973198533058,
+ "learning_rate": 0.01778724777859868,
+ "loss": 0.2355,
+ "step": 1400
+ },
+ {
+ "epoch": 0.7677543186180422,
+ "grad_norm": 0.019977454096078873,
+ "learning_rate": 0.017111332351276085,
+ "loss": 0.2275,
+ "step": 1600
+ },
+ {
+ "epoch": 0.8637236084452975,
+ "grad_norm": 0.019784899428486824,
+ "learning_rate": 0.01636218563063265,
+ "loss": 0.2254,
+ "step": 1800
+ },
+ {
+ "epoch": 0.9596928982725528,
+ "grad_norm": 0.020388498902320862,
+ "learning_rate": 0.015547522202421078,
+ "loss": 0.2216,
+ "step": 2000
+ },
+ {
+ "epoch": 1.055662188099808,
+ "grad_norm": 0.017220880836248398,
+ "learning_rate": 0.014675731332326341,
+ "loss": 0.2087,
+ "step": 2200
+ },
+ {
+ "epoch": 1.1516314779270633,
+ "grad_norm": 0.018207907676696777,
+ "learning_rate": 0.013755790574731894,
+ "loss": 0.2038,
+ "step": 2400
+ },
+ {
+ "epoch": 1.2476007677543186,
+ "grad_norm": 0.016460182145237923,
+ "learning_rate": 0.012797173323388642,
+ "loss": 0.202,
+ "step": 2600
+ },
+ {
+ "epoch": 1.3435700575815739,
+ "grad_norm": 0.013017100282013416,
+ "learning_rate": 0.011809751256014321,
+ "loss": 0.2012,
+ "step": 2800
+ },
+ {
+ "epoch": 1.4395393474088292,
+ "grad_norm": 0.018365703523159027,
+ "learning_rate": 0.010803692677432199,
+ "loss": 0.2005,
+ "step": 3000
+ },
+ {
+ "epoch": 1.5355086372360844,
+ "grad_norm": 0.01587655022740364,
+ "learning_rate": 0.009789357808094205,
+ "loss": 0.1964,
+ "step": 3200
+ },
+ {
+ "epoch": 1.6314779270633397,
+ "grad_norm": 0.01589033380150795,
+ "learning_rate": 0.008777192096289453,
+ "loss": 0.198,
+ "step": 3400
+ },
+ {
+ "epoch": 1.727447216890595,
+ "grad_norm": 0.014157130382955074,
+ "learning_rate": 0.007777618652691038,
+ "loss": 0.1951,
+ "step": 3600
+ },
+ {
+ "epoch": 1.8234165067178503,
+ "grad_norm": 0.013822129927575588,
+ "learning_rate": 0.006800930914931747,
+ "loss": 0.1941,
+ "step": 3800
+ },
+ {
+ "epoch": 1.9193857965451055,
+ "grad_norm": 0.013463828712701797,
+ "learning_rate": 0.005857186647530748,
+ "loss": 0.1892,
+ "step": 4000
+ },
+ {
+ "epoch": 2.015355086372361,
+ "grad_norm": 0.015684494748711586,
+ "learning_rate": 0.004956104368742014,
+ "loss": 0.1853,
+ "step": 4200
+ },
+ {
+ "epoch": 2.111324376199616,
+ "grad_norm": 0.01606130413711071,
+ "learning_rate": 0.004106963270903152,
+ "loss": 0.1642,
+ "step": 4400
+ },
+ {
+ "epoch": 2.2072936660268714,
+ "grad_norm": 0.015006215311586857,
+ "learning_rate": 0.0033185076648879854,
+ "loss": 0.164,
+ "step": 4600
+ },
+ {
+ "epoch": 2.3032629558541267,
+ "grad_norm": 0.013956602662801743,
+ "learning_rate": 0.0025988569326776123,
+ "loss": 0.165,
+ "step": 4800
+ },
+ {
+ "epoch": 2.399232245681382,
+ "grad_norm": 0.01500143762677908,
+ "learning_rate": 0.0019554219153431287,
+ "loss": 0.1593,
+ "step": 5000
+ },
+ {
+ "epoch": 2.495201535508637,
+ "grad_norm": 0.016031745821237564,
+ "learning_rate": 0.0013948285974623588,
+ "loss": 0.1621,
+ "step": 5200
+ },
+ {
+ "epoch": 2.5911708253358925,
+ "grad_norm": 0.01416528970003128,
+ "learning_rate": 0.0009228498738555002,
+ "loss": 0.161,
+ "step": 5400
+ },
+ {
+ "epoch": 2.6871401151631478,
+ "grad_norm": 0.01726922020316124,
+ "learning_rate": 0.0005443461012942996,
+ "loss": 0.1576,
+ "step": 5600
+ },
+ {
+ "epoch": 2.783109404990403,
+ "grad_norm": 0.015128469094634056,
+ "learning_rate": 0.00026321504737310985,
+ "loss": 0.1558,
+ "step": 5800
+ },
+ {
+ "epoch": 2.8790786948176583,
+ "grad_norm": 0.016625599935650826,
+ "learning_rate": 8.23517519598982e-05,
+ "loss": 0.1565,
+ "step": 6000
+ },
+ {
+ "epoch": 2.9750479846449136,
+ "grad_norm": 0.015019167214632034,
+ "learning_rate": 3.618714567147352e-06,
+ "loss": 0.1576,
+ "step": 6200
+ },
+ {
+ "epoch": 3.0,
+ "step": 6252,
+ "total_flos": 6.0970588176384e+18,
+ "train_loss": 0.2099078561889004,
+ "train_runtime": 8160.2371,
+ "train_samples_per_second": 36.764,
+ "train_steps_per_second": 0.766
+ }
+ ],
+ "logging_steps": 200,
+ "max_steps": 6252,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 3,
+ "save_steps": 0,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 6.0970588176384e+18,
+ "train_batch_size": 48,
+ "trial_name": null,
+ "trial_params": null
+ }
nl_tasks/exp100/run_ex10/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": false,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 1,
+ "peft_type": "ROTATION",
+ "r": 16,
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/exp100/run_ex10/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<unk>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
nl_tasks/exp100/run_ex10/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exp100/run_ex10/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
nl_tasks/exp100/run_ex10/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 512,
+ "pad_token": "<unk>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
nl_tasks/exp100/run_ex10/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": true,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 1,
+ "peft_type": "ROTATION",
+ "r": 16,
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/exp100/run_ex10/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ff09e3f976b0f890a445477281ac6c563f8b2b11869aff99580213720ae3ec8f
+ size 33602915
nl_tasks/exp100/run_ex10/trainer_state.json ADDED
@@ -0,0 +1,183 @@
+ {
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "eval_steps": 500,
+ "global_step": 4168,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.09596928982725528,
+ "grad_norm": 0.17983072996139526,
+ "learning_rate": 0.029892950505116346,
+ "loss": 5.4284,
+ "step": 200
+ },
+ {
+ "epoch": 0.19193857965451055,
+ "grad_norm": 0.02686592936515808,
+ "learning_rate": 0.02944923894162051,
+ "loss": 0.3284,
+ "step": 400
+ },
+ {
+ "epoch": 0.28790786948176583,
+ "grad_norm": 0.02659301459789276,
+ "learning_rate": 0.028671096149031867,
+ "loss": 0.2782,
+ "step": 600
+ },
+ {
+ "epoch": 0.3838771593090211,
+ "grad_norm": 0.021944062784314156,
+ "learning_rate": 0.027576532435823177,
+ "loss": 0.2639,
+ "step": 800
+ },
+ {
+ "epoch": 0.4798464491362764,
+ "grad_norm": 0.019736966118216515,
+ "learning_rate": 0.02619088175137459,
+ "loss": 0.2518,
+ "step": 1000
+ },
+ {
+ "epoch": 0.5758157389635317,
+ "grad_norm": 0.020629985257983208,
+ "learning_rate": 0.024546215325414244,
+ "loss": 0.2478,
+ "step": 1200
+ },
+ {
+ "epoch": 0.6717850287907869,
+ "grad_norm": 0.022958872839808464,
+ "learning_rate": 0.022680599371429494,
+ "loss": 0.2429,
+ "step": 1400
+ },
+ {
+ "epoch": 0.7677543186180422,
+ "grad_norm": 0.020077615976333618,
+ "learning_rate": 0.020637214034687996,
+ "loss": 0.2336,
+ "step": 1600
+ },
+ {
+ "epoch": 0.8637236084452975,
+ "grad_norm": 0.018682507798075676,
+ "learning_rate": 0.018463353977035808,
+ "loss": 0.2302,
+ "step": 1800
+ },
+ {
+ "epoch": 0.9596928982725528,
+ "grad_norm": 0.02266324870288372,
+ "learning_rate": 0.016209333730185887,
+ "loss": 0.225,
+ "step": 2000
+ },
+ {
+ "epoch": 1.055662188099808,
+ "grad_norm": 0.01823570765554905,
+ "learning_rate": 0.013927323153367477,
+ "loss": 0.2111,
+ "step": 2200
+ },
+ {
+ "epoch": 1.1516314779270633,
+ "grad_norm": 0.02119363099336624,
+ "learning_rate": 0.011670139948958654,
+ "loss": 0.2052,
+ "step": 2400
+ },
+ {
+ "epoch": 1.2476007677543186,
+ "grad_norm": 0.018385590985417366,
+ "learning_rate": 0.009490027183628048,
+ "loss": 0.2016,
+ "step": 2600
+ },
+ {
+ "epoch": 1.3435700575815739,
+ "grad_norm": 0.01590455323457718,
+ "learning_rate": 0.0074374441095630085,
+ "loss": 0.1987,
+ "step": 2800
+ },
+ {
+ "epoch": 1.4395393474088292,
+ "grad_norm": 0.017842255532741547,
+ "learning_rate": 0.0055598982725275775,
+ "loss": 0.1957,
+ "step": 3000
+ },
+ {
+ "epoch": 1.5355086372360844,
+ "grad_norm": 0.018860826268792152,
+ "learning_rate": 0.0039008459378997943,
+ "loss": 0.1901,
+ "step": 3200
+ },
+ {
+ "epoch": 1.6314779270633397,
+ "grad_norm": 0.018248997628688812,
+ "learning_rate": 0.002498686284601174,
+ "loss": 0.1899,
+ "step": 3400
+ },
+ {
+ "epoch": 1.727447216890595,
+ "grad_norm": 0.01603817380964756,
+ "learning_rate": 0.0013858726465499599,
+ "loss": 0.1865,
+ "step": 3600
+ },
+ {
+ "epoch": 1.8234165067178503,
+ "grad_norm": 0.016947340220212936,
+ "learning_rate": 0.0005881613721758754,
+ "loss": 0.1848,
+ "step": 3800
+ },
+ {
+ "epoch": 1.9193857965451055,
+ "grad_norm": 0.018619216978549957,
+ "learning_rate": 0.00012401568732964163,
+ "loss": 0.1808,
+ "step": 4000
+ },
+ {
+ "epoch": 2.0,
+ "step": 4168,
+ "total_flos": 4.0647058784256e+18,
+ "train_loss": 0.47151951963750505,
+ "train_runtime": 5453.3681,
+ "train_samples_per_second": 36.675,
+ "train_steps_per_second": 0.764
+ }
+ ],
+ "logging_steps": 200,
+ "max_steps": 4168,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 2,
+ "save_steps": 0,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.0647058784256e+18,
+ "train_batch_size": 48,
+ "trial_name": null,
+ "trial_params": null
+ }
nl_tasks/exp100/run_ex11/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": false,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 1,
+ "peft_type": "ROTATION",
+ "r": 16,
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/exp100/run_ex11/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<unk>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
nl_tasks/exp100/run_ex11/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exp100/run_ex11/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
nl_tasks/exp100/run_ex11/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 512,
+ "pad_token": "<unk>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
nl_tasks/exp100/run_ex11/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": true,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 1,
+ "peft_type": "ROTATION",
+ "r": 16,
+ "revision": null,
+ "target_modules": [
+ "v_proj",
+ "q_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/exp100/run_ex11/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:669642b5dfd24d7a899a1a21a69a5a9cf6d0170c2609f2a54c664123864585da
+ size 33602915
nl_tasks/exp100/run_ex11/trainer_state.json ADDED
@@ -0,0 +1,183 @@
+ {
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "eval_steps": 500,
+ "global_step": 4168,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.09596928982725528,
+ "grad_norm": 0.09253966063261032,
+ "learning_rate": 0.007971453468031025,
+ "loss": 3.4503,
+ "step": 200
+ },
+ {
+ "epoch": 0.19193857965451055,
+ "grad_norm": 0.06041109189391136,
+ "learning_rate": 0.007853130384432137,
+ "loss": 0.2986,
+ "step": 400
+ },
+ {
+ "epoch": 0.28790786948176583,
+ "grad_norm": 0.05962882563471794,
+ "learning_rate": 0.007645625639741832,
+ "loss": 0.2678,
+ "step": 600
+ },
+ {
+ "epoch": 0.3838771593090211,
+ "grad_norm": 0.05638430267572403,
+ "learning_rate": 0.007353741982886181,
+ "loss": 0.2562,
+ "step": 800
+ },
+ {
+ "epoch": 0.4798464491362764,
+ "grad_norm": 0.043786656111478806,
+ "learning_rate": 0.0069842351336998915,
+ "loss": 0.2442,
+ "step": 1000
+ },
+ {
+ "epoch": 0.5758157389635317,
+ "grad_norm": 0.04289592057466507,
+ "learning_rate": 0.006545657420110465,
+ "loss": 0.2413,
+ "step": 1200
+ },
+ {
+ "epoch": 0.6717850287907869,
+ "grad_norm": 0.044572457671165466,
+ "learning_rate": 0.006048159832381199,
+ "loss": 0.237,
+ "step": 1400
+ },
+ {
+ "epoch": 0.7677543186180422,
+ "grad_norm": 0.03955984488129616,
+ "learning_rate": 0.0055032570759168,
+ "loss": 0.2277,
+ "step": 1600
+ },
+ {
+ "epoch": 0.8637236084452975,
+ "grad_norm": 0.03934319689869881,
+ "learning_rate": 0.004923561060542882,
+ "loss": 0.2243,
+ "step": 1800
+ },
+ {
+ "epoch": 0.9596928982725528,
+ "grad_norm": 0.034399278461933136,
+ "learning_rate": 0.004322488994716237,
+ "loss": 0.2193,
+ "step": 2000
+ },
+ {
+ "epoch": 1.055662188099808,
+ "grad_norm": 0.037286121398210526,
+ "learning_rate": 0.003713952840897994,
+ "loss": 0.2051,
+ "step": 2200
+ },
+ {
+ "epoch": 1.1516314779270633,
+ "grad_norm": 0.03788420185446739,
+ "learning_rate": 0.0031120373197223083,
+ "loss": 0.1981,
+ "step": 2400
+ },
+ {
+ "epoch": 1.2476007677543186,
+ "grad_norm": 0.036435652524232864,
+ "learning_rate": 0.0025306739156341464,
+ "loss": 0.1947,
+ "step": 2600
+ },
+ {
+ "epoch": 1.3435700575815739,
+ "grad_norm": 0.03084568865597248,
+ "learning_rate": 0.0019833184292168023,
+ "loss": 0.1924,
+ "step": 2800
+ },
+ {
+ "epoch": 1.4395393474088292,
+ "grad_norm": 0.03642988204956055,
+ "learning_rate": 0.0014826395393406876,
+ "loss": 0.1901,
+ "step": 3000
+ },
+ {
+ "epoch": 1.5355086372360844,
+ "grad_norm": 0.03472171723842621,
+ "learning_rate": 0.0010402255834399453,
+ "loss": 0.1851,
+ "step": 3200
+ },
+ {
+ "epoch": 1.6314779270633397,
+ "grad_norm": 0.03253506124019623,
+ "learning_rate": 0.0006663163425603131,
+ "loss": 0.1857,
+ "step": 3400
+ },
+ {
+ "epoch": 1.727447216890595,
+ "grad_norm": 0.036016546189785004,
+ "learning_rate": 0.0003695660390799893,
+ "loss": 0.1827,
+ "step": 3600
+ },
+ {
+ "epoch": 1.8234165067178503,
+ "grad_norm": 0.032651085406541824,
+ "learning_rate": 0.00015684303258023348,
+ "loss": 0.1821,
+ "step": 3800
+ },
+ {
+ "epoch": 1.9193857965451055,
+ "grad_norm": 0.0359911285340786,
+ "learning_rate": 3.30708499545711e-05,
+ "loss": 0.1782,
+ "step": 4000
+ },
+ {
+ "epoch": 2.0,
+ "step": 4168,
+ "total_flos": 4.0647058784256e+18,
+ "train_loss": 0.3700037432723677,
+ "train_runtime": 5461.8246,
+ "train_samples_per_second": 36.618,
+ "train_steps_per_second": 0.763
+ }
+ ],
+ "logging_steps": 200,
+ "max_steps": 4168,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 2,
+ "save_steps": 0,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 4.0647058784256e+18,
+ "train_batch_size": 48,
+ "trial_name": null,
+ "trial_params": null
+ }
nl_tasks/exp100/run_ex12/ft/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": false,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 1,
+ "peft_type": "ROTATION",
+ "r": 16,
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/exp100/run_ex12/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<unk>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
nl_tasks/exp100/run_ex12/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/exp100/run_ex12/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
nl_tasks/exp100/run_ex12/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 512,
+ "pad_token": "<unk>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
nl_tasks/exp100/run_ex12/ft2/adapter_config.json ADDED
@@ -0,0 +1,18 @@
+ {
+ "T": 1.0,
+ "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+ "bias": "none",
+ "inference_mode": true,
+ "layers_to_transform": null,
+ "modules_to_save": null,
+ "num_rotations": 1,
+ "peft_type": "ROTATION",
+ "r": 16,
+ "revision": null,
+ "target_modules": [
+ "q_proj",
+ "v_proj"
+ ],
+ "target_modules_to_skip": null,
+ "task_type": "CAUSAL_LM"
+ }
nl_tasks/exp100/run_ex12/ft2/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:86f326abe5a1f356d70a24ce7bab2ee7dd2bcb059d6f2282d04dc2f86fee6dc1
+ size 33602915
nl_tasks/exp100/run_ex12/trainer_state.json ADDED
@@ -0,0 +1,260 @@
+ {
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 3.0,
+ "eval_steps": 500,
+ "global_step": 6252,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.09596928982725528,
+ "grad_norm": 0.13730373978614807,
+ "learning_rate": 0.007990472167684746,
+ "loss": 0.4657,
+ "step": 200
+ },
+ {
+ "epoch": 0.19193857965451055,
+ "grad_norm": 0.0858209878206253,
+ "learning_rate": 0.007941961752738508,
+ "loss": 0.2803,
+ "step": 400
+ },
+ {
+ "epoch": 0.28790786948176583,
+ "grad_norm": 0.06287017464637756,
+ "learning_rate": 0.007852857684857105,
+ "loss": 0.2572,
+ "step": 600
+ },
+ {
+ "epoch": 0.3838771593090211,
+ "grad_norm": 0.053079769015312195,
+ "learning_rate": 0.007724077542605112,
+ "loss": 0.2477,
+ "step": 800
+ },
+ {
+ "epoch": 0.4798464491362764,
+ "grad_norm": 0.03740881383419037,
+ "learning_rate": 0.007556947482025495,
+ "loss": 0.2368,
+ "step": 1000
+ },
+ {
+ "epoch": 0.5758157389635317,
+ "grad_norm": 0.034000739455223083,
+ "learning_rate": 0.007353188580109798,
+ "loss": 0.2351,
+ "step": 1200
+ },
+ {
+ "epoch": 0.6717850287907869,
+ "grad_norm": 0.04023474082350731,
+ "learning_rate": 0.007114899111439472,
+ "loss": 0.231,
+ "step": 1400
+ },
+ {
+ "epoch": 0.7677543186180422,
+ "grad_norm": 0.030921513214707375,
+ "learning_rate": 0.006844532940510433,
+ "loss": 0.223,
+ "step": 1600
+ },
+ {
+ "epoch": 0.8637236084452975,
+ "grad_norm": 0.03130370005965233,
+ "learning_rate": 0.006544874252253061,
+ "loss": 0.221,
+ "step": 1800
+ },
+ {
+ "epoch": 0.9596928982725528,
+ "grad_norm": 0.029764752835035324,
+ "learning_rate": 0.006219008880968432,
+ "loss": 0.2163,
+ "step": 2000
+ },
+ {
+ "epoch": 1.055662188099808,
+ "grad_norm": 0.027886036783456802,
+ "learning_rate": 0.0058702925329305366,
+ "loss": 0.2034,
+ "step": 2200
+ },
+ {
+ "epoch": 1.1516314779270633,
+ "grad_norm": 0.026955854147672653,
+ "learning_rate": 0.005502316229892758,
+ "loss": 0.1974,
+ "step": 2400
+ },
+ {
+ "epoch": 1.2476007677543186,
+ "grad_norm": 0.024037910625338554,
+ "learning_rate": 0.005118869329355457,
+ "loss": 0.1958,
+ "step": 2600
+ },
+ {
+ "epoch": 1.3435700575815739,
+ "grad_norm": 0.02323935180902481,
+ "learning_rate": 0.004723900502405729,
+ "loss": 0.195,
+ "step": 2800
+ },
+ {
+ "epoch": 1.4395393474088292,
+ "grad_norm": 0.023859383538365364,
+ "learning_rate": 0.00432147707097288,
+ "loss": 0.1937,
+ "step": 3000
+ },
+ {
+ "epoch": 1.5355086372360844,
+ "grad_norm": 0.025017334148287773,
+ "learning_rate": 0.0039157431232376815,
+ "loss": 0.1901,
+ "step": 3200
+ },
+ {
+ "epoch": 1.6314779270633397,
+ "grad_norm": 0.024762826040387154,
+ "learning_rate": 0.0035108768385157816,
+ "loss": 0.1915,
+ "step": 3400
+ },
+ {
+ "epoch": 1.727447216890595,
+ "grad_norm": 0.023029997944831848,
+ "learning_rate": 0.0031110474610764154,
+ "loss": 0.189,
+ "step": 3600
+ },
+ {
+ "epoch": 1.8234165067178503,
+ "grad_norm": 0.023539869114756584,
+ "learning_rate": 0.0027203723659726987,
+ "loss": 0.1881,
+ "step": 3800
+ },
+ {
+ "epoch": 1.9193857965451055,
+ "grad_norm": 0.023440731689333916,
+ "learning_rate": 0.002342874659012299,
+ "loss": 0.1832,
+ "step": 4000
+ },
+ {
+ "epoch": 2.015355086372361,
+ "grad_norm": 0.024401186034083366,
+ "learning_rate": 0.0019824417474968055,
+ "loss": 0.1798,
+ "step": 4200
+ },
+ {
+ "epoch": 2.111324376199616,
+ "grad_norm": 0.02438773214817047,
+ "learning_rate": 0.001642785308361261,
+ "loss": 0.1588,
+ "step": 4400
+ },
+ {
+ "epoch": 2.2072936660268714,
+ "grad_norm": 0.023876527324318886,
+ "learning_rate": 0.0013274030659551942,
+ "loss": 0.1585,
+ "step": 4600
+ },
+ {
+ "epoch": 2.3032629558541267,
+ "grad_norm": 0.021032139658927917,
+ "learning_rate": 0.001039542773071045,
+ "loss": 0.1594,
+ "step": 4800
+ },
+ {
+ "epoch": 2.399232245681382,
+ "grad_norm": 0.023737894371151924,
+ "learning_rate": 0.0007821687661372514,
+ "loss": 0.1545,
+ "step": 5000
+ },
+ {
+ "epoch": 2.495201535508637,
+ "grad_norm": 0.02574790269136429,
+ "learning_rate": 0.0005579314389849435,
+ "loss": 0.1574,
+ "step": 5200
+ },
+ {
+ "epoch": 2.5911708253358925,
+ "grad_norm": 0.022647960111498833,
+ "learning_rate": 0.00036913994954220007,
+ "loss": 0.157,
+ "step": 5400
+ },
+ {
+ "epoch": 2.6871401151631478,
+ "grad_norm": 0.024767184630036354,
+ "learning_rate": 0.00021773844051771986,
+ "loss": 0.1537,
+ "step": 5600
+ },
+ {
+ "epoch": 2.783109404990403,
+ "grad_norm": 0.02572454698383808,
+ "learning_rate": 0.00010528601894924394,
+ "loss": 0.1524,
+ "step": 5800
+ },
+ {
+ "epoch": 2.8790786948176583,
+ "grad_norm": 0.024511748924851418,
+ "learning_rate": 3.294070078395928e-05,
+ "loss": 0.1536,
+ "step": 6000
+ },
+ {
+ "epoch": 2.9750479846449136,
+ "grad_norm": 0.02418585494160652,
+ "learning_rate": 1.447485826858941e-06,
+ "loss": 0.1544,
+ "step": 6200
+ },
+ {
+ "epoch": 3.0,
+ "step": 6252,
+ "total_flos": 6.0970588176384e+18,
+ "train_loss": 0.2022177925951879,
+ "train_runtime": 8189.2736,
+ "train_samples_per_second": 36.633,
+ "train_steps_per_second": 0.763
+ }
+ ],
+ "logging_steps": 200,
+ "max_steps": 6252,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 3,
+ "save_steps": 0,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": true,
+ "should_training_stop": true
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 6.0970588176384e+18,
+ "train_batch_size": 48,
+ "trial_name": null,
+ "trial_params": null
+ }
nl_tasks/expsBOFT/seed43/trainer_state.json ADDED
@@ -0,0 +1,218 @@
+ {
+ "best_global_step": null,
+ "best_metric": null,
+ "best_model_checkpoint": null,
+ "epoch": 2.0,
+ "eval_steps": 500,
+ "global_step": 1250,
+ "is_hyper_param_search": false,
+ "is_local_process_zero": true,
+ "is_world_process_zero": true,
+ "log_history": [
+ {
+ "epoch": 0.08,
+ "grad_norm": 0.08375173062086105,
+ "learning_rate": 0.000392,
+ "loss": 0.5193,
+ "step": 50
+ },
+ {
+ "epoch": 0.16,
+ "grad_norm": 0.09268203377723694,
+ "learning_rate": 0.0007920000000000001,
+ "loss": 0.3316,
+ "step": 100
+ },
+ {
+ "epoch": 0.24,
+ "grad_norm": 0.08198747783899307,
+ "learning_rate": 0.0007964216926581925,
+ "loss": 0.304,
+ "step": 150
+ },
+ {
+ "epoch": 0.32,
+ "grad_norm": 0.0816216915845871,
+ "learning_rate": 0.0007854602918076551,
+ "loss": 0.2918,
+ "step": 200
+ },
+ {
+ "epoch": 0.4,
+ "grad_norm": 0.07457849383354187,
+ "learning_rate": 0.0007673184950396212,
+ "loss": 0.274,
+ "step": 250
+ },
+ {
+ "epoch": 0.48,
+ "grad_norm": 0.07685171067714691,
+ "learning_rate": 0.0007423342497022817,
+ "loss": 0.2687,
+ "step": 300
+ },
+ {
+ "epoch": 0.56,
+ "grad_norm": 0.07849128544330597,
+ "learning_rate": 0.0007109729650142636,
+ "loss": 0.2651,
+ "step": 350
+ },
+ {
+ "epoch": 0.64,
+ "grad_norm": 0.07266736030578613,
+ "learning_rate": 0.0006738188423714755,
+ "loss": 0.2575,
+ "step": 400
+ },
+ {
+ "epoch": 0.72,
+ "grad_norm": 0.06927025318145752,
+ "learning_rate": 0.0006315639927804526,
+ "loss": 0.2525,
+ "step": 450
+ },
+ {
+ "epoch": 0.8,
+ "grad_norm": 0.08536054193973541,
+ "learning_rate": 0.00058499554413983,
+ "loss": 0.2494,
+ "step": 500
+ },
+ {
+ "epoch": 0.88,
+ "grad_norm": 0.07602768391370773,
+ "learning_rate": 0.000534980978536894,
+ "loss": 0.2429,
+ "step": 550
+ },
+ {
+ "epoch": 0.96,
+ "grad_norm": 0.07055249065160751,
+ "learning_rate": 0.00048245197269763485,
+ "loss": 0.2457,
+ "step": 600
+ },
+ {
+ "epoch": 1.04,
+ "grad_norm": 0.07144515216350555,
+ "learning_rate": 0.00042838704261214224,
+ "loss": 0.2292,
+ "step": 650
+ },
+ {
+ "epoch": 1.12,
+ "grad_norm": 0.07937044650316238,
+ "learning_rate": 0.00037379331563313267,
+ "loss": 0.2169,
+ "step": 700
+ },
+ {
+ "epoch": 1.2,
+ "grad_norm": 0.07409252226352692,
+ "learning_rate": 0.00031968776959892677,
+ "loss": 0.2098,
+ "step": 750
+ },
+ {
+ "epoch": 1.28,
+ "grad_norm": 0.07844420522451401,
+ "learning_rate": 0.00026707828846051743,
+ "loss": 0.2145,
+ "step": 800
+ },
+ {
+ "epoch": 1.3599999999999999,
+ "grad_norm": 0.07791652530431747,
+ "learning_rate": 0.00021694488731055218,
+ "loss": 0.2082,
+ "step": 850
+ },
+ {
+ "epoch": 1.44,
+ "grad_norm": 0.0782908946275711,
+ "learning_rate": 0.00017022145655641685,
+ "loss": 0.2077,
+ "step": 900
+ },
+ {
+ "epoch": 1.52,
+ "grad_norm": 0.0826650932431221,
+ "learning_rate": 0.00012777836530893536,
+ "loss": 0.2137,
+ "step": 950
+ },
+ {
+ "epoch": 1.6,
+ "grad_norm": 0.0696156919002533,
+ "learning_rate": 9.040624805263558e-05,
+ "loss": 0.2076,
+ "step": 1000
+ },
+ {
+ "epoch": 1.6800000000000002,
+ "grad_norm": 0.06966507434844971,
+ "learning_rate": 5.880127662124091e-05,
+ "loss": 0.2108,
+ "step": 1050
+ },
+ {
+ "epoch": 1.76,
+ "grad_norm": 0.08326321095228195,
+ "learning_rate": 3.355219183361582e-05,
+ "loss": 0.2106,
+ "step": 1100
+ },
+ {
+ "epoch": 1.8399999999999999,
+ "grad_norm": 0.0792745053768158,
+ "learning_rate": 1.512933636625089e-05,
+ "loss": 0.2073,
+ "step": 1150
+ },
+ {
+ "epoch": 1.92,
+ "grad_norm": 0.07648582756519318,
+ "learning_rate": 3.8758931591217575e-06,
+ "loss": 0.209,
+ "step": 1200
+ },
+ {
+ "epoch": 2.0,
+ "grad_norm": 0.0787830799818039,
+ "learning_rate": 1.4925668450960217e-09,
+ "loss": 0.2124,
+ "step": 1250
+ },
+ {
+ "epoch": 2.0,
+ "step": 1250,
+ "total_flos": 1.62594677587968e+18,
+ "train_loss": 0.25041088790893556,
+ "train_runtime": 3374.0916,
+ "train_samples_per_second": 23.71,
+ "train_steps_per_second": 0.37
+ }
+ ],
+ "logging_steps": 50,
+ "max_steps": 1250,
+ "num_input_tokens_seen": 0,
+ "num_train_epochs": 2,
+ "save_steps": 0,
+ "stateful_callbacks": {
+ "TrainerControl": {
+ "args": {
+ "should_epoch_stop": false,
+ "should_evaluate": false,
+ "should_log": false,
+ "should_save": false,
+ "should_training_stop": false
+ },
+ "attributes": {}
+ }
+ },
+ "total_flos": 1.62594677587968e+18,
+ "train_batch_size": 32,
+ "trial_name": null,
+ "trial_params": null
+ }
nl_tasks/expsBOFT/seed44/ft/special_tokens_map.json ADDED
@@ -0,0 +1,24 @@
+ {
+ "bos_token": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "eos_token": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ },
+ "pad_token": "<unk>",
+ "unk_token": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false
+ }
+ }
nl_tasks/expsBOFT/seed44/ft/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
nl_tasks/expsBOFT/seed44/ft/tokenizer.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9e556afd44213b6bd1be2b850ebbbd98f5481437a8021afaf58ee7fb1818d347
+ size 499723
nl_tasks/expsBOFT/seed44/ft/tokenizer_config.json ADDED
@@ -0,0 +1,43 @@
+ {
+ "add_bos_token": true,
+ "add_eos_token": false,
+ "add_prefix_space": null,
+ "added_tokens_decoder": {
+ "0": {
+ "content": "<unk>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "1": {
+ "content": "<s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ },
+ "2": {
+ "content": "</s>",
+ "lstrip": false,
+ "normalized": false,
+ "rstrip": false,
+ "single_word": false,
+ "special": true
+ }
+ },
+ "bos_token": "<s>",
+ "clean_up_tokenization_spaces": false,
+ "eos_token": "</s>",
+ "extra_special_tokens": {},
+ "legacy": false,
+ "model_max_length": 512,
+ "pad_token": "<unk>",
+ "padding_side": "right",
+ "sp_model_kwargs": {},
+ "tokenizer_class": "LlamaTokenizer",
+ "unk_token": "<unk>",
+ "use_default_system_prompt": false
+ }
nl_tasks/expsBOFT/seed44/ft2/README.md ADDED
@@ -0,0 +1,205 @@
+ ---
+ base_model: meta-llama/Llama-2-7b-hf
+ library_name: peft
+ tags:
+ - base_model:adapter:meta-llama/Llama-2-7b-hf
+ - transformers
+ ---
+
+ # Model Card for Model ID
+
+ <!-- Provide a quick summary of what the model is/does. -->
+
+
+
+ ## Model Details
+
+ ### Model Description
+
+ <!-- Provide a longer summary of what this model is. -->
+
+
+
+ - **Developed by:** [More Information Needed]
+ - **Funded by [optional]:** [More Information Needed]
+ - **Shared by [optional]:** [More Information Needed]
+ - **Model type:** [More Information Needed]
+ - **Language(s) (NLP):** [More Information Needed]
+ - **License:** [More Information Needed]
+ - **Finetuned from model [optional]:** [More Information Needed]
+
+ ### Model Sources [optional]
+
+ <!-- Provide the basic links for the model. -->
+
+ - **Repository:** [More Information Needed]
+ - **Paper [optional]:** [More Information Needed]
+ - **Demo [optional]:** [More Information Needed]
+
+ ## Uses
+
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+
+ ### Direct Use
+
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+
+ [More Information Needed]
+
+ ### Downstream Use [optional]
+
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+
+ [More Information Needed]
+
+ ### Out-of-Scope Use
+
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+
+ [More Information Needed]
+
+ ## Bias, Risks, and Limitations
+
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
+
+ [More Information Needed]
+
+ ### Recommendations
+
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+
+ ## How to Get Started with the Model
+
+ Use the code below to get started with the model.
+
+ [More Information Needed]
+
+ ## Training Details
+
+ ### Training Data
+
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+
+ [More Information Needed]
+
+ ### Training Procedure
+
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+
+ #### Preprocessing [optional]
+
+ [More Information Needed]
+
+
+ #### Training Hyperparameters
+
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+
+ #### Speeds, Sizes, Times [optional]
+
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+
+ [More Information Needed]
+
+ ## Evaluation
+
+ <!-- This section describes the evaluation protocols and provides the results. -->
+
+ ### Testing Data, Factors & Metrics
+
+ #### Testing Data
+
+ <!-- This should link to a Dataset Card if possible. -->
+
+ [More Information Needed]
+
+ #### Factors
+
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+
+ [More Information Needed]
+
+ #### Metrics
+
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
+
+ [More Information Needed]
+
+ ### Results
+
+ [More Information Needed]
+
+ #### Summary
+
+
+
+ ## Model Examination [optional]
+
+ <!-- Relevant interpretability work for the model goes here -->
+
+ [More Information Needed]
+
+ ## Environmental Impact
+
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+
+ - **Hardware Type:** [More Information Needed]
+ - **Hours used:** [More Information Needed]
+ - **Cloud Provider:** [More Information Needed]
+ - **Compute Region:** [More Information Needed]
+ - **Carbon Emitted:** [More Information Needed]
+
+ ## Technical Specifications [optional]
+
+ ### Model Architecture and Objective
+
+ [More Information Needed]
+
+ ### Compute Infrastructure
+
+ [More Information Needed]
+
+ #### Hardware
+
+ [More Information Needed]
+
+ #### Software
+
+ [More Information Needed]
+
+ ## Citation [optional]
+
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+
+ **BibTeX:**
+
+ [More Information Needed]
+
+ **APA:**
+
+ [More Information Needed]
+
+ ## Glossary [optional]
+
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+
+ [More Information Needed]
+
+ ## More Information [optional]
+
+ [More Information Needed]
+
+ ## Model Card Authors [optional]
+
+ [More Information Needed]
+
+ ## Model Card Contact
+
+ [More Information Needed]
+ ### Framework versions
+
+ - PEFT 0.18.0
nl_tasks/expsBOFT/seed44/ft2/adapter_config.json ADDED
@@ -0,0 +1,27 @@
+ {
+   "auto_mapping": {
+     "base_model_class": "LlamaForCausalLM",
+     "parent_library": "transformers.models.llama.modeling_llama"
+   },
+   "base_model_name_or_path": "meta-llama/Llama-2-7b-hf",
+   "bias": "none",
+   "boft_block_num": 0,
+   "boft_block_size": 16,
+   "boft_dropout": 0.05,
+   "boft_n_butterfly_factor": 2,
+   "exclude_modules": null,
+   "fan_in_fan_out": false,
+   "inference_mode": true,
+   "init_weights": true,
+   "layers_pattern": null,
+   "layers_to_transform": null,
+   "modules_to_save": null,
+   "peft_type": "BOFT",
+   "peft_version": "0.18.0",
+   "revision": null,
+   "target_modules": [
+     "q_proj",
+     "v_proj"
+   ],
+   "task_type": null
+ }
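
The config above describes a BOFT (butterfly orthogonal fine-tuning) adapter over the q_proj/v_proj projections of Llama-2-7B. A minimal loading sketch with PEFT; the local adapter path is an assumption (any checkout of this folder works):

    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    base = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-2-7b-hf")
    tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
    # Path below assumes a local clone of this repository.
    model = PeftModel.from_pretrained(base, "nl_tasks/expsBOFT/seed44/ft2")
    model.eval()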
nl_tasks/expsBOFT/seed44/ft2/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:584526a06a1f45f2f77e6a89a7201b05aa25a3d6be60f231b255a32c48c4b261
+ size 34619504
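
This is a Git LFS pointer, not the weights themselves: only the SHA-256 and the size (34,619,504 bytes, roughly 33 MB) are stored in-tree. From a local clone, the actual file can be fetched with:

    git lfs pull --include "nl_tasks/expsBOFT/seed44/ft2/adapter_model.safetensors"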
nl_tasks/expsBOFT/seed44/trainer_state.json ADDED
@@ -0,0 +1,218 @@
+ {
+   "best_global_step": null,
+   "best_metric": null,
+   "best_model_checkpoint": null,
+   "epoch": 2.0,
+   "eval_steps": 500,
+   "global_step": 1250,
+   "is_hyper_param_search": false,
+   "is_local_process_zero": true,
+   "is_world_process_zero": true,
+   "log_history": [
+     {
+       "epoch": 0.08,
+       "grad_norm": 0.08375173062086105,
+       "learning_rate": 0.000392,
+       "loss": 0.5193,
+       "step": 50
+     },
+     {
+       "epoch": 0.16,
+       "grad_norm": 0.09268203377723694,
+       "learning_rate": 0.0007920000000000001,
+       "loss": 0.3316,
+       "step": 100
+     },
+     {
+       "epoch": 0.24,
+       "grad_norm": 0.08198747783899307,
+       "learning_rate": 0.0007964216926581925,
+       "loss": 0.304,
+       "step": 150
+     },
+     {
+       "epoch": 0.32,
+       "grad_norm": 0.0816216915845871,
+       "learning_rate": 0.0007854602918076551,
+       "loss": 0.2918,
+       "step": 200
+     },
+     {
+       "epoch": 0.4,
+       "grad_norm": 0.07457849383354187,
+       "learning_rate": 0.0007673184950396212,
+       "loss": 0.274,
+       "step": 250
+     },
+     {
+       "epoch": 0.48,
+       "grad_norm": 0.07685171067714691,
+       "learning_rate": 0.0007423342497022817,
+       "loss": 0.2687,
+       "step": 300
+     },
+     {
+       "epoch": 0.56,
+       "grad_norm": 0.07849128544330597,
+       "learning_rate": 0.0007109729650142636,
+       "loss": 0.2651,
+       "step": 350
+     },
+     {
+       "epoch": 0.64,
+       "grad_norm": 0.07266736030578613,
+       "learning_rate": 0.0006738188423714755,
+       "loss": 0.2575,
+       "step": 400
+     },
+     {
+       "epoch": 0.72,
+       "grad_norm": 0.06927025318145752,
+       "learning_rate": 0.0006315639927804526,
+       "loss": 0.2525,
+       "step": 450
+     },
+     {
+       "epoch": 0.8,
+       "grad_norm": 0.08536054193973541,
+       "learning_rate": 0.00058499554413983,
+       "loss": 0.2494,
+       "step": 500
+     },
+     {
+       "epoch": 0.88,
+       "grad_norm": 0.07602768391370773,
+       "learning_rate": 0.000534980978536894,
+       "loss": 0.2429,
+       "step": 550
+     },
+     {
+       "epoch": 0.96,
+       "grad_norm": 0.07055249065160751,
+       "learning_rate": 0.00048245197269763485,
+       "loss": 0.2457,
+       "step": 600
+     },
+     {
+       "epoch": 1.04,
+       "grad_norm": 0.07144515216350555,
+       "learning_rate": 0.00042838704261214224,
+       "loss": 0.2292,
+       "step": 650
+     },
+     {
+       "epoch": 1.12,
+       "grad_norm": 0.07937044650316238,
+       "learning_rate": 0.00037379331563313267,
+       "loss": 0.2169,
+       "step": 700
+     },
+     {
+       "epoch": 1.2,
+       "grad_norm": 0.07409252226352692,
+       "learning_rate": 0.00031968776959892677,
+       "loss": 0.2098,
+       "step": 750
+     },
+     {
+       "epoch": 1.28,
+       "grad_norm": 0.07844420522451401,
+       "learning_rate": 0.00026707828846051743,
+       "loss": 0.2145,
+       "step": 800
+     },
+     {
+       "epoch": 1.3599999999999999,
+       "grad_norm": 0.07791652530431747,
+       "learning_rate": 0.00021694488731055218,
+       "loss": 0.2082,
+       "step": 850
+     },
+     {
+       "epoch": 1.44,
+       "grad_norm": 0.0782908946275711,
+       "learning_rate": 0.00017022145655641685,
+       "loss": 0.2077,
+       "step": 900
+     },
+     {
+       "epoch": 1.52,
+       "grad_norm": 0.0826650932431221,
+       "learning_rate": 0.00012777836530893536,
+       "loss": 0.2137,
+       "step": 950
+     },
+     {
+       "epoch": 1.6,
+       "grad_norm": 0.0696156919002533,
+       "learning_rate": 9.040624805263558e-05,
+       "loss": 0.2076,
+       "step": 1000
+     },
+     {
+       "epoch": 1.6800000000000002,
+       "grad_norm": 0.06966507434844971,
+       "learning_rate": 5.880127662124091e-05,
+       "loss": 0.2108,
+       "step": 1050
+     },
+     {
+       "epoch": 1.76,
+       "grad_norm": 0.08326321095228195,
+       "learning_rate": 3.355219183361582e-05,
+       "loss": 0.2106,
+       "step": 1100
+     },
+     {
+       "epoch": 1.8399999999999999,
+       "grad_norm": 0.0792745053768158,
+       "learning_rate": 1.512933636625089e-05,
+       "loss": 0.2073,
+       "step": 1150
+     },
+     {
+       "epoch": 1.92,
+       "grad_norm": 0.07648582756519318,
+       "learning_rate": 3.8758931591217575e-06,
+       "loss": 0.209,
+       "step": 1200
+     },
+     {
+       "epoch": 2.0,
+       "grad_norm": 0.0787830799818039,
+       "learning_rate": 1.4925668450960217e-09,
+       "loss": 0.2124,
+       "step": 1250
+     },
+     {
+       "epoch": 2.0,
+       "step": 1250,
+       "total_flos": 1.62594677587968e+18,
+       "train_loss": 0.25041088790893556,
+       "train_runtime": 3377.6799,
+       "train_samples_per_second": 23.685,
+       "train_steps_per_second": 0.37
+     }
+   ],
+   "logging_steps": 50,
+   "max_steps": 1250,
+   "num_input_tokens_seen": 0,
+   "num_train_epochs": 2,
+   "save_steps": 0,
+   "stateful_callbacks": {
+     "TrainerControl": {
+       "args": {
+         "should_epoch_stop": false,
+         "should_evaluate": false,
+         "should_log": false,
+         "should_save": false,
+         "should_training_stop": false
+       },
+       "attributes": {}
+     }
+   },
+   "total_flos": 1.62594677587968e+18,
+   "train_batch_size": 32,
+   "trial_name": null,
+   "trial_params": null
+ }
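
A quick consistency check on the throughput numbers above; the gradient-accumulation factor is inferred, not recorded in this file:

    # Values copied from trainer_state.json above.
    runtime_s = 3377.6799
    samples = 23.685 * runtime_s         # ~80,000 examples over 2 epochs
    steps = 0.37 * runtime_s             # ~1,250 optimizer steps
    samples_per_step = samples / steps   # ~64, vs. "train_batch_size": 32
    # => effective batch of 64: consistent with gradient accumulation of 2
    #    or two data-parallel workers (an assumption; not stored here).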
nl_tasks/inference/MATH_infer.py ADDED
@@ -0,0 +1,132 @@
+ import argparse
+ import os
+ import random
+ import sys
+
+ import jsonlines
+ import numpy as np
+ import torch
+ from vllm import LLM, SamplingParams
+
+ import util
+
+ MAX_INT = sys.maxsize
+ INVALID_ANS = "[invalid]"
+ MAX_TOKEN = 1408
+
+ invalid_outputs = []
+
+
+ def remove_boxed(s):
+     # Strip a leading "\boxed{" and trailing "}"; return None if the format is absent.
+     left = "\\boxed{"
+     try:
+         assert s[:len(left)] == left
+         assert s[-1] == "}"
+         return s[len(left):-1]
+     except (AssertionError, TypeError):
+         return None
+
+
+ def process_results(doc, completion, answer):
+     # Grade one completion: take the text after "The answer is: " and
+     # compare it to the reference answer with util.is_equiv.
+     split_ans = completion.split('The answer is: ')
+     if len(split_ans) > 1:
+         ans = split_ans[-1]
+         extract_ans_temp = ans.split('.\n')[0]
+         extract_ans_temp = extract_ans_temp.strip()
+         if len(extract_ans_temp) > 0 and extract_ans_temp[-1] == '.':
+             extract_ans = extract_ans_temp[0:-1]
+         else:
+             extract_ans = extract_ans_temp
+         extract_ans = extract_ans.strip()
+         if util.is_equiv(extract_ans, answer):
+             return True
+         else:
+             return False
+     else:
+         temp = {'question': doc, 'output': completion, 'answer': answer}
+         invalid_outputs.append(temp)
+         return False
+
+
+ def batch_data(data_list, batch_size=1):
+     # Split data_list into batches; kept for reference, currently unused below.
+     n = len(data_list) // batch_size
+     batches = []
+     for i in range(n - 1):
+         start = i * batch_size
+         end = (i + 1) * batch_size
+         batches.append(data_list[start:end])
+
+     last_start = (n - 1) * batch_size
+     batches.append(data_list[last_start:])
+     return batches
+
+
+ def test_hendrycks_math(model, data_path, start=0, end=MAX_INT, batch_size=1, tensor_parallel_size=1):
+     hendrycks_math_ins = []
+     hendrycks_math_answers = []
+     problem_prompt = (
+         "Below is an instruction that describes a task. "
+         "Write a response that appropriately completes the request.\n\n"
+         "### Instruction:\n{instruction}\n\n### Response: Let's think step by step."
+     )
+     print('prompt =====', problem_prompt)
+     with open(data_path, "r", encoding="utf8") as f:
+         for idx, item in enumerate(jsonlines.Reader(f)):
+             temp_instr = problem_prompt.format(instruction=item["instruction"])
+             hendrycks_math_ins.append(temp_instr)
+             solution = item['output']
+             temp_ans = remove_boxed(util.last_boxed_only_string(solution))
+             hendrycks_math_answers.append(temp_ans)
+
+     print('total length ===', len(hendrycks_math_ins))
+     hendrycks_math_ins = hendrycks_math_ins[start:end]
+     hendrycks_math_answers = hendrycks_math_answers[start:end]
+     print('length ====', len(hendrycks_math_ins))
+     # batch_hendrycks_math_ins = batch_data(hendrycks_math_ins, batch_size=batch_size)
+
+     stop_tokens = ["Instruction:", "Instruction", "Response:", "Response"]
+     sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=MAX_TOKEN, stop=stop_tokens)
+     print('sampling =====', sampling_params)
+     llm = LLM(model=model, tensor_parallel_size=tensor_parallel_size, gpu_memory_utilization=0.95)
+
+     outputs = llm.generate(hendrycks_math_ins, sampling_params)
+     res_completions = [output.outputs[0].text for output in outputs]
+
+     results = []
+     for idx, (prompt, completion, prompt_answer) in enumerate(zip(hendrycks_math_ins, res_completions, hendrycks_math_answers)):
+         res = process_results(prompt, completion, prompt_answer)
+         results.append(res)
+
+     acc = sum(results) / len(results)
+     print('len invalid outputs ====', len(invalid_outputs))
+     # print('start===', start, ', end====', end)
+     print('length====', len(results), ', acc====', acc * 100)
+
+     # Append the accuracy to output.txt in the parent directory of the model path.
+     current_path = args.model
+     parent_dir = os.path.dirname(current_path.rstrip('/'))
+     output_filename = os.path.join(parent_dir, 'output.txt')
+     with open(output_filename, "a", encoding="utf-8") as f:
+         print(f'\n MATH math MAX TOKEN = {MAX_TOKEN}, length==== {len(results)}, math acc %====, {acc * 100}', file=f)
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model", type=str)  # model path
+     parser.add_argument("--data_file", type=str, default='data/MATH_test.jsonl')  # data path
+     parser.add_argument("--start", type=int, default=0)  # start index
+     parser.add_argument("--end", type=int, default=MAX_INT)  # end index
+     parser.add_argument("--batch_size", type=int, default=50)  # batch size
+     parser.add_argument("--tensor_parallel_size", type=int, default=1)  # tensor parallel size
+     return parser.parse_args()
+
+
+ def set_deterministic_seed(seed=42):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+     # torch.backends.cudnn.deterministic = True
+     # torch.backends.cudnn.benchmark = False
+
+
+ if __name__ == "__main__":
+     args = parse_args()
+     set_deterministic_seed()
+     test_hendrycks_math(model=args.model, data_path=args.data_file, start=args.start, end=args.end, batch_size=args.batch_size, tensor_parallel_size=args.tensor_parallel_size)
+     print('math ends', args.model)
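
A typical invocation, assuming a vLLM-loadable model directory and the default data file (both paths are placeholders):

    python MATH_infer.py --model /path/to/model --data_file data/MATH_test.jsonl --tensor_parallel_size 1

With temperature=0 (greedy decoding) and the fixed seed, repeated runs should report the same accuracy up to vLLM's kernel-level nondeterminism.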
nl_tasks/inference/grader.py ADDED
@@ -0,0 +1,141 @@
+ """
+ This logic is largely copied from Hendrycks' MATH release (math_equivalence) and borrowed from:
+ - https://github.com/microsoft/ProphetNet/tree/master/CRITIC
+ """
+ import multiprocessing
+ from math import isclose
+ from typing import Union
+
+ from sympy import simplify, N
+ from sympy.parsing.sympy_parser import parse_expr
+ from sympy.parsing.latex import parse_latex
+
+
+ def is_digit(s):
+     # True if s parses as a float once thousands separators are removed.
+     try:
+         float(str(s).replace(",", ""))
+         return True
+     except ValueError:
+         return False
+
+
+ def math_equal(prediction: Union[bool, float, str],
+                reference: Union[float, str],
+                include_percentage: bool = True,
+                is_close: bool = True,
+                timeout: bool = False,
+                ) -> bool:
+     """
+     Exact match of math if and only if:
+     1. numerical equal: both can convert to float and are equal
+     2. symbolic equal: both can convert to sympy expressions and are equal
+     """
+     try:  # 1. numerical equal
+         if is_digit(prediction) and is_digit(reference):
+             prediction = float(str(prediction).replace(",", ""))
+             reference = float(str(reference).replace(",", ""))
+             # Percentage questions: accept x, x/100, and x*100.
+             if include_percentage:
+                 gt_result = [reference / 100, reference, reference * 100]
+             else:
+                 gt_result = [reference]
+             for item in gt_result:
+                 try:
+                     if is_close:
+                         if isclose(item, prediction, rel_tol=1e-4):
+                             return True
+                     else:
+                         if item == prediction:
+                             return True
+                 except Exception:
+                     continue
+             return False
+     except Exception:
+         pass
+
+     if not prediction and prediction not in [0, False]:
+         return False
+
+     # 2. symbolic equal
+     reference = str(reference).strip()
+     prediction = str(prediction).strip()
+
+     ## deal with [], (), {}
+     pred_str, ref_str = prediction, reference
+     if (prediction.startswith("[") and prediction.endswith("]") and not reference.startswith("(")) or \
+        (prediction.startswith("(") and prediction.endswith(")") and not reference.startswith("[")):
+         pred_str = pred_str.strip("[]()")
+         ref_str = ref_str.strip("[]()")
+     for s in ['{', "}", "(", ")"]:
+         ref_str = ref_str.replace(s, "")
+         pred_str = pred_str.replace(s, "")
+     if pred_str == ref_str:
+         return True
+
+     ## [a, b] vs. [c, d]: equal iff a==c and b==d
+     if (prediction.startswith("[") and prediction.endswith("]")) and (reference.startswith("[") and reference.endswith("]")) or \
+        (prediction.startswith("(") and prediction.endswith(")")) and (reference.startswith("(") and reference.endswith(")")):
+         pred_parts = prediction[1:-1].split(",")
+         ref_parts = reference[1:-1].split(",")
+         if len(pred_parts) == len(ref_parts):
+             if all([math_equal(pred_parts[i], ref_parts[i], include_percentage, is_close) for i in range(len(pred_parts))]):
+                 return True
+
+     # symbolic equality via sympy, optionally guarded by a subprocess timeout
+     if timeout:
+         if call_with_timeout(symbolic_equal_process, prediction, reference):
+             return True
+     else:
+         if symbolic_equal(prediction, reference):
+             return True
+
+     return False
+
+
+ def math_equal_process(param):
+     return math_equal(param[-2], param[-1])
+
+
+ def symbolic_equal(a, b):
+     # Parse each side as LaTeX first, then as a plain sympy expression.
+     def _parse(s):
+         for f in [parse_latex, parse_expr]:
+             try:
+                 return f(s)
+             except Exception:
+                 pass
+         return s
+     a = _parse(a)
+     b = _parse(b)
+
+     try:
+         if simplify(a - b) == 0:
+             return True
+     except Exception:
+         pass
+
+     try:
+         if isclose(N(a), N(b), rel_tol=1e-3):
+             return True
+     except Exception:
+         pass
+     return False
+
+
+ def symbolic_equal_process(a, b, output_queue):
+     result = symbolic_equal(a, b)
+     output_queue.put(result)
+
+
+ def call_with_timeout(func, *args, timeout=1, **kwargs):
+     # Run func in a subprocess; give up (return False) after `timeout` seconds.
+     output_queue = multiprocessing.Queue()
+     process_args = args + (output_queue,)
+     process = multiprocessing.Process(target=func, args=process_args, kwargs=kwargs)
+     process.start()
+     process.join(timeout)
+
+     if process.is_alive():
+         process.terminate()
+         process.join()
+         return False
+
+     return output_queue.get()
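
A quick sanity check of the matching logic (inputs chosen for illustration; the first case needs sympy's LaTeX parser, i.e. the antlr4 Python runtime, to be installed):

    from grader import math_equal

    math_equal("0.5", "\\frac{1}{2}")  # True: sympy simplifies the difference to 0
    math_equal("50", "0.5")            # True: include_percentage accepts reference * 100
    math_equal("1/3", "0.3333")        # True: N() comparison with rel_tol=1e-3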
nl_tasks/inference/gsm8k_infer.py ADDED
@@ -0,0 +1,157 @@
+ import argparse
+ import os
+ import random
+ import re
+ import sys
+
+ import jsonlines
+ import numpy as np
+ import torch
+ from fraction import Fraction
+ from vllm import LLM, SamplingParams
+
+ from grader import math_equal
+
+ MAX_INT = sys.maxsize
+ MAX_TOKEN = 1024
+
+
+ def is_number(s):
+     try:
+         float(s)
+         return True
+     except ValueError:
+         pass
+     try:
+         import unicodedata
+         unicodedata.numeric(s)
+         return True
+     except (TypeError, ValueError):
+         pass
+     return False
+
+
+ def extract_answer_number(completion):
+     # Pull the final numeric answer after "The answer is: "; handles thousands
+     # separators, decimals, and simple fractions. Returns None when no
+     # parseable number is found.
+     text = completion.split('The answer is: ')
+     if len(text) > 1:
+         extract_ans = text[-1].strip()
+         match = re.search(r'[\-+]?\d*[\.,/]?\d+', extract_ans)
+         if match:
+             if '/' in match.group():
+                 denominator = match.group().split('/')[1]
+                 numerator = match.group().split('/')[0]
+                 if is_number(denominator) and is_number(numerator):
+                     if denominator == '0':
+                         return round(float(numerator.replace(',', '')))
+                     else:
+                         frac = Fraction(match.group().replace(',', ''))
+                         num_numerator = frac.numerator
+                         num_denominator = frac.denominator
+                         return round(float(num_numerator / num_denominator))
+                 else:
+                     return None
+             else:
+                 if float(match.group().replace(',', '')) == float('inf'):
+                     return None
+                 return round(float(match.group().replace(',', '')))
+         else:
+             return None
+     else:
+         return None
+
+
+ def batch_data(data_list, batch_size=1):
+     # Split data_list into batches; kept for reference, currently unused below.
+     n = len(data_list) // batch_size
+     batches = []
+     for i in range(n - 1):
+         start = i * batch_size
+         end = (i + 1) * batch_size
+         batches.append(data_list[start:end])
+
+     last_start = (n - 1) * batch_size
+     batches.append(data_list[last_start:])
+     return batches
+
+
+ def gsm8k_test(model, data_path, start=0, end=MAX_INT, batch_size=1, tensor_parallel_size=1):
+     INVALID_ANS = "[invalid]"
+     gsm8k_ins = []
+     gsm8k_answers = []
+     problem_prompt = (
+         "Below is an instruction that describes a task. "
+         "Write a response that appropriately completes the request.\n\n"
+         "### Instruction:\n{instruction}\n\n### Response: Let's think step by step."
+     )
+     print('prompt =====', problem_prompt)
+     with open(data_path, "r", encoding="utf8") as f:
+         for idx, item in enumerate(jsonlines.Reader(f)):
+             temp_instr = problem_prompt.format(instruction=item["question"])
+             gsm8k_ins.append(temp_instr)
+             # Gold answers follow "#### " in the GSM8K answer field.
+             temp_ans = item['answer'].split('#### ')[1]
+             temp_ans = int(temp_ans.replace(',', ''))
+             gsm8k_answers.append(temp_ans)
+
+     gsm8k_ins = gsm8k_ins[start:end]
+     gsm8k_answers = gsm8k_answers[start:end]
+     print('length ====', len(gsm8k_ins))
+     # batch_gsm8k_ins = batch_data(gsm8k_ins, batch_size=batch_size)
+
+     stop_tokens = ["Instruction:", "Instruction", "Response:", "Response"]
+     sampling_params = SamplingParams(temperature=0, top_p=1, max_tokens=MAX_TOKEN, stop=stop_tokens)
+     print('sampling =====', sampling_params)
+     llm = LLM(model=model, tensor_parallel_size=tensor_parallel_size, gpu_memory_utilization=0.90)
+
+     result = []
+
+     outputs = llm.generate(gsm8k_ins, sampling_params)
+     res_completions = [output.outputs[0].text for output in outputs]
+
+     invalid_outputs = []
+     for idx, (prompt, completion, prompt_answer) in enumerate(zip(gsm8k_ins, res_completions, gsm8k_answers)):
+         doc = {'question': prompt}
+         y_pred = extract_answer_number(completion)
+         if y_pred is not None:
+             result.append(float(y_pred) == float(prompt_answer) or math_equal(y_pred, prompt_answer))
+         else:
+             result.append(False)
+             temp = {'question': prompt, 'output': completion, 'answer': prompt_answer}
+             invalid_outputs.append(temp)
+
+     acc = sum(result) / len(result)
+     print('len invalid outputs ====', len(invalid_outputs))
+     # print('start===', start, ', end====', end)
+     print('gsm8k length====', len(result), ', gsm8k acc %====', acc * 100)
+
+     # Append the accuracy to output.txt in the parent directory of the model path.
+     current_path = args.model
+     parent_dir = os.path.dirname(current_path.rstrip('/'))
+     output_filename = os.path.join(parent_dir, 'output.txt')
+     with open(output_filename, "a", encoding="utf-8") as f:
+         print(f'\n gsm8k MAX TOKEN = {MAX_TOKEN}, length==== {len(result)}, gsm8k acc %====, {acc * 100}', file=f)
+
+
+ def parse_args():
+     parser = argparse.ArgumentParser()
+     parser.add_argument("--model", type=str)  # model path
+     parser.add_argument("--data_file", type=str, default='data/gsm8k_test.jsonl')  # data path
+     parser.add_argument("--start", type=int, default=0)  # start index
+     parser.add_argument("--end", type=int, default=MAX_INT)  # end index
+     parser.add_argument("--batch_size", type=int, default=60)  # batch size
+     parser.add_argument("--tensor_parallel_size", type=int, default=1)  # tensor parallel size
+     return parser.parse_args()
+
+
+ def set_deterministic_seed(seed=42):
+     random.seed(seed)
+     np.random.seed(seed)
+     torch.manual_seed(seed)
+     torch.cuda.manual_seed_all(seed)
+     # torch.backends.cudnn.deterministic = True
+     # torch.backends.cudnn.benchmark = False
+
+
+ if __name__ == "__main__":
+     args = parse_args()
+     set_deterministic_seed()
+     gsm8k_test(model=args.model, data_path=args.data_file, start=args.start, end=args.end, batch_size=args.batch_size, tensor_parallel_size=args.tensor_parallel_size)
+     print('gsm ends', args.model)
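
The answer-extraction step can be exercised on its own (strings below are illustrative):

    from gsm8k_infer import extract_answer_number

    extract_answer_number("... so she pays $18. The answer is: 18")       # -> 18
    extract_answer_number("... 1,234 km in total. The answer is: 1,234")  # -> 1234
    extract_answer_number("The answer is unclear.")                       # -> None (no 'The answer is: ' marker)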
nl_tasks/inference/util.py ADDED
@@ -0,0 +1,253 @@
+ from grader import math_equal
+
+
+ def last_boxed_only(sample):
+     q, a = sample
+     a = last_boxed_only_string(a)
+     if a is None:
+         return None
+     return (q, a)
+
+
+ def last_boxed_only_string(string):
+     # Return the last "\boxed{...}" (or "\fbox{...}") substring, braces included.
+     idx = string.rfind("\\boxed")
+     if idx < 0:
+         idx = string.rfind("\\fbox")
+         if idx < 0:
+             return None
+
+     i = idx
+     right_brace_idx = None
+     num_left_braces_open = 0
+     while i < len(string):
+         if string[i] == "{":
+             num_left_braces_open += 1
+         if string[i] == "}":
+             num_left_braces_open -= 1
+             if num_left_braces_open == 0:
+                 right_brace_idx = i
+                 break
+         i += 1
+
+     if right_brace_idx is None:
+         retval = None
+     else:
+         retval = string[idx:right_brace_idx + 1]
+
+     return retval
+
+
+ def only_until_first_boxed_from_tokens(string, tokens):
+     idx = string.find("\\boxed")
+     if idx < 0:
+         idx = string.find("\\fbox")
+         if idx < 0:
+             return None
+
+     cum_length = 0
+     for i, t in enumerate(tokens):
+         cum_length += len(t)
+         if cum_length >= idx:
+             break
+
+     return tokens[:i]
+
+
+ def clean_numbers(sample):
+     if not sample:
+         return None
+     new_sample = list()
+     for s in sample:
+         new_sample.append(_clean_numbers(s))
+
+     return tuple(new_sample)
+
+
+ def _clean_numbers(string):
+     """
+     Insert thousands separators into long digit runs in the given string.
+
+     >>> _clean_numbers("Hello 123")
+     'Hello 123'
+     >>> _clean_numbers("Hello 1234")
+     'Hello 1,234'
+     >>> _clean_numbers("Hello 1234324asdasd")
+     'Hello 1,234,324asdasd'
+     """
+     num_prev_digits = 0
+     new_string = ""
+     for i, c in enumerate(string):
+         # isdigit() doesn't work here because of weird unicode chars.
+         if c in {'1', '2', '3', '4', '5', '6', '7', '8', '9', '0'}:
+             num_prev_digits += 1
+         else:
+             if num_prev_digits > 3:
+                 # Reformat the digit run that just ended.
+                 string_number = new_string[-num_prev_digits:]
+                 new_string = new_string[:-num_prev_digits] + "{0:,}".format(int(string_number))
+             num_prev_digits = 0
+         new_string += c
+
+     if num_prev_digits > 3:
+         # Handle a digit run at the very end of the string.
+         string_number = new_string[-num_prev_digits:]
+         new_string = new_string[:-num_prev_digits] + "{0:,}".format(int(string_number))
+
+     return new_string
+
+
+ def fix_fracs(string):
+     # Rewrite "\frac12"-style fractions as "\frac{1}{2}".
+     substrs = string.split("\\frac")
+     new_str = substrs[0]
+     if len(substrs) > 1:
+         substrs = substrs[1:]
+         for substr in substrs:
+             new_str += "\\frac"
+             if substr[0] == "{":
+                 new_str += substr
+             else:
+                 try:
+                     assert len(substr) >= 2
+                 except AssertionError:
+                     return string
+                 a = substr[0]
+                 b = substr[1]
+                 if b != "{":
+                     if len(substr) > 2:
+                         post_substr = substr[2:]
+                         new_str += "{" + a + "}{" + b + "}" + post_substr
+                     else:
+                         new_str += "{" + a + "}{" + b + "}"
+                 else:
+                     if len(substr) > 2:
+                         post_substr = substr[2:]
+                         new_str += "{" + a + "}" + b + post_substr
+                     else:
+                         new_str += "{" + a + "}" + b
+     string = new_str
+     return string
+
+
+ def fix_a_slash_b(string):
+     # Rewrite a bare "a/b" (integers only) as "\frac{a}{b}".
+     if len(string.split("/")) != 2:
+         return string
+     a = string.split("/")[0]
+     b = string.split("/")[1]
+     try:
+         a = int(a)
+         b = int(b)
+         assert string == "{}/{}".format(a, b)
+         new_string = "\\frac{" + str(a) + "}{" + str(b) + "}"
+         return new_string
+     except (AssertionError, ValueError):
+         return string
+
+
+ def remove_right_units(string):
+     # "\text{ " only ever occurs (at least in the val set) when describing units
+     if "\\text{ " in string:
+         splits = string.split("\\text{ ")
+         assert len(splits) == 2
+         return splits[0]
+     else:
+         return string
+
+
+ def fix_sqrt(string):
+     # Rewrite "\sqrt3" as "\sqrt{3}".
+     if "\\sqrt" not in string:
+         return string
+     splits = string.split("\\sqrt")
+     new_string = splits[0]
+     for split in splits[1:]:
+         if split[0] != "{":
+             a = split[0]
+             new_substr = "\\sqrt{" + a + "}" + split[1:]
+         else:
+             new_substr = "\\sqrt" + split
+         new_string += new_substr
+     return new_string
+
+
+ def strip_string(string):
+     # Normalize a LaTeX answer string before comparison.
+     # linebreaks
+     string = string.replace("\n", "")
+
+     # remove inverse spaces
+     string = string.replace("\\!", "")
+
+     # replace \\ with \
+     string = string.replace("\\\\", "\\")
+
+     # replace tfrac and dfrac with frac
+     string = string.replace("tfrac", "frac")
+     string = string.replace("dfrac", "frac")
+
+     # remove \left and \right
+     string = string.replace("\\left", "")
+     string = string.replace("\\right", "")
+
+     # remove circ (degrees)
+     string = string.replace("^{\\circ}", "")
+     string = string.replace("^\\circ", "")
+
+     # remove dollar signs
+     string = string.replace("\\$", "")
+
+     # remove units (on the right)
+     string = remove_right_units(string)
+
+     # remove percentage
+     string = string.replace("\\%", "")
+     string = string.replace("\%", "")  # noqa: W605
+
+     # " 0." equivalent to " ." and "{0." equivalent to "{." Alternatively, add "0" if "." is the start of the string
+     string = string.replace(" .", " 0.")
+     string = string.replace("{.", "{0.")
+     # if empty, return empty string
+     if len(string) == 0:
+         return string
+     if string[0] == ".":
+         string = "0" + string
+
+     # to consider: get rid of e.g. "k = " or "q = " at beginning
+     if len(string.split("=")) == 2:
+         if len(string.split("=")[0]) <= 2:
+             string = string.split("=")[1]
+
+     # fix sqrt3 --> sqrt{3}
+     string = fix_sqrt(string)
+
+     # remove spaces
+     string = string.replace(" ", "")
+
+     # \frac1b or \frac12 --> \frac{1}{b} and \frac{1}{2}, etc. Even works with \frac1{72} (but not \frac{72}1). Also does a/b --> \frac{a}{b}
+     string = fix_fracs(string)
+
+     # manually change 0.5 --> \frac{1}{2}
+     if string == "0.5":
+         string = "\\frac{1}{2}"
+
+     # NOTE: X/Y changed to \frac{X}{Y} in dataset, but in simple cases fix in case the model output is X/Y
+     string = fix_a_slash_b(string)
+
+     return string
+
+
+ def is_equiv(str1, str2, verbose=False):
+     if str1 is None and str2 is None:
+         print("WARNING: Both None")
+         return True
+     if str1 is None or str2 is None:
+         return False
+
+     try:
+         ss1 = strip_string(str1)
+         ss2 = strip_string(str2)
+         if verbose:
+             print(ss1, ss2)
+         return math_equal(ss1, ss2) or ss1 == ss2
+     except Exception:
+         # Fall back to comparing the raw strings if normalization fails.
+         return math_equal(str1, str2) or str1 == str2
+
+
+ class NotEqual:
+     def __eq__(self, other):
+         return False
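
The effect of the normalization is easiest to see on small inputs (values chosen for illustration):

    from util import strip_string, is_equiv

    strip_string("\\frac12")  # -> '\\frac{1}{2}'
    strip_string(".5")        # -> '\\frac{1}{2}' (leading zero added, then the manual 0.5 rewrite)
    is_equiv("\\frac{1}{2}", "0.5")  # -> True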