VoCuc committed on
Commit
3abaef4
·
verified ·
1 Parent(s): 15febb1

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +24 -0
  2. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/README.md +207 -0
  3. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/adapter_config.json +46 -0
  4. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/adapter_model.bin +3 -0
  5. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/added_tokens.json +24 -0
  6. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/chat_template.jinja +54 -0
  7. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/merges.txt +0 -0
  8. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/special_tokens_map.json +25 -0
  9. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer.json +3 -0
  10. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer_config.json +207 -0
  11. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/vocab.json +0 -0
  12. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/added_tokens.json +24 -0
  13. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/args.json +1 -0
  14. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/chat_template.jinja +54 -0
  15. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/config.json +58 -0
  16. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/eval/0/answers.jsonl +0 -0
  17. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/generation_config.json +14 -0
  18. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/log.txt +44 -0
  19. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/merges.txt +0 -0
  20. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/model.safetensors +3 -0
  21. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/special_tokens_map.json +31 -0
  22. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/tokenizer.json +3 -0
  23. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/tokenizer_config.json +207 -0
  24. qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/vocab.json +0 -0
  25. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/args.json +1 -0
  26. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/eval/0/answers.jsonl +0 -0
  27. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/log.txt +234 -0
  28. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/README.md +207 -0
  29. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/adapter_config.json +46 -0
  30. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/adapter_model.bin +3 -0
  31. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/added_tokens.json +24 -0
  32. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/chat_template.jinja +54 -0
  33. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/merges.txt +0 -0
  34. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/special_tokens_map.json +25 -0
  35. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/tokenizer.json +3 -0
  36. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/tokenizer_config.json +207 -0
  37. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/vocab.json +0 -0
  38. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/README.md +207 -0
  39. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/adapter_config.json +46 -0
  40. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/adapter_model.bin +3 -0
  41. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/added_tokens.json +24 -0
  42. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/chat_template.jinja +54 -0
  43. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/merges.txt +0 -0
  44. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/special_tokens_map.json +25 -0
  45. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/tokenizer.json +3 -0
  46. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/tokenizer_config.json +207 -0
  47. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/vocab.json +0 -0
  48. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/README.md +207 -0
  49. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/adapter_config.json +46 -0
  50. qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/adapter_model.bin +3 -0
.gitattributes CHANGED
@@ -100,3 +100,27 @@ eval_results/vllm/qwen2.5-1.5B-it-nnm0.1_K128_L4_epoch1_lr1e-4_kdr1.0-1246/resul
100
  qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch1_lr1e-4_kdr1.0/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
101
  layer_analysis/combined.png filter=lfs diff=lfs merge=lfs -text
102
  layer_analysis/curvature.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch1_lr1e-4_kdr1.0/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
101
  layer_analysis/combined.png filter=lfs diff=lfs merge=lfs -text
102
  layer_analysis/curvature.png filter=lfs diff=lfs merge=lfs -text
103
+ qwen2.5-1.5B-Instruct\#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer.json filter=lfs diff=lfs merge=lfs -text
104
+ qwen2.5-1.5B-Instruct\#amid/ab_pr_0.5_0.5_4_1e-4/tokenizer.json filter=lfs diff=lfs merge=lfs -text
105
+ qwen2.5-1.5B-Instruct\#csd/ab_pr_0.5_0.5_8_1e-4/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
106
+ qwen2.5-1.5B-Instruct\#csd/ab_pr_0.5_0.5_8_1e-4/4984/tokenizer.json filter=lfs diff=lfs merge=lfs -text
107
+ qwen2.5-1.5B-Instruct\#csd/ab_pr_0.5_0.5_8_1e-4/7476/tokenizer.json filter=lfs diff=lfs merge=lfs -text
108
+ qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch1_lr1e-4_kdr1.0/1246_full/tokenizer.json filter=lfs diff=lfs merge=lfs -text
109
+ qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch2_lr1e-4_kdr1.0/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
110
+ qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch2_lr1e-4_kdr1.0/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
111
+ qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.9_K128_L4_epoch2_lr1e-4_kdr0.75/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
112
+ qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.9_K128_L4_epoch2_lr1e-4_kdr0.75/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
113
+ qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm1.0_K128_L4_epoch1_lr1e-4_kdr1.0/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
114
+ qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm1.0_K128_L4_epoch2_lr1e-4_kdr0.75/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
115
+ qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm1.0_K128_L4_epoch2_lr1e-4_kdr0.75/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
116
+ qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm1.0_K128_L4_epoch2_lr1e-4_kdr0.75/tokenizer.json filter=lfs diff=lfs merge=lfs -text
117
+ qwen3-1.7B\#amid/ab_pr_0.5_0.5_4_1e-4/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
118
+ qwen3-1.7B\#amid/ab_pr_0.5_0.5_4_1e-4/4984/tokenizer.json filter=lfs diff=lfs merge=lfs -text
119
+ qwen3-1.7B\#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer.json filter=lfs diff=lfs merge=lfs -text
120
+ qwen3-1.7B\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch1_lr1e-4_kdr0.75/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
121
+ qwen3-1.7B\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch1_lr1e-4_kdr1.0/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
122
+ qwen3-1.7B\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch2_lr1e-4_kdr1.0/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
123
+ qwen3-1.7B\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch2_lr1e-4_kdr1.0/4984/tokenizer.json filter=lfs diff=lfs merge=lfs -text
124
+ qwen3-1.7B\#sfkl_nnm_lora/nnm0.9_K128_L4_epoch2_lr1e-4_kdr0.75/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
125
+ qwen3-1.7B\#sfkl_nnm_lora/nnm0.9_K128_L4_epoch2_lr1e-4_kdr0.75/4984/tokenizer.json filter=lfs diff=lfs merge=lfs -text
126
+ qwen3-1.7B\#sfkl_nnm_lora/nnm1.0_K128_L4_epoch1_lr1e-4_kdr1.0/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-1.5B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-1.5B-Instruct
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 128,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "gate_proj",
34
+ "down_proj",
35
+ "up_proj",
36
+ "v_proj",
37
+ "k_proj",
38
+ "o_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:757c433b9241ddd9c09d5aeeb342f38b8298d8a5f6287556aa81da1ed75da682
3
+ size 504133205
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": "<|im_end|>"
25
+ }
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|im_end|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/args.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_path": "Qwen/Qwen2.5-1.5B-Instruct", "ckpt_name": "qwen2.5-1.5B-Instruct", "model_type": "gpt2", "teacher_model_type": null, "n_gpu": 2, "n_nodes": 1, "teacher_model_path": "Qwen/Qwen2.5-14B-Instruct", "teacher_ckpt_name": "qwen2.5-14B-Instruct", "teacher_model_fp16": true, "model_parallel": false, "model_parallel_size": null, "no_value": false, "dropout_path_rate": null, "fp32": false, "type": "adaptive-csd", "do_train": true, "do_valid": true, "do_eval": false, "base_path": ".", "load": null, "save": "./results/qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4", "log_interval": 10, "mid_log_num": -1, "save_interval": -1, "eval_interval": -1, "local_rank": 0, "save_additional_suffix": "", "save_rollout": false, "eb_sample_times": 3, "data_dir": "./processed_data/ultraInteract/Qwen/Qwen2.5-14B-Instruct/", "processed_data_dir": null, "force_process": false, "force_process_demo": false, "data_process_workers": -1, "train_num": -1, "train_ratio": 1, "dev_num": -1, "dev_ratio": 1, "gen_num": -1, "data_names": null, "prompt_type": null, "num_workers": 4, "max_prompt_length": 512, "min_prompt_length": 128, "json_data": false, "bin_data": false, "txt_data": false, "prompt_data_dir": null, "lm_data_dir": null, "eval_ppl": false, "eval_rw": false, "eval_gen": true, "only_prompt": false, "batch_size": 4, "eval_batch_size": 16, "clip_grad": 1.0, "total_iters": null, "train_iters_per_epoch": -1, "max_length": 1024, "seed": 10, "seed_order": 42, "seed_data": 42, "seed_ppo": 42, "seed_lm": 7, "epochs": 3, "training_epochs": 10000, "gradient_accumulation_steps": 2, "gradient_checkpointing": false, "attn_dtype": null, "lr": 0.0001, "lr_min": 1e-07, "weight_decay": 0.01, "loss_scale": 65536, "kd_ratio": 1.0, "warmup_iters": 0, "lr_decay_iters": null, "lr_decay_style": "cosine", "scheduler_name": "constant_trm", "reward_scaling": null, "cliprange_reward": 1, "ppo_epochs": null, "num_rollouts": 256, "num_rollouts_per_device": null, "cliprange": 0.2, "chunk_size": null, 
"gamma": 0.95, "length_norm": false, "single_step_reg": false, "teacher_mixed_alpha": null, "lm_coef": 1, "skew_alpha": 0.1, "student_gen": true, "gen_top_p": 1.0, "gen_num_beams": 1, "mixed_alpha": 0.5, "loss_eps": 0.1, "init_threshold": 0.0, "capacity": 1000, "replay_ratio": "decreasing", "delta_threshold": 0.1, "top_k": 0, "top_p": 1.0, "do_sample": true, "no_repeat_ngram_size": 6, "repetition_penalty": null, "num_beams": 1, "temperature": 1.0, "peft": "lora", "peft_lora_r": 16, "peft_lora_alpha": 128, "peft_lora_dropout": 0.05, "peft_name": null, "peft_path": null, "teacher_peft_name": null, "teacher_peft_path": null, "deepspeed": true, "deepspeed_config": "./configs/deepspeed/ds_config_zero0_bf16.json", "deepscale": false, "deepscale_config": null, "ab_alpha": 0.5, "ab_beta": 0.5, "amid_div_name": "ab", "amid_div_order": "pr", "amid_alpha": 0.5, "amid_lam": 0.5, "nnm": true, "nnm_ratio": 0.1, "nnm_n_layers": 4, "nnm_K": 128, "nnm_eta": 0.05, "nnm_T_dead": 50, "nnm_centroid_batches": 500, "nnm_d_prime": 256, "nnm_ns_iters": 5, "nnm_warmup_steps": 0, "nnm_ramp_steps": 0, "rank": 0, "world_size": 2}
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "Qwen2ForCausalLM"
4
+ ],
5
+ "attention_dropout": 0.0,
6
+ "bos_token_id": 151643,
7
+ "dtype": "float16",
8
+ "eos_token_id": 151645,
9
+ "hidden_act": "silu",
10
+ "hidden_size": 1536,
11
+ "initializer_range": 0.02,
12
+ "intermediate_size": 8960,
13
+ "layer_types": [
14
+ "full_attention",
15
+ "full_attention",
16
+ "full_attention",
17
+ "full_attention",
18
+ "full_attention",
19
+ "full_attention",
20
+ "full_attention",
21
+ "full_attention",
22
+ "full_attention",
23
+ "full_attention",
24
+ "full_attention",
25
+ "full_attention",
26
+ "full_attention",
27
+ "full_attention",
28
+ "full_attention",
29
+ "full_attention",
30
+ "full_attention",
31
+ "full_attention",
32
+ "full_attention",
33
+ "full_attention",
34
+ "full_attention",
35
+ "full_attention",
36
+ "full_attention",
37
+ "full_attention",
38
+ "full_attention",
39
+ "full_attention",
40
+ "full_attention",
41
+ "full_attention"
42
+ ],
43
+ "max_position_embeddings": 32768,
44
+ "max_window_layers": 21,
45
+ "model_type": "qwen2",
46
+ "num_attention_heads": 12,
47
+ "num_hidden_layers": 28,
48
+ "num_key_value_heads": 2,
49
+ "rms_norm_eps": 1e-06,
50
+ "rope_scaling": null,
51
+ "rope_theta": 1000000.0,
52
+ "sliding_window": null,
53
+ "tie_word_embeddings": true,
54
+ "transformers_version": "4.57.3",
55
+ "use_cache": true,
56
+ "use_sliding_window": false,
57
+ "vocab_size": 152064
58
+ }
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/eval/0/answers.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/generation_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 151643,
3
+ "do_sample": true,
4
+ "eos_token_id": [
5
+ 151645,
6
+ 151643
7
+ ],
8
+ "pad_token_id": 151643,
9
+ "repetition_penalty": 1.1,
10
+ "temperature": 0.7,
11
+ "top_k": 20,
12
+ "top_p": 0.8,
13
+ "transformers_version": "4.57.3"
14
+ }
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/log.txt ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ ============================== EXP at 2026-05-17 08:31:39 ==============================
4
+
5
+
6
+ ============================== EXP at 2026-05-17 08:32:21 ==============================
7
+
8
+
9
+ ============================== EXP at 2026-05-17 08:45:44 ==============================
10
+ dev | avg_loss: 1.9095982142857142 | {'exact_match': 0.0, 'rougeL': 6.7578} | threshold: 0.0
11
+ train | epoch 0 | Iter: 18/ 29904 | global iter: 10/ 14952 | loss: -0.3565 | ds_loss: -0.3565 | lr: 1.0000e-04 | scale: 1.0000 | micro time: 0.457 | step time: 0.831
12
+ train | epoch 0 | Iter: 38/ 29904 | global iter: 20/ 14952 | loss: -0.1421 | ds_loss: -0.1421 | lr: 1.0000e-04 | scale: 1.0000 | micro time: 0.458 | step time: 0.890
13
+ train | epoch 0 | Iter: 58/ 29904 | global iter: 30/ 14952 | loss: -0.1540 | ds_loss: -0.1540 | lr: 9.9999e-05 | scale: 1.0000 | micro time: 0.461 | step time: 0.890
14
+ train | epoch 0 | Iter: 78/ 29904 | global iter: 40/ 14952 | loss: -0.0845 | ds_loss: -0.0845 | lr: 9.9998e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.887
15
+ train | epoch 0 | Iter: 98/ 29904 | global iter: 50/ 14952 | loss: -0.0781 | ds_loss: -0.0781 | lr: 9.9997e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.888
16
+ train | epoch 0 | Iter: 118/ 29904 | global iter: 60/ 14952 | loss: -0.0858 | ds_loss: -0.0858 | lr: 9.9996e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.888
17
+ train | epoch 0 | Iter: 138/ 29904 | global iter: 70/ 14952 | loss: -0.0648 | ds_loss: -0.0648 | lr: 9.9995e-05 | scale: 1.0000 | micro time: 0.459 | step time: 0.889
18
+ train | epoch 0 | Iter: 158/ 29904 | global iter: 80/ 14952 | loss: -0.0911 | ds_loss: -0.0911 | lr: 9.9993e-05 | scale: 1.0000 | micro time: 0.463 | step time: 0.891
19
+ train | epoch 0 | Iter: 178/ 29904 | global iter: 90/ 14952 | loss: -0.0619 | ds_loss: -0.0619 | lr: 9.9991e-05 | scale: 1.0000 | micro time: 0.465 | step time: 0.893
20
+ train | epoch 0 | Iter: 198/ 29904 | global iter: 100/ 14952 | loss: -0.0579 | ds_loss: -0.0579 | lr: 9.9989e-05 | scale: 1.0000 | micro time: 0.459 | step time: 0.889
21
+ train | epoch 0 | Iter: 218/ 29904 | global iter: 110/ 14952 | loss: -0.0713 | ds_loss: -0.0713 | lr: 9.9987e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.891
22
+ train | epoch 0 | Iter: 238/ 29904 | global iter: 120/ 14952 | loss: -0.0651 | ds_loss: -0.0651 | lr: 9.9984e-05 | scale: 1.0000 | micro time: 0.460 | step time: 0.890
23
+ train | epoch 0 | Iter: 258/ 29904 | global iter: 130/ 14952 | loss: -0.0634 | ds_loss: -0.0634 | lr: 9.9982e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.887
24
+ train | epoch 0 | Iter: 278/ 29904 | global iter: 140/ 14952 | loss: -0.0743 | ds_loss: -0.0743 | lr: 9.9979e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.886
25
+ train | epoch 0 | Iter: 298/ 29904 | global iter: 150/ 14952 | loss: -0.0756 | ds_loss: -0.0756 | lr: 9.9976e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.885
26
+ train | epoch 0 | Iter: 318/ 29904 | global iter: 160/ 14952 | loss: -0.0628 | ds_loss: -0.0628 | lr: 9.9972e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.885
27
+ train | epoch 0 | Iter: 338/ 29904 | global iter: 170/ 14952 | loss: -0.0577 | ds_loss: -0.0577 | lr: 9.9969e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.885
28
+ train | epoch 0 | Iter: 358/ 29904 | global iter: 180/ 14952 | loss: -0.0550 | ds_loss: -0.0550 | lr: 9.9965e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.887
29
+ train | epoch 0 | Iter: 378/ 29904 | global iter: 190/ 14952 | loss: -0.0704 | ds_loss: -0.0704 | lr: 9.9961e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.886
30
+ train | epoch 0 | Iter: 398/ 29904 | global iter: 200/ 14952 | loss: -0.0640 | ds_loss: -0.0640 | lr: 9.9956e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.886
31
+ train | epoch 0 | Iter: 418/ 29904 | global iter: 210/ 14952 | loss: -0.0508 | ds_loss: -0.0508 | lr: 9.9952e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.893
32
+ train | epoch 0 | Iter: 438/ 29904 | global iter: 220/ 14952 | loss: -0.0819 | ds_loss: -0.0819 | lr: 9.9947e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.886
33
+ train | epoch 0 | Iter: 458/ 29904 | global iter: 230/ 14952 | loss: -0.0545 | ds_loss: -0.0545 | lr: 9.9942e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.884
34
+ train | epoch 0 | Iter: 478/ 29904 | global iter: 240/ 14952 | loss: -0.0610 | ds_loss: -0.0610 | lr: 9.9937e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.884
35
+ train | epoch 0 | Iter: 498/ 29904 | global iter: 250/ 14952 | loss: -0.0512 | ds_loss: -0.0512 | lr: 9.9932e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.884
36
+ train | epoch 0 | Iter: 518/ 29904 | global iter: 260/ 14952 | loss: -0.0642 | ds_loss: -0.0642 | lr: 9.9926e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.882
37
+ train | epoch 0 | Iter: 538/ 29904 | global iter: 270/ 14952 | loss: -0.0743 | ds_loss: -0.0743 | lr: 9.9920e-05 | scale: 1.0000 | micro time: 0.463 | step time: 0.888
38
+ train | epoch 0 | Iter: 558/ 29904 | global iter: 280/ 14952 | loss: -0.0724 | ds_loss: -0.0724 | lr: 9.9914e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.887
39
+ train | epoch 0 | Iter: 578/ 29904 | global iter: 290/ 14952 | loss: -0.0440 | ds_loss: -0.0440 | lr: 9.9908e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.889
40
+ train | epoch 0 | Iter: 598/ 29904 | global iter: 300/ 14952 | loss: -0.0607 | ds_loss: -0.0607 | lr: 9.9901e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.889
41
+ train | epoch 0 | Iter: 618/ 29904 | global iter: 310/ 14952 | loss: -0.0571 | ds_loss: -0.0571 | lr: 9.9895e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.888
42
+ train | epoch 0 | Iter: 638/ 29904 | global iter: 320/ 14952 | loss: -0.0569 | ds_loss: -0.0569 | lr: 9.9888e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.887
43
+ train | epoch 0 | Iter: 658/ 29904 | global iter: 330/ 14952 | loss: -0.0508 | ds_loss: -0.0508 | lr: 9.9881e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.887
44
+ train | epoch 0 | Iter: 678/ 29904 | global iter: 340/ 14952 | loss: -0.0679 | ds_loss: -0.0679 | lr: 9.9873e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.886
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f62e5312da0d56c793167b890fd7cdc2e9eb01cc7533967bfcc1023e72067c9
3
+ size 3087860024
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/special_tokens_map.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": {
25
+ "content": "<|endoftext|>",
26
+ "lstrip": false,
27
+ "normalized": false,
28
+ "rstrip": false,
29
+ "single_word": false
30
+ }
31
+ }
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/args.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_path": "Qwen/Qwen2.5-1.5B-Instruct", "ckpt_name": "qwen2.5-1.5B-Instruct", "model_type": "gpt2", "teacher_model_type": null, "n_gpu": 2, "n_nodes": 1, "teacher_model_path": "Qwen/Qwen2.5-14B-Instruct", "teacher_ckpt_name": "qwen2.5-14B-Instruct", "teacher_model_fp16": true, "model_parallel": false, "model_parallel_size": null, "no_value": false, "dropout_path_rate": null, "fp32": false, "type": "adaptive-csd", "do_train": true, "do_valid": true, "do_eval": false, "base_path": ".", "load": null, "save": "./results/qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4", "log_interval": 10, "mid_log_num": -1, "save_interval": -1, "eval_interval": -1, "local_rank": 0, "save_additional_suffix": "", "save_rollout": false, "eb_sample_times": 3, "data_dir": "./processed_data/ultraInteract/Qwen/Qwen2.5-14B-Instruct/", "processed_data_dir": null, "force_process": false, "force_process_demo": false, "data_process_workers": -1, "train_num": -1, "train_ratio": 1, "dev_num": -1, "dev_ratio": 1, "gen_num": -1, "data_names": null, "prompt_type": null, "num_workers": 4, "max_prompt_length": 512, "min_prompt_length": 128, "json_data": false, "bin_data": false, "txt_data": false, "prompt_data_dir": null, "lm_data_dir": null, "eval_ppl": false, "eval_rw": false, "eval_gen": true, "only_prompt": false, "batch_size": 4, "eval_batch_size": 16, "clip_grad": 1.0, "total_iters": null, "train_iters_per_epoch": -1, "max_length": 1024, "seed": 10, "seed_order": 42, "seed_data": 42, "seed_ppo": 42, "seed_lm": 7, "epochs": 3, "training_epochs": 10000, "gradient_accumulation_steps": 2, "gradient_checkpointing": false, "attn_dtype": null, "lr": 0.0001, "lr_min": 1e-07, "weight_decay": 0.01, "loss_scale": 65536, "kd_ratio": 1.0, "warmup_iters": 0, "lr_decay_iters": null, "lr_decay_style": "cosine", "scheduler_name": "constant_trm", "reward_scaling": null, "cliprange_reward": 1, "ppo_epochs": null, "num_rollouts": 256, "num_rollouts_per_device": null, "cliprange": 0.2, "chunk_size": null, 
"gamma": 0.95, "length_norm": false, "single_step_reg": false, "teacher_mixed_alpha": null, "lm_coef": 1, "skew_alpha": 0.1, "student_gen": true, "gen_top_p": 1.0, "gen_num_beams": 1, "mixed_alpha": 0.5, "loss_eps": 0.1, "init_threshold": 0.0, "capacity": 1000, "replay_ratio": "decreasing", "delta_threshold": 0.1, "top_k": 0, "top_p": 1.0, "do_sample": true, "no_repeat_ngram_size": 6, "repetition_penalty": null, "num_beams": 1, "temperature": 1.0, "peft": "lora", "peft_lora_r": 16, "peft_lora_alpha": 128, "peft_lora_dropout": 0.05, "peft_name": null, "peft_path": null, "teacher_peft_name": null, "teacher_peft_path": null, "deepspeed": true, "deepspeed_config": "./configs/deepspeed/ds_config_zero0_bf16.json", "deepscale": false, "deepscale_config": null, "ab_alpha": 0.5, "ab_beta": 0.5, "amid_div_name": "ab", "amid_div_order": "pr", "amid_alpha": 0.5, "amid_lam": 0.5, "nnm": true, "nnm_ratio": 0.1, "nnm_n_layers": 4, "nnm_K": 128, "nnm_eta": 0.05, "nnm_T_dead": 50, "nnm_centroid_batches": 500, "nnm_d_prime": 256, "nnm_ns_iters": 5, "nnm_warmup_steps": 0, "nnm_ramp_steps": 0, "rank": 0, "world_size": 2}
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/eval/0/answers.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/log.txt ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+ ============================== EXP at 2026-05-17 08:55:25 ==============================
4
+ dev | avg_loss: 1.9095982142857142 | {'exact_match': 0.0, 'rougeL': 6.7578} | threshold: 0.0
5
+ train | epoch 0 | Iter: 18/ 29904 | global iter: 10/ 14952 | loss: -0.3565 | ds_loss: -0.3565 | lr: 1.0000e-04 | scale: 1.0000 | micro time: 0.451 | step time: 0.823
6
+ train | epoch 0 | Iter: 38/ 29904 | global iter: 20/ 14952 | loss: -0.1421 | ds_loss: -0.1421 | lr: 1.0000e-04 | scale: 1.0000 | micro time: 0.447 | step time: 0.880
7
+ train | epoch 0 | Iter: 58/ 29904 | global iter: 30/ 14952 | loss: -0.1540 | ds_loss: -0.1540 | lr: 9.9999e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.881
8
+ train | epoch 0 | Iter: 78/ 29904 | global iter: 40/ 14952 | loss: -0.0845 | ds_loss: -0.0845 | lr: 9.9998e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.878
9
+ train | epoch 0 | Iter: 98/ 29904 | global iter: 50/ 14952 | loss: -0.0781 | ds_loss: -0.0781 | lr: 9.9997e-05 | scale: 1.0000 | micro time: 0.446 | step time: 0.874
10
+ train | epoch 0 | Iter: 118/ 29904 | global iter: 60/ 14952 | loss: -0.0858 | ds_loss: -0.0858 | lr: 9.9996e-05 | scale: 1.0000 | micro time: 0.447 | step time: 0.873
11
+ train | epoch 0 | Iter: 138/ 29904 | global iter: 70/ 14952 | loss: -0.0648 | ds_loss: -0.0648 | lr: 9.9995e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.875
12
+ train | epoch 0 | Iter: 158/ 29904 | global iter: 80/ 14952 | loss: -0.0911 | ds_loss: -0.0911 | lr: 9.9993e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.876
13
+ train | epoch 0 | Iter: 178/ 29904 | global iter: 90/ 14952 | loss: -0.0619 | ds_loss: -0.0619 | lr: 9.9991e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.875
14
+ train | epoch 0 | Iter: 198/ 29904 | global iter: 100/ 14952 | loss: -0.0579 | ds_loss: -0.0579 | lr: 9.9989e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.874
15
+ train | epoch 0 | Iter: 218/ 29904 | global iter: 110/ 14952 | loss: -0.0713 | ds_loss: -0.0713 | lr: 9.9987e-05 | scale: 1.0000 | micro time: 0.445 | step time: 0.875
16
+ train | epoch 0 | Iter: 238/ 29904 | global iter: 120/ 14952 | loss: -0.0651 | ds_loss: -0.0651 | lr: 9.9984e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.874
17
+ train | epoch 0 | Iter: 258/ 29904 | global iter: 130/ 14952 | loss: -0.0634 | ds_loss: -0.0634 | lr: 9.9982e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.874
18
+ train | epoch 0 | Iter: 278/ 29904 | global iter: 140/ 14952 | loss: -0.0743 | ds_loss: -0.0743 | lr: 9.9979e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.886
19
+ train | epoch 0 | Iter: 298/ 29904 | global iter: 150/ 14952 | loss: -0.0756 | ds_loss: -0.0756 | lr: 9.9976e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.888
20
+ train | epoch 0 | Iter: 318/ 29904 | global iter: 160/ 14952 | loss: -0.0628 | ds_loss: -0.0628 | lr: 9.9972e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.883
21
+ train | epoch 0 | Iter: 338/ 29904 | global iter: 170/ 14952 | loss: -0.0577 | ds_loss: -0.0577 | lr: 9.9969e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.884
22
+ train | epoch 0 | Iter: 358/ 29904 | global iter: 180/ 14952 | loss: -0.0550 | ds_loss: -0.0550 | lr: 9.9965e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.885
23
+ train | epoch 0 | Iter: 378/ 29904 | global iter: 190/ 14952 | loss: -0.0704 | ds_loss: -0.0704 | lr: 9.9961e-05 | scale: 1.0000 | micro time: 0.466 | step time: 0.888
24
+ train | epoch 0 | Iter: 398/ 29904 | global iter: 200/ 14952 | loss: -0.0640 | ds_loss: -0.0640 | lr: 9.9956e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.890
25
+ train | epoch 0 | Iter: 418/ 29904 | global iter: 210/ 14952 | loss: -0.0508 | ds_loss: -0.0508 | lr: 9.9952e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.890
26
+ train | epoch 0 | Iter: 438/ 29904 | global iter: 220/ 14952 | loss: -0.0819 | ds_loss: -0.0819 | lr: 9.9947e-05 | scale: 1.0000 | micro time: 0.460 | step time: 0.888
27
+ train | epoch 0 | Iter: 458/ 29904 | global iter: 230/ 14952 | loss: -0.0545 | ds_loss: -0.0545 | lr: 9.9942e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.887
28
+ train | epoch 0 | Iter: 478/ 29904 | global iter: 240/ 14952 | loss: -0.0610 | ds_loss: -0.0610 | lr: 9.9937e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.887
29
+ train | epoch 0 | Iter: 498/ 29904 | global iter: 250/ 14952 | loss: -0.0512 | ds_loss: -0.0512 | lr: 9.9932e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.884
30
+ train | epoch 0 | Iter: 518/ 29904 | global iter: 260/ 14952 | loss: -0.0642 | ds_loss: -0.0642 | lr: 9.9926e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.887
31
+ train | epoch 0 | Iter: 538/ 29904 | global iter: 270/ 14952 | loss: -0.0743 | ds_loss: -0.0743 | lr: 9.9920e-05 | scale: 1.0000 | micro time: 0.463 | step time: 0.885
32
+ train | epoch 0 | Iter: 558/ 29904 | global iter: 280/ 14952 | loss: -0.0724 | ds_loss: -0.0724 | lr: 9.9914e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.887
33
+ train | epoch 0 | Iter: 578/ 29904 | global iter: 290/ 14952 | loss: -0.0440 | ds_loss: -0.0440 | lr: 9.9908e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.889
34
+ train | epoch 0 | Iter: 598/ 29904 | global iter: 300/ 14952 | loss: -0.0607 | ds_loss: -0.0607 | lr: 9.9901e-05 | scale: 1.0000 | micro time: 0.462 | step time: 0.888
35
+ train | epoch 0 | Iter: 618/ 29904 | global iter: 310/ 14952 | loss: -0.0571 | ds_loss: -0.0571 | lr: 9.9895e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.890
36
+ train | epoch 0 | Iter: 638/ 29904 | global iter: 320/ 14952 | loss: -0.0569 | ds_loss: -0.0569 | lr: 9.9888e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.888
37
+ train | epoch 0 | Iter: 658/ 29904 | global iter: 330/ 14952 | loss: -0.0508 | ds_loss: -0.0508 | lr: 9.9881e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.889
38
+ train | epoch 0 | Iter: 678/ 29904 | global iter: 340/ 14952 | loss: -0.0679 | ds_loss: -0.0679 | lr: 9.9873e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.888
39
+ train | epoch 0 | Iter: 698/ 29904 | global iter: 350/ 14952 | loss: -0.0481 | ds_loss: -0.0481 | lr: 9.9866e-05 | scale: 1.0000 | micro time: 0.460 | step time: 0.891
40
+ train | epoch 0 | Iter: 718/ 29904 | global iter: 360/ 14952 | loss: -0.0652 | ds_loss: -0.0652 | lr: 9.9858e-05 | scale: 1.0000 | micro time: 0.459 | step time: 0.890
41
+ train | epoch 0 | Iter: 738/ 29904 | global iter: 370/ 14952 | loss: -0.0470 | ds_loss: -0.0470 | lr: 9.9850e-05 | scale: 1.0000 | micro time: 0.462 | step time: 0.893
42
+ train | epoch 0 | Iter: 758/ 29904 | global iter: 380/ 14952 | loss: -0.0438 | ds_loss: -0.0438 | lr: 9.9842e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.889
43
+ train | epoch 0 | Iter: 778/ 29904 | global iter: 390/ 14952 | loss: -0.0725 | ds_loss: -0.0725 | lr: 9.9833e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.890
44
+ train | epoch 0 | Iter: 798/ 29904 | global iter: 400/ 14952 | loss: -0.0466 | ds_loss: -0.0466 | lr: 9.9825e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.886
45
+ train | epoch 0 | Iter: 818/ 29904 | global iter: 410/ 14952 | loss: -0.0601 | ds_loss: -0.0601 | lr: 9.9816e-05 | scale: 1.0000 | micro time: 0.459 | step time: 0.885
46
+ train | epoch 0 | Iter: 838/ 29904 | global iter: 420/ 14952 | loss: -0.0512 | ds_loss: -0.0512 | lr: 9.9807e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.886
47
+ train | epoch 0 | Iter: 858/ 29904 | global iter: 430/ 14952 | loss: -0.0566 | ds_loss: -0.0566 | lr: 9.9797e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.888
48
+ train | epoch 0 | Iter: 878/ 29904 | global iter: 440/ 14952 | loss: -0.0621 | ds_loss: -0.0621 | lr: 9.9788e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.887
49
+ train | epoch 0 | Iter: 898/ 29904 | global iter: 450/ 14952 | loss: -0.0574 | ds_loss: -0.0574 | lr: 9.9778e-05 | scale: 1.0000 | micro time: 0.460 | step time: 0.888
50
+ train | epoch 0 | Iter: 918/ 29904 | global iter: 460/ 14952 | loss: -0.0430 | ds_loss: -0.0430 | lr: 9.9768e-05 | scale: 1.0000 | micro time: 0.460 | step time: 0.891
51
+ train | epoch 0 | Iter: 938/ 29904 | global iter: 470/ 14952 | loss: -0.0646 | ds_loss: -0.0646 | lr: 9.9758e-05 | scale: 1.0000 | micro time: 0.462 | step time: 0.893
52
+ train | epoch 0 | Iter: 958/ 29904 | global iter: 480/ 14952 | loss: -0.0583 | ds_loss: -0.0583 | lr: 9.9747e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.890
53
+ train | epoch 0 | Iter: 978/ 29904 | global iter: 490/ 14952 | loss: -0.0492 | ds_loss: -0.0492 | lr: 9.9737e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.886
54
+ train | epoch 0 | Iter: 998/ 29904 | global iter: 500/ 14952 | loss: -0.0456 | ds_loss: -0.0456 | lr: 9.9726e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.886
55
+ train | epoch 0 | Iter: 1018/ 29904 | global iter: 510/ 14952 | loss: -0.0575 | ds_loss: -0.0575 | lr: 9.9715e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.883
56
+ train | epoch 0 | Iter: 1038/ 29904 | global iter: 520/ 14952 | loss: -0.0596 | ds_loss: -0.0596 | lr: 9.9703e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.885
57
+ train | epoch 0 | Iter: 1058/ 29904 | global iter: 530/ 14952 | loss: -0.0477 | ds_loss: -0.0477 | lr: 9.9692e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.888
58
+ train | epoch 0 | Iter: 1078/ 29904 | global iter: 540/ 14952 | loss: -0.0459 | ds_loss: -0.0459 | lr: 9.9680e-05 | scale: 1.0000 | micro time: 0.476 | step time: 0.884
59
+ train | epoch 0 | Iter: 1098/ 29904 | global iter: 550/ 14952 | loss: -0.0659 | ds_loss: -0.0659 | lr: 9.9668e-05 | scale: 1.0000 | micro time: 0.463 | step time: 0.889
60
+ train | epoch 0 | Iter: 1118/ 29904 | global iter: 560/ 14952 | loss: -0.0538 | ds_loss: -0.0538 | lr: 9.9656e-05 | scale: 1.0000 | micro time: 0.464 | step time: 0.888
61
+ train | epoch 0 | Iter: 1138/ 29904 | global iter: 570/ 14952 | loss: -0.0471 | ds_loss: -0.0471 | lr: 9.9643e-05 | scale: 1.0000 | micro time: 0.460 | step time: 0.890
62
+ train | epoch 0 | Iter: 1158/ 29904 | global iter: 580/ 14952 | loss: -0.0619 | ds_loss: -0.0619 | lr: 9.9631e-05 | scale: 1.0000 | micro time: 0.462 | step time: 0.890
63
+ train | epoch 0 | Iter: 1178/ 29904 | global iter: 590/ 14952 | loss: -0.0542 | ds_loss: -0.0542 | lr: 9.9618e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.889
64
+ train | epoch 0 | Iter: 1198/ 29904 | global iter: 600/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.9605e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.886
65
+ train | epoch 0 | Iter: 1218/ 29904 | global iter: 610/ 14952 | loss: -0.0538 | ds_loss: -0.0538 | lr: 9.9592e-05 | scale: 1.0000 | micro time: 0.462 | step time: 0.888
66
+ train | epoch 0 | Iter: 1238/ 29904 | global iter: 620/ 14952 | loss: -0.0740 | ds_loss: -0.0740 | lr: 9.9578e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.886
67
+ train | epoch 0 | Iter: 1258/ 29904 | global iter: 630/ 14952 | loss: -0.0396 | ds_loss: -0.0396 | lr: 9.9564e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.887
68
+ train | epoch 0 | Iter: 1278/ 29904 | global iter: 640/ 14952 | loss: -0.0657 | ds_loss: -0.0657 | lr: 9.9550e-05 | scale: 1.0000 | micro time: 0.464 | step time: 0.888
69
+ train | epoch 0 | Iter: 1298/ 29904 | global iter: 650/ 14952 | loss: -0.0560 | ds_loss: -0.0560 | lr: 9.9536e-05 | scale: 1.0000 | micro time: 0.468 | step time: 0.900
70
+ train | epoch 0 | Iter: 1318/ 29904 | global iter: 660/ 14952 | loss: -0.0509 | ds_loss: -0.0509 | lr: 9.9522e-05 | scale: 1.0000 | micro time: 0.461 | step time: 0.891
71
+ train | epoch 0 | Iter: 1338/ 29904 | global iter: 670/ 14952 | loss: -0.0476 | ds_loss: -0.0476 | lr: 9.9507e-05 | scale: 1.0000 | micro time: 0.459 | step time: 0.894
72
+ train | epoch 0 | Iter: 1358/ 29904 | global iter: 680/ 14952 | loss: -0.0706 | ds_loss: -0.0706 | lr: 9.9493e-05 | scale: 1.0000 | micro time: 0.463 | step time: 0.895
73
+ train | epoch 0 | Iter: 1378/ 29904 | global iter: 690/ 14952 | loss: -0.0615 | ds_loss: -0.0615 | lr: 9.9477e-05 | scale: 1.0000 | micro time: 0.460 | step time: 0.890
74
+ train | epoch 0 | Iter: 1398/ 29904 | global iter: 700/ 14952 | loss: -0.0546 | ds_loss: -0.0546 | lr: 9.9462e-05 | scale: 1.0000 | micro time: 0.463 | step time: 0.892
75
+ train | epoch 0 | Iter: 1418/ 29904 | global iter: 710/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.9447e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.884
76
+ train | epoch 0 | Iter: 1438/ 29904 | global iter: 720/ 14952 | loss: -0.0502 | ds_loss: -0.0502 | lr: 9.9431e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.882
77
+ train | epoch 0 | Iter: 1458/ 29904 | global iter: 730/ 14952 | loss: -0.0760 | ds_loss: -0.0760 | lr: 9.9415e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.884
78
+ train | epoch 0 | Iter: 1478/ 29904 | global iter: 740/ 14952 | loss: -0.0612 | ds_loss: -0.0612 | lr: 9.9399e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.887
79
+ train | epoch 0 | Iter: 1498/ 29904 | global iter: 750/ 14952 | loss: -0.0541 | ds_loss: -0.0541 | lr: 9.9383e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.892
80
+ train | epoch 0 | Iter: 1518/ 29904 | global iter: 760/ 14952 | loss: -0.0560 | ds_loss: -0.0560 | lr: 9.9366e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.886
81
+ train | epoch 0 | Iter: 1538/ 29904 | global iter: 770/ 14952 | loss: -0.0575 | ds_loss: -0.0575 | lr: 9.9349e-05 | scale: 1.0000 | micro time: 0.466 | step time: 0.893
82
+ train | epoch 0 | Iter: 1558/ 29904 | global iter: 780/ 14952 | loss: -0.0710 | ds_loss: -0.0710 | lr: 9.9332e-05 | scale: 1.0000 | micro time: 0.465 | step time: 0.895
83
+ train | epoch 0 | Iter: 1578/ 29904 | global iter: 790/ 14952 | loss: -0.0541 | ds_loss: -0.0541 | lr: 9.9315e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.892
84
+ train | epoch 0 | Iter: 1598/ 29904 | global iter: 800/ 14952 | loss: -0.0679 | ds_loss: -0.0679 | lr: 9.9298e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.884
85
+ train | epoch 0 | Iter: 1618/ 29904 | global iter: 810/ 14952 | loss: -0.0500 | ds_loss: -0.0500 | lr: 9.9280e-05 | scale: 1.0000 | micro time: 0.459 | step time: 0.889
86
+ train | epoch 0 | Iter: 1638/ 29904 | global iter: 820/ 14952 | loss: -0.0550 | ds_loss: -0.0550 | lr: 9.9262e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.924
87
+ train | epoch 0 | Iter: 1658/ 29904 | global iter: 830/ 14952 | loss: -0.0624 | ds_loss: -0.0624 | lr: 9.9244e-05 | scale: 1.0000 | micro time: 0.461 | step time: 0.927
88
+ train | epoch 0 | Iter: 1678/ 29904 | global iter: 840/ 14952 | loss: -0.0722 | ds_loss: -0.0722 | lr: 9.9226e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.885
89
+ train | epoch 0 | Iter: 1698/ 29904 | global iter: 850/ 14952 | loss: -0.0666 | ds_loss: -0.0666 | lr: 9.9207e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.882
90
+ train | epoch 0 | Iter: 1718/ 29904 | global iter: 860/ 14952 | loss: -0.0451 | ds_loss: -0.0451 | lr: 9.9189e-05 | scale: 1.0000 | micro time: 0.469 | step time: 0.889
91
+ train | epoch 0 | Iter: 1738/ 29904 | global iter: 870/ 14952 | loss: -0.0571 | ds_loss: -0.0571 | lr: 9.9170e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.888
92
+ train | epoch 0 | Iter: 1758/ 29904 | global iter: 880/ 14952 | loss: -0.0484 | ds_loss: -0.0484 | lr: 9.9151e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.882
93
+ train | epoch 0 | Iter: 1778/ 29904 | global iter: 890/ 14952 | loss: -0.0524 | ds_loss: -0.0524 | lr: 9.9131e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.880
94
+ train | epoch 0 | Iter: 1798/ 29904 | global iter: 900/ 14952 | loss: -0.0598 | ds_loss: -0.0598 | lr: 9.9112e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.880
95
+ train | epoch 0 | Iter: 1818/ 29904 | global iter: 910/ 14952 | loss: -0.0606 | ds_loss: -0.0606 | lr: 9.9092e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.880
96
+ train | epoch 0 | Iter: 1838/ 29904 | global iter: 920/ 14952 | loss: -0.0505 | ds_loss: -0.0505 | lr: 9.9072e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.882
97
+ train | epoch 0 | Iter: 1858/ 29904 | global iter: 930/ 14952 | loss: -0.0525 | ds_loss: -0.0525 | lr: 9.9051e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.883
98
+ train | epoch 0 | Iter: 1878/ 29904 | global iter: 940/ 14952 | loss: -0.0719 | ds_loss: -0.0719 | lr: 9.9031e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.885
99
+ train | epoch 0 | Iter: 1898/ 29904 | global iter: 950/ 14952 | loss: -0.0622 | ds_loss: -0.0622 | lr: 9.9010e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.890
100
+ train | epoch 0 | Iter: 1918/ 29904 | global iter: 960/ 14952 | loss: -0.0499 | ds_loss: -0.0499 | lr: 9.8989e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.891
101
+ train | epoch 0 | Iter: 1938/ 29904 | global iter: 970/ 14952 | loss: -0.0839 | ds_loss: -0.0839 | lr: 9.8968e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.885
102
+ train | epoch 0 | Iter: 1958/ 29904 | global iter: 980/ 14952 | loss: -0.0293 | ds_loss: -0.0293 | lr: 9.8947e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.886
103
+ train | epoch 0 | Iter: 1978/ 29904 | global iter: 990/ 14952 | loss: -0.0557 | ds_loss: -0.0557 | lr: 9.8925e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.885
104
+ train | epoch 0 | Iter: 1998/ 29904 | global iter: 1000/ 14952 | loss: -0.0442 | ds_loss: -0.0442 | lr: 9.8904e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.883
105
+ train | epoch 0 | Iter: 2018/ 29904 | global iter: 1010/ 14952 | loss: -0.0851 | ds_loss: -0.0851 | lr: 9.8882e-05 | scale: 1.0000 | micro time: 0.459 | step time: 0.883
106
+ train | epoch 0 | Iter: 2038/ 29904 | global iter: 1020/ 14952 | loss: -0.0395 | ds_loss: -0.0395 | lr: 9.8859e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.884
107
+ train | epoch 0 | Iter: 2058/ 29904 | global iter: 1030/ 14952 | loss: -0.0579 | ds_loss: -0.0579 | lr: 9.8837e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.879
108
+ train | epoch 0 | Iter: 2078/ 29904 | global iter: 1040/ 14952 | loss: -0.0589 | ds_loss: -0.0589 | lr: 9.8814e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.881
109
+ train | epoch 0 | Iter: 2098/ 29904 | global iter: 1050/ 14952 | loss: -0.0519 | ds_loss: -0.0519 | lr: 9.8792e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.885
110
+ train | epoch 0 | Iter: 2118/ 29904 | global iter: 1060/ 14952 | loss: -0.0534 | ds_loss: -0.0534 | lr: 9.8769e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.880
111
+ train | epoch 0 | Iter: 2138/ 29904 | global iter: 1070/ 14952 | loss: -0.0557 | ds_loss: -0.0557 | lr: 9.8745e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.882
112
+ train | epoch 0 | Iter: 2158/ 29904 | global iter: 1080/ 14952 | loss: -0.0527 | ds_loss: -0.0527 | lr: 9.8722e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.883
113
+ train | epoch 0 | Iter: 2178/ 29904 | global iter: 1090/ 14952 | loss: -0.0354 | ds_loss: -0.0354 | lr: 9.8698e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.880
114
+ train | epoch 0 | Iter: 2198/ 29904 | global iter: 1100/ 14952 | loss: -0.0598 | ds_loss: -0.0598 | lr: 9.8674e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.881
115
+ train | epoch 0 | Iter: 2218/ 29904 | global iter: 1110/ 14952 | loss: -0.0343 | ds_loss: -0.0343 | lr: 9.8650e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.880
116
+ train | epoch 0 | Iter: 2238/ 29904 | global iter: 1120/ 14952 | loss: -0.0587 | ds_loss: -0.0587 | lr: 9.8626e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.883
117
+ train | epoch 0 | Iter: 2258/ 29904 | global iter: 1130/ 14952 | loss: -0.0361 | ds_loss: -0.0361 | lr: 9.8601e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.882
118
+ train | epoch 0 | Iter: 2278/ 29904 | global iter: 1140/ 14952 | loss: -0.0561 | ds_loss: -0.0561 | lr: 9.8576e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.884
119
+ train | epoch 0 | Iter: 2298/ 29904 | global iter: 1150/ 14952 | loss: -0.0560 | ds_loss: -0.0560 | lr: 9.8551e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.885
120
+ train | epoch 0 | Iter: 2318/ 29904 | global iter: 1160/ 14952 | loss: -0.0379 | ds_loss: -0.0379 | lr: 9.8526e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.888
121
+ train | epoch 0 | Iter: 2338/ 29904 | global iter: 1170/ 14952 | loss: -0.0728 | ds_loss: -0.0728 | lr: 9.8501e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.888
122
+ train | epoch 0 | Iter: 2358/ 29904 | global iter: 1180/ 14952 | loss: -0.0491 | ds_loss: -0.0491 | lr: 9.8475e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.894
123
+ train | epoch 0 | Iter: 2378/ 29904 | global iter: 1190/ 14952 | loss: -0.0578 | ds_loss: -0.0578 | lr: 9.8449e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.883
124
+ train | epoch 0 | Iter: 2398/ 29904 | global iter: 1200/ 14952 | loss: -0.0521 | ds_loss: -0.0521 | lr: 9.8423e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.882
125
+ train | epoch 0 | Iter: 2418/ 29904 | global iter: 1210/ 14952 | loss: -0.0544 | ds_loss: -0.0544 | lr: 9.8397e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.881
126
+ train | epoch 0 | Iter: 2438/ 29904 | global iter: 1220/ 14952 | loss: -0.0480 | ds_loss: -0.0480 | lr: 9.8371e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.880
127
+ train | epoch 0 | Iter: 2458/ 29904 | global iter: 1230/ 14952 | loss: -0.0493 | ds_loss: -0.0493 | lr: 9.8344e-05 | scale: 1.0000 | micro time: 0.459 | step time: 0.883
128
+ train | epoch 0 | Iter: 2478/ 29904 | global iter: 1240/ 14952 | loss: -0.0551 | ds_loss: -0.0551 | lr: 9.8317e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.887
129
+ train | epoch 0 | Iter: 2498/ 29904 | global iter: 1250/ 14952 | loss: -0.0418 | ds_loss: -0.0418 | lr: 9.8290e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.883
130
+ train | epoch 0 | Iter: 2518/ 29904 | global iter: 1260/ 14952 | loss: -0.0651 | ds_loss: -0.0651 | lr: 9.8263e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.886
131
+ train | epoch 0 | Iter: 2538/ 29904 | global iter: 1270/ 14952 | loss: -0.0486 | ds_loss: -0.0486 | lr: 9.8235e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.885
132
+ train | epoch 0 | Iter: 2558/ 29904 | global iter: 1280/ 14952 | loss: -0.0420 | ds_loss: -0.0420 | lr: 9.8207e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.885
133
+ train | epoch 0 | Iter: 2578/ 29904 | global iter: 1290/ 14952 | loss: -0.0428 | ds_loss: -0.0428 | lr: 9.8179e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.883
134
+ train | epoch 0 | Iter: 2598/ 29904 | global iter: 1300/ 14952 | loss: -0.0634 | ds_loss: -0.0634 | lr: 9.8151e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.885
135
+ train | epoch 0 | Iter: 2618/ 29904 | global iter: 1310/ 14952 | loss: -0.0669 | ds_loss: -0.0669 | lr: 9.8123e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.884
136
+ train | epoch 0 | Iter: 2638/ 29904 | global iter: 1320/ 14952 | loss: -0.0418 | ds_loss: -0.0418 | lr: 9.8094e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.914
137
+ train | epoch 0 | Iter: 2658/ 29904 | global iter: 1330/ 14952 | loss: -0.0651 | ds_loss: -0.0651 | lr: 9.8065e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.880
138
+ train | epoch 0 | Iter: 2678/ 29904 | global iter: 1340/ 14952 | loss: -0.0729 | ds_loss: -0.0729 | lr: 9.8036e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.881
139
+ train | epoch 0 | Iter: 2698/ 29904 | global iter: 1350/ 14952 | loss: -0.0489 | ds_loss: -0.0489 | lr: 9.8007e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.883
140
+ train | epoch 0 | Iter: 2718/ 29904 | global iter: 1360/ 14952 | loss: -0.0655 | ds_loss: -0.0655 | lr: 9.7977e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.884
141
+ train | epoch 0 | Iter: 2738/ 29904 | global iter: 1370/ 14952 | loss: -0.0538 | ds_loss: -0.0538 | lr: 9.7948e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.882
142
+ train | epoch 0 | Iter: 2758/ 29904 | global iter: 1380/ 14952 | loss: -0.0538 | ds_loss: -0.0538 | lr: 9.7918e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.883
143
+ train | epoch 0 | Iter: 2778/ 29904 | global iter: 1390/ 14952 | loss: -0.0534 | ds_loss: -0.0534 | lr: 9.7888e-05 | scale: 1.0000 | micro time: 0.447 | step time: 0.880
144
+ train | epoch 0 | Iter: 2798/ 29904 | global iter: 1400/ 14952 | loss: -0.0485 | ds_loss: -0.0485 | lr: 9.7858e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.874
145
+ train | epoch 0 | Iter: 2818/ 29904 | global iter: 1410/ 14952 | loss: -0.0466 | ds_loss: -0.0466 | lr: 9.7827e-05 | scale: 1.0000 | micro time: 0.463 | step time: 0.876
146
+ train | epoch 0 | Iter: 2838/ 29904 | global iter: 1420/ 14952 | loss: -0.0525 | ds_loss: -0.0525 | lr: 9.7796e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.883
147
+ train | epoch 0 | Iter: 2858/ 29904 | global iter: 1430/ 14952 | loss: -0.0612 | ds_loss: -0.0612 | lr: 9.7765e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.884
148
+ train | epoch 0 | Iter: 2878/ 29904 | global iter: 1440/ 14952 | loss: -0.0584 | ds_loss: -0.0584 | lr: 9.7734e-05 | scale: 1.0000 | micro time: 0.464 | step time: 0.892
149
+ train | epoch 0 | Iter: 2898/ 29904 | global iter: 1450/ 14952 | loss: -0.0365 | ds_loss: -0.0365 | lr: 9.7703e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.891
150
+ train | epoch 0 | Iter: 2918/ 29904 | global iter: 1460/ 14952 | loss: -0.0554 | ds_loss: -0.0554 | lr: 9.7671e-05 | scale: 1.0000 | micro time: 0.463 | step time: 0.892
151
+ train | epoch 0 | Iter: 2938/ 29904 | global iter: 1470/ 14952 | loss: -0.0541 | ds_loss: -0.0541 | lr: 9.7640e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.891
152
+ train | epoch 0 | Iter: 2958/ 29904 | global iter: 1480/ 14952 | loss: -0.0660 | ds_loss: -0.0660 | lr: 9.7608e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.887
153
+ train | epoch 0 | Iter: 2978/ 29904 | global iter: 1490/ 14952 | loss: -0.0480 | ds_loss: -0.0480 | lr: 9.7575e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.889
154
+ train | epoch 0 | Iter: 2998/ 29904 | global iter: 1500/ 14952 | loss: -0.0483 | ds_loss: -0.0483 | lr: 9.7543e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.888
155
+ train | epoch 0 | Iter: 3018/ 29904 | global iter: 1510/ 14952 | loss: -0.0529 | ds_loss: -0.0529 | lr: 9.7510e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.886
156
+ train | epoch 0 | Iter: 3038/ 29904 | global iter: 1520/ 14952 | loss: -0.0442 | ds_loss: -0.0442 | lr: 9.7477e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.890
157
+ train | epoch 0 | Iter: 3058/ 29904 | global iter: 1530/ 14952 | loss: -0.0594 | ds_loss: -0.0594 | lr: 9.7444e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.883
158
+ train | epoch 0 | Iter: 3078/ 29904 | global iter: 1540/ 14952 | loss: -0.0543 | ds_loss: -0.0543 | lr: 9.7411e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.883
159
+ train | epoch 0 | Iter: 3098/ 29904 | global iter: 1550/ 14952 | loss: -0.0598 | ds_loss: -0.0598 | lr: 9.7378e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.882
160
+ train | epoch 0 | Iter: 3118/ 29904 | global iter: 1560/ 14952 | loss: -0.0378 | ds_loss: -0.0378 | lr: 9.7344e-05 | scale: 1.0000 | micro time: 0.466 | step time: 0.888
161
+ train | epoch 0 | Iter: 3138/ 29904 | global iter: 1570/ 14952 | loss: -0.0614 | ds_loss: -0.0614 | lr: 9.7310e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.886
162
+ train | epoch 0 | Iter: 3158/ 29904 | global iter: 1580/ 14952 | loss: -0.0720 | ds_loss: -0.0720 | lr: 9.7276e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.879
163
+ train | epoch 0 | Iter: 3178/ 29904 | global iter: 1590/ 14952 | loss: -0.0702 | ds_loss: -0.0702 | lr: 9.7242e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.883
164
+ train | epoch 0 | Iter: 3198/ 29904 | global iter: 1600/ 14952 | loss: -0.0505 | ds_loss: -0.0505 | lr: 9.7207e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.885
165
+ train | epoch 0 | Iter: 3218/ 29904 | global iter: 1610/ 14952 | loss: -0.0585 | ds_loss: -0.0585 | lr: 9.7173e-05 | scale: 1.0000 | micro time: 0.445 | step time: 0.886
166
+ train | epoch 0 | Iter: 3238/ 29904 | global iter: 1620/ 14952 | loss: -0.0669 | ds_loss: -0.0669 | lr: 9.7138e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.876
167
+ train | epoch 0 | Iter: 3258/ 29904 | global iter: 1630/ 14952 | loss: -0.0503 | ds_loss: -0.0503 | lr: 9.7103e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.878
168
+ train | epoch 0 | Iter: 3278/ 29904 | global iter: 1640/ 14952 | loss: -0.0632 | ds_loss: -0.0632 | lr: 9.7067e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.884
169
+ train | epoch 0 | Iter: 3298/ 29904 | global iter: 1650/ 14952 | loss: -0.0476 | ds_loss: -0.0476 | lr: 9.7032e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.878
170
+ train | epoch 0 | Iter: 3318/ 29904 | global iter: 1660/ 14952 | loss: -0.0382 | ds_loss: -0.0382 | lr: 9.6996e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.881
171
+ train | epoch 0 | Iter: 3338/ 29904 | global iter: 1670/ 14952 | loss: -0.0440 | ds_loss: -0.0440 | lr: 9.6960e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.878
172
+ train | epoch 0 | Iter: 3358/ 29904 | global iter: 1680/ 14952 | loss: -0.0594 | ds_loss: -0.0594 | lr: 9.6924e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.881
173
+ train | epoch 0 | Iter: 3378/ 29904 | global iter: 1690/ 14952 | loss: -0.0522 | ds_loss: -0.0522 | lr: 9.6888e-05 | scale: 1.0000 | micro time: 0.447 | step time: 0.878
174
+ train | epoch 0 | Iter: 3398/ 29904 | global iter: 1700/ 14952 | loss: -0.0580 | ds_loss: -0.0580 | lr: 9.6851e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.876
175
+ train | epoch 0 | Iter: 3418/ 29904 | global iter: 1710/ 14952 | loss: -0.0551 | ds_loss: -0.0551 | lr: 9.6814e-05 | scale: 1.0000 | micro time: 0.447 | step time: 0.878
176
+ train | epoch 0 | Iter: 3438/ 29904 | global iter: 1720/ 14952 | loss: -0.0392 | ds_loss: -0.0392 | lr: 9.6777e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.878
177
+ train | epoch 0 | Iter: 3458/ 29904 | global iter: 1730/ 14952 | loss: -0.0474 | ds_loss: -0.0474 | lr: 9.6740e-05 | scale: 1.0000 | micro time: 0.447 | step time: 0.879
178
+ train | epoch 0 | Iter: 3478/ 29904 | global iter: 1740/ 14952 | loss: -0.0549 | ds_loss: -0.0549 | lr: 9.6703e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.879
179
+ train | epoch 0 | Iter: 3498/ 29904 | global iter: 1750/ 14952 | loss: -0.0474 | ds_loss: -0.0474 | lr: 9.6665e-05 | scale: 1.0000 | micro time: 0.460 | step time: 0.879
180
+ train | epoch 0 | Iter: 3518/ 29904 | global iter: 1760/ 14952 | loss: -0.0597 | ds_loss: -0.0597 | lr: 9.6627e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.886
181
+ train | epoch 0 | Iter: 3538/ 29904 | global iter: 1770/ 14952 | loss: -0.0563 | ds_loss: -0.0563 | lr: 9.6589e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.879
182
+ train | epoch 0 | Iter: 3558/ 29904 | global iter: 1780/ 14952 | loss: -0.0420 | ds_loss: -0.0420 | lr: 9.6551e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.879
183
+ train | epoch 0 | Iter: 3578/ 29904 | global iter: 1790/ 14952 | loss: -0.0351 | ds_loss: -0.0351 | lr: 9.6513e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.883
184
+ train | epoch 0 | Iter: 3598/ 29904 | global iter: 1800/ 14952 | loss: -0.0437 | ds_loss: -0.0437 | lr: 9.6474e-05 | scale: 1.0000 | micro time: 0.445 | step time: 0.876
185
+ train | epoch 0 | Iter: 3618/ 29904 | global iter: 1810/ 14952 | loss: -0.0732 | ds_loss: -0.0732 | lr: 9.6435e-05 | scale: 1.0000 | micro time: 0.445 | step time: 0.874
186
+ train | epoch 0 | Iter: 3638/ 29904 | global iter: 1820/ 14952 | loss: -0.0420 | ds_loss: -0.0420 | lr: 9.6396e-05 | scale: 1.0000 | micro time: 0.447 | step time: 0.907
187
+ train | epoch 0 | Iter: 3658/ 29904 | global iter: 1830/ 14952 | loss: -0.0709 | ds_loss: -0.0709 | lr: 9.6357e-05 | scale: 1.0000 | micro time: 0.461 | step time: 0.882
188
+ train | epoch 0 | Iter: 3678/ 29904 | global iter: 1840/ 14952 | loss: -0.0506 | ds_loss: -0.0506 | lr: 9.6317e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.882
189
+ train | epoch 0 | Iter: 3698/ 29904 | global iter: 1850/ 14952 | loss: -0.0676 | ds_loss: -0.0676 | lr: 9.6278e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.880
190
+ train | epoch 0 | Iter: 3718/ 29904 | global iter: 1860/ 14952 | loss: -0.0348 | ds_loss: -0.0348 | lr: 9.6238e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.881
191
+ train | epoch 0 | Iter: 3738/ 29904 | global iter: 1870/ 14952 | loss: -0.0747 | ds_loss: -0.0747 | lr: 9.6198e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.882
192
+ train | epoch 0 | Iter: 3758/ 29904 | global iter: 1880/ 14952 | loss: -0.0535 | ds_loss: -0.0535 | lr: 9.6158e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.884
193
+ train | epoch 0 | Iter: 3778/ 29904 | global iter: 1890/ 14952 | loss: -0.0345 | ds_loss: -0.0345 | lr: 9.6117e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.881
194
+ train | epoch 0 | Iter: 3798/ 29904 | global iter: 1900/ 14952 | loss: -0.0558 | ds_loss: -0.0558 | lr: 9.6076e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.882
195
+ train | epoch 0 | Iter: 3818/ 29904 | global iter: 1910/ 14952 | loss: -0.0464 | ds_loss: -0.0464 | lr: 9.6036e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.886
196
+ train | epoch 0 | Iter: 3838/ 29904 | global iter: 1920/ 14952 | loss: -0.0467 | ds_loss: -0.0467 | lr: 9.5994e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.885
197
+ train | epoch 0 | Iter: 3858/ 29904 | global iter: 1930/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.5953e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.886
198
+ train | epoch 0 | Iter: 3878/ 29904 | global iter: 1940/ 14952 | loss: -0.0551 | ds_loss: -0.0551 | lr: 9.5912e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.882
199
+ train | epoch 0 | Iter: 3898/ 29904 | global iter: 1950/ 14952 | loss: -0.0415 | ds_loss: -0.0415 | lr: 9.5870e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.882
200
+ train | epoch 0 | Iter: 3918/ 29904 | global iter: 1960/ 14952 | loss: -0.0493 | ds_loss: -0.0493 | lr: 9.5828e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.881
201
+ train | epoch 0 | Iter: 3938/ 29904 | global iter: 1970/ 14952 | loss: -0.0525 | ds_loss: -0.0525 | lr: 9.5786e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.879
202
+ train | epoch 0 | Iter: 3958/ 29904 | global iter: 1980/ 14952 | loss: -0.0581 | ds_loss: -0.0581 | lr: 9.5744e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.882
203
+ train | epoch 0 | Iter: 3978/ 29904 | global iter: 1990/ 14952 | loss: -0.0521 | ds_loss: -0.0521 | lr: 9.5701e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.878
204
+ train | epoch 0 | Iter: 3998/ 29904 | global iter: 2000/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.5659e-05 | scale: 1.0000 | micro time: 0.447 | step time: 0.879
205
+ train | epoch 0 | Iter: 4018/ 29904 | global iter: 2010/ 14952 | loss: -0.0667 | ds_loss: -0.0667 | lr: 9.5616e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.878
206
+ train | epoch 0 | Iter: 4038/ 29904 | global iter: 2020/ 14952 | loss: -0.0542 | ds_loss: -0.0542 | lr: 9.5573e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.877
207
+ train | epoch 0 | Iter: 4058/ 29904 | global iter: 2030/ 14952 | loss: -0.0474 | ds_loss: -0.0474 | lr: 9.5529e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.879
208
+ train | epoch 0 | Iter: 4078/ 29904 | global iter: 2040/ 14952 | loss: -0.0355 | ds_loss: -0.0355 | lr: 9.5486e-05 | scale: 1.0000 | micro time: 0.446 | step time: 0.879
209
+ train | epoch 0 | Iter: 4098/ 29904 | global iter: 2050/ 14952 | loss: -0.0495 | ds_loss: -0.0495 | lr: 9.5442e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.877
210
+ train | epoch 0 | Iter: 4118/ 29904 | global iter: 2060/ 14952 | loss: -0.0506 | ds_loss: -0.0506 | lr: 9.5398e-05 | scale: 1.0000 | micro time: 0.472 | step time: 0.878
211
+ train | epoch 0 | Iter: 4138/ 29904 | global iter: 2070/ 14952 | loss: -0.0441 | ds_loss: -0.0441 | lr: 9.5354e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.889
212
+ train | epoch 0 | Iter: 4158/ 29904 | global iter: 2080/ 14952 | loss: -0.0656 | ds_loss: -0.0656 | lr: 9.5310e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.884
213
+ train | epoch 0 | Iter: 4178/ 29904 | global iter: 2090/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.5265e-05 | scale: 1.0000 | micro time: 0.464 | step time: 0.880
214
+ train | epoch 0 | Iter: 4198/ 29904 | global iter: 2100/ 14952 | loss: -0.0569 | ds_loss: -0.0569 | lr: 9.5221e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.884
215
+ train | epoch 0 | Iter: 4218/ 29904 | global iter: 2110/ 14952 | loss: -0.0548 | ds_loss: -0.0548 | lr: 9.5176e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.880
216
+ train | epoch 0 | Iter: 4238/ 29904 | global iter: 2120/ 14952 | loss: -0.0414 | ds_loss: -0.0414 | lr: 9.5131e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.883
217
+ train | epoch 0 | Iter: 4258/ 29904 | global iter: 2130/ 14952 | loss: -0.0467 | ds_loss: -0.0467 | lr: 9.5085e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.880
218
+ train | epoch 0 | Iter: 4278/ 29904 | global iter: 2140/ 14952 | loss: -0.0833 | ds_loss: -0.0833 | lr: 9.5040e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.879
219
+ train | epoch 0 | Iter: 4298/ 29904 | global iter: 2150/ 14952 | loss: -0.0411 | ds_loss: -0.0411 | lr: 9.4994e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.882
220
+ train | epoch 0 | Iter: 4318/ 29904 | global iter: 2160/ 14952 | loss: -0.0652 | ds_loss: -0.0652 | lr: 9.4948e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.879
221
+ train | epoch 0 | Iter: 4338/ 29904 | global iter: 2170/ 14952 | loss: -0.0644 | ds_loss: -0.0644 | lr: 9.4902e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.884
222
+ train | epoch 0 | Iter: 4358/ 29904 | global iter: 2180/ 14952 | loss: -0.0412 | ds_loss: -0.0412 | lr: 9.4856e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.882
223
+ train | epoch 0 | Iter: 4378/ 29904 | global iter: 2190/ 14952 | loss: -0.0545 | ds_loss: -0.0545 | lr: 9.4809e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.884
224
+ train | epoch 0 | Iter: 4398/ 29904 | global iter: 2200/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.4763e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.886
225
+ train | epoch 0 | Iter: 4418/ 29904 | global iter: 2210/ 14952 | loss: -0.0536 | ds_loss: -0.0536 | lr: 9.4716e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.884
226
+ train | epoch 0 | Iter: 4438/ 29904 | global iter: 2220/ 14952 | loss: -0.0380 | ds_loss: -0.0380 | lr: 9.4669e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.884
227
+ train | epoch 0 | Iter: 4458/ 29904 | global iter: 2230/ 14952 | loss: -0.0560 | ds_loss: -0.0560 | lr: 9.4621e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.883
228
+ train | epoch 0 | Iter: 4478/ 29904 | global iter: 2240/ 14952 | loss: -0.0344 | ds_loss: -0.0344 | lr: 9.4574e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.881
229
+ train | epoch 0 | Iter: 4498/ 29904 | global iter: 2250/ 14952 | loss: -0.0490 | ds_loss: -0.0490 | lr: 9.4526e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.883
230
+ train | epoch 0 | Iter: 4518/ 29904 | global iter: 2260/ 14952 | loss: -0.0518 | ds_loss: -0.0518 | lr: 9.4478e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.884
231
+ train | epoch 0 | Iter: 4538/ 29904 | global iter: 2270/ 14952 | loss: -0.0617 | ds_loss: -0.0617 | lr: 9.4430e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.884
232
+ train | epoch 0 | Iter: 4558/ 29904 | global iter: 2280/ 14952 | loss: -0.0320 | ds_loss: -0.0320 | lr: 9.4382e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.884
233
+ train | epoch 0 | Iter: 4578/ 29904 | global iter: 2290/ 14952 | loss: -0.0521 | ds_loss: -0.0521 | lr: 9.4334e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.883
234
+ train | epoch 0 | Iter: 4598/ 29904 | global iter: 2300/ 14952 | loss: -0.0467 | ds_loss: -0.0467 | lr: 9.4285e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.883
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-1.5B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-1.5B-Instruct
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 128,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "down_proj",
34
+ "up_proj",
35
+ "v_proj",
36
+ "gate_proj",
37
+ "o_proj",
38
+ "k_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:25ffab0951358a2a0ade91fcbb1c4d8f212b82bb0ba74234477d26554ea34c3e
3
+ size 504133205
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": "<|im_end|>"
25
+ }
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|im_end|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-1.5B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-1.5B-Instruct
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 128,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "down_proj",
34
+ "up_proj",
35
+ "v_proj",
36
+ "gate_proj",
37
+ "o_proj",
38
+ "k_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f403f53d9b3384b42396cdef5cb104c4bcabf4efed72c8a737769d426f7f17ac
3
+ size 504133205
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/added_tokens.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "</tool_call>": 151658,
3
+ "<tool_call>": 151657,
4
+ "<|box_end|>": 151649,
5
+ "<|box_start|>": 151648,
6
+ "<|endoftext|>": 151643,
7
+ "<|file_sep|>": 151664,
8
+ "<|fim_middle|>": 151660,
9
+ "<|fim_pad|>": 151662,
10
+ "<|fim_prefix|>": 151659,
11
+ "<|fim_suffix|>": 151661,
12
+ "<|im_end|>": 151645,
13
+ "<|im_start|>": 151644,
14
+ "<|image_pad|>": 151655,
15
+ "<|object_ref_end|>": 151647,
16
+ "<|object_ref_start|>": 151646,
17
+ "<|quad_end|>": 151651,
18
+ "<|quad_start|>": 151650,
19
+ "<|repo_name|>": 151663,
20
+ "<|video_pad|>": 151656,
21
+ "<|vision_end|>": 151653,
22
+ "<|vision_pad|>": 151654,
23
+ "<|vision_start|>": 151652
24
+ }
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/chat_template.jinja ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if tools %}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0]['role'] == 'system' %}
4
+ {{- messages[0]['content'] }}
5
+ {%- else %}
6
+ {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
7
+ {%- endif %}
8
+ {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
9
+ {%- for tool in tools %}
10
+ {{- "\n" }}
11
+ {{- tool | tojson }}
12
+ {%- endfor %}
13
+ {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
14
+ {%- else %}
15
+ {%- if messages[0]['role'] == 'system' %}
16
+ {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
17
+ {%- else %}
18
+ {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
19
+ {%- endif %}
20
+ {%- endif %}
21
+ {%- for message in messages %}
22
+ {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
23
+ {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
24
+ {%- elif message.role == "assistant" %}
25
+ {{- '<|im_start|>' + message.role }}
26
+ {%- if message.content %}
27
+ {{- '\n' + message.content }}
28
+ {%- endif %}
29
+ {%- for tool_call in message.tool_calls %}
30
+ {%- if tool_call.function is defined %}
31
+ {%- set tool_call = tool_call.function %}
32
+ {%- endif %}
33
+ {{- '\n<tool_call>\n{"name": "' }}
34
+ {{- tool_call.name }}
35
+ {{- '", "arguments": ' }}
36
+ {{- tool_call.arguments | tojson }}
37
+ {{- '}\n</tool_call>' }}
38
+ {%- endfor %}
39
+ {{- '<|im_end|>\n' }}
40
+ {%- elif message.role == "tool" %}
41
+ {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
42
+ {{- '<|im_start|>user' }}
43
+ {%- endif %}
44
+ {{- '\n<tool_response>\n' }}
45
+ {{- message.content }}
46
+ {{- '\n</tool_response>' }}
47
+ {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif %}
50
+ {%- endif %}
51
+ {%- endfor %}
52
+ {%- if add_generation_prompt %}
53
+ {{- '<|im_start|>assistant\n' }}
54
+ {%- endif %}
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/special_tokens_map.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "additional_special_tokens": [
3
+ "<|im_start|>",
4
+ "<|im_end|>",
5
+ "<|object_ref_start|>",
6
+ "<|object_ref_end|>",
7
+ "<|box_start|>",
8
+ "<|box_end|>",
9
+ "<|quad_start|>",
10
+ "<|quad_end|>",
11
+ "<|vision_start|>",
12
+ "<|vision_end|>",
13
+ "<|vision_pad|>",
14
+ "<|image_pad|>",
15
+ "<|video_pad|>"
16
+ ],
17
+ "eos_token": {
18
+ "content": "<|im_end|>",
19
+ "lstrip": false,
20
+ "normalized": false,
21
+ "rstrip": false,
22
+ "single_word": false
23
+ },
24
+ "pad_token": "<|im_end|>"
25
+ }
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
3
+ size 11421896
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/tokenizer_config.json ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "clean_up_tokenization_spaces": false,
199
+ "eos_token": "<|im_end|>",
200
+ "errors": "replace",
201
+ "extra_special_tokens": {},
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|im_end|>",
204
+ "split_special_tokens": false,
205
+ "tokenizer_class": "Qwen2Tokenizer",
206
+ "unk_token": null
207
+ }
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/README.md ADDED
@@ -0,0 +1,207 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: Qwen/Qwen2.5-1.5B-Instruct
3
+ library_name: peft
4
+ pipeline_tag: text-generation
5
+ tags:
6
+ - base_model:adapter:Qwen/Qwen2.5-1.5B-Instruct
7
+ - lora
8
+ - transformers
9
+ ---
10
+
11
+ # Model Card for Model ID
12
+
13
+ <!-- Provide a quick summary of what the model is/does. -->
14
+
15
+
16
+
17
+ ## Model Details
18
+
19
+ ### Model Description
20
+
21
+ <!-- Provide a longer summary of what this model is. -->
22
+
23
+
24
+
25
+ - **Developed by:** [More Information Needed]
26
+ - **Funded by [optional]:** [More Information Needed]
27
+ - **Shared by [optional]:** [More Information Needed]
28
+ - **Model type:** [More Information Needed]
29
+ - **Language(s) (NLP):** [More Information Needed]
30
+ - **License:** [More Information Needed]
31
+ - **Finetuned from model [optional]:** [More Information Needed]
32
+
33
+ ### Model Sources [optional]
34
+
35
+ <!-- Provide the basic links for the model. -->
36
+
37
+ - **Repository:** [More Information Needed]
38
+ - **Paper [optional]:** [More Information Needed]
39
+ - **Demo [optional]:** [More Information Needed]
40
+
41
+ ## Uses
42
+
43
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
44
+
45
+ ### Direct Use
46
+
47
+ <!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
48
+
49
+ [More Information Needed]
50
+
51
+ ### Downstream Use [optional]
52
+
53
+ <!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
54
+
55
+ [More Information Needed]
56
+
57
+ ### Out-of-Scope Use
58
+
59
+ <!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
60
+
61
+ [More Information Needed]
62
+
63
+ ## Bias, Risks, and Limitations
64
+
65
+ <!-- This section is meant to convey both technical and sociotechnical limitations. -->
66
+
67
+ [More Information Needed]
68
+
69
+ ### Recommendations
70
+
71
+ <!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
72
+
73
+ Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information is needed to make further recommendations.
74
+
75
+ ## How to Get Started with the Model
76
+
77
+ Use the code below to get started with the model.
78
+
79
+ [More Information Needed]
80
+
81
+ ## Training Details
82
+
83
+ ### Training Data
84
+
85
+ <!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
86
+
87
+ [More Information Needed]
88
+
89
+ ### Training Procedure
90
+
91
+ <!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
92
+
93
+ #### Preprocessing [optional]
94
+
95
+ [More Information Needed]
96
+
97
+
98
+ #### Training Hyperparameters
99
+
100
+ - **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
101
+
102
+ #### Speeds, Sizes, Times [optional]
103
+
104
+ <!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
105
+
106
+ [More Information Needed]
107
+
108
+ ## Evaluation
109
+
110
+ <!-- This section describes the evaluation protocols and provides the results. -->
111
+
112
+ ### Testing Data, Factors & Metrics
113
+
114
+ #### Testing Data
115
+
116
+ <!-- This should link to a Dataset Card if possible. -->
117
+
118
+ [More Information Needed]
119
+
120
+ #### Factors
121
+
122
+ <!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
123
+
124
+ [More Information Needed]
125
+
126
+ #### Metrics
127
+
128
+ <!-- These are the evaluation metrics being used, ideally with a description of why. -->
129
+
130
+ [More Information Needed]
131
+
132
+ ### Results
133
+
134
+ [More Information Needed]
135
+
136
+ #### Summary
137
+
138
+
139
+
140
+ ## Model Examination [optional]
141
+
142
+ <!-- Relevant interpretability work for the model goes here -->
143
+
144
+ [More Information Needed]
145
+
146
+ ## Environmental Impact
147
+
148
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
149
+
150
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
151
+
152
+ - **Hardware Type:** [More Information Needed]
153
+ - **Hours used:** [More Information Needed]
154
+ - **Cloud Provider:** [More Information Needed]
155
+ - **Compute Region:** [More Information Needed]
156
+ - **Carbon Emitted:** [More Information Needed]
157
+
158
+ ## Technical Specifications [optional]
159
+
160
+ ### Model Architecture and Objective
161
+
162
+ [More Information Needed]
163
+
164
+ ### Compute Infrastructure
165
+
166
+ [More Information Needed]
167
+
168
+ #### Hardware
169
+
170
+ [More Information Needed]
171
+
172
+ #### Software
173
+
174
+ [More Information Needed]
175
+
176
+ ## Citation [optional]
177
+
178
+ <!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
179
+
180
+ **BibTeX:**
181
+
182
+ [More Information Needed]
183
+
184
+ **APA:**
185
+
186
+ [More Information Needed]
187
+
188
+ ## Glossary [optional]
189
+
190
+ <!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
191
+
192
+ [More Information Needed]
193
+
194
+ ## More Information [optional]
195
+
196
+ [More Information Needed]
197
+
198
+ ## Model Card Authors [optional]
199
+
200
+ [More Information Needed]
201
+
202
+ ## Model Card Contact
203
+
204
+ [More Information Needed]
205
+ ### Framework versions
206
+
207
+ - PEFT 0.18.1
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/adapter_config.json ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alora_invocation_tokens": null,
3
+ "alpha_pattern": {},
4
+ "arrow_config": null,
5
+ "auto_mapping": null,
6
+ "base_model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
7
+ "bias": "none",
8
+ "corda_config": null,
9
+ "ensure_weight_tying": false,
10
+ "eva_config": null,
11
+ "exclude_modules": null,
12
+ "fan_in_fan_out": false,
13
+ "inference_mode": true,
14
+ "init_lora_weights": true,
15
+ "layer_replication": null,
16
+ "layers_pattern": null,
17
+ "layers_to_transform": null,
18
+ "loftq_config": {},
19
+ "lora_alpha": 128,
20
+ "lora_bias": false,
21
+ "lora_dropout": 0.05,
22
+ "megatron_config": null,
23
+ "megatron_core": "megatron.core",
24
+ "modules_to_save": null,
25
+ "peft_type": "LORA",
26
+ "peft_version": "0.18.1",
27
+ "qalora_group_size": 16,
28
+ "r": 16,
29
+ "rank_pattern": {},
30
+ "revision": null,
31
+ "target_modules": [
32
+ "q_proj",
33
+ "down_proj",
34
+ "up_proj",
35
+ "v_proj",
36
+ "gate_proj",
37
+ "o_proj",
38
+ "k_proj"
39
+ ],
40
+ "target_parameters": null,
41
+ "task_type": "CAUSAL_LM",
42
+ "trainable_token_indices": null,
43
+ "use_dora": false,
44
+ "use_qalora": false,
45
+ "use_rslora": false
46
+ }
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/adapter_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfd4d760f29b46310a656dda6324b650093e68aa899d807abd642782734615a9
3
+ size 504133205