Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +24 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/README.md +207 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/adapter_config.json +46 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/adapter_model.bin +3 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/added_tokens.json +24 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/chat_template.jinja +54 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/merges.txt +0 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/special_tokens_map.json +25 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer.json +3 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer_config.json +207 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/vocab.json +0 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/added_tokens.json +24 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/args.json +1 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/chat_template.jinja +54 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/config.json +58 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/eval/0/answers.jsonl +0 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/generation_config.json +14 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/log.txt +44 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/merges.txt +0 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/model.safetensors +3 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/special_tokens_map.json +31 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/tokenizer.json +3 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/tokenizer_config.json +207 -0
- qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/vocab.json +0 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/args.json +1 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/eval/0/answers.jsonl +0 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/log.txt +234 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/README.md +207 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/adapter_config.json +46 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/adapter_model.bin +3 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/added_tokens.json +24 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/chat_template.jinja +54 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/merges.txt +0 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/special_tokens_map.json +25 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/tokenizer.json +3 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/tokenizer_config.json +207 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/vocab.json +0 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/README.md +207 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/adapter_config.json +46 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/adapter_model.bin +3 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/added_tokens.json +24 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/chat_template.jinja +54 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/merges.txt +0 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/special_tokens_map.json +25 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/tokenizer.json +3 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/tokenizer_config.json +207 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/vocab.json +0 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/README.md +207 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/adapter_config.json +46 -0
- qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/adapter_model.bin +3 -0
.gitattributes
CHANGED
|
@@ -100,3 +100,27 @@ eval_results/vllm/qwen2.5-1.5B-it-nnm0.1_K128_L4_epoch1_lr1e-4_kdr1.0-1246/resul
|
|
| 100 |
qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch1_lr1e-4_kdr1.0/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 101 |
layer_analysis/combined.png filter=lfs diff=lfs merge=lfs -text
|
| 102 |
layer_analysis/curvature.png filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch1_lr1e-4_kdr1.0/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 101 |
layer_analysis/combined.png filter=lfs diff=lfs merge=lfs -text
|
| 102 |
layer_analysis/curvature.png filter=lfs diff=lfs merge=lfs -text
|
| 103 |
+
qwen2.5-1.5B-Instruct\#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 104 |
+
qwen2.5-1.5B-Instruct\#amid/ab_pr_0.5_0.5_4_1e-4/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 105 |
+
qwen2.5-1.5B-Instruct\#csd/ab_pr_0.5_0.5_8_1e-4/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 106 |
+
qwen2.5-1.5B-Instruct\#csd/ab_pr_0.5_0.5_8_1e-4/4984/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 107 |
+
qwen2.5-1.5B-Instruct\#csd/ab_pr_0.5_0.5_8_1e-4/7476/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 108 |
+
qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch1_lr1e-4_kdr1.0/1246_full/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 109 |
+
qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch2_lr1e-4_kdr1.0/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 110 |
+
qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch2_lr1e-4_kdr1.0/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 111 |
+
qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.9_K128_L4_epoch2_lr1e-4_kdr0.75/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 112 |
+
qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.9_K128_L4_epoch2_lr1e-4_kdr0.75/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 113 |
+
qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm1.0_K128_L4_epoch1_lr1e-4_kdr1.0/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 114 |
+
qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm1.0_K128_L4_epoch2_lr1e-4_kdr0.75/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 115 |
+
qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm1.0_K128_L4_epoch2_lr1e-4_kdr0.75/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 116 |
+
qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm1.0_K128_L4_epoch2_lr1e-4_kdr0.75/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 117 |
+
qwen3-1.7B\#amid/ab_pr_0.5_0.5_4_1e-4/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 118 |
+
qwen3-1.7B\#amid/ab_pr_0.5_0.5_4_1e-4/4984/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 119 |
+
qwen3-1.7B\#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 120 |
+
qwen3-1.7B\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch1_lr1e-4_kdr0.75/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 121 |
+
qwen3-1.7B\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch1_lr1e-4_kdr1.0/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 122 |
+
qwen3-1.7B\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch2_lr1e-4_kdr1.0/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 123 |
+
qwen3-1.7B\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch2_lr1e-4_kdr1.0/4984/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 124 |
+
qwen3-1.7B\#sfkl_nnm_lora/nnm0.9_K128_L4_epoch2_lr1e-4_kdr0.75/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 125 |
+
qwen3-1.7B\#sfkl_nnm_lora/nnm0.9_K128_L4_epoch2_lr1e-4_kdr0.75/4984/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 126 |
+
qwen3-1.7B\#sfkl_nnm_lora/nnm1.0_K128_L4_epoch1_lr1e-4_kdr1.0/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/README.md
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: Qwen/Qwen2.5-1.5B-Instruct
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:Qwen/Qwen2.5-1.5B-Instruct
|
| 7 |
+
- lora
|
| 8 |
+
- transformers
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Model Card for Model ID
|
| 12 |
+
|
| 13 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
## Model Details
|
| 18 |
+
|
| 19 |
+
### Model Description
|
| 20 |
+
|
| 21 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
- **Developed by:** [More Information Needed]
|
| 26 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 27 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 28 |
+
- **Model type:** [More Information Needed]
|
| 29 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 30 |
+
- **License:** [More Information Needed]
|
| 31 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 32 |
+
|
| 33 |
+
### Model Sources [optional]
|
| 34 |
+
|
| 35 |
+
<!-- Provide the basic links for the model. -->
|
| 36 |
+
|
| 37 |
+
- **Repository:** [More Information Needed]
|
| 38 |
+
- **Paper [optional]:** [More Information Needed]
|
| 39 |
+
- **Demo [optional]:** [More Information Needed]
|
| 40 |
+
|
| 41 |
+
## Uses
|
| 42 |
+
|
| 43 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 44 |
+
|
| 45 |
+
### Direct Use
|
| 46 |
+
|
| 47 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 48 |
+
|
| 49 |
+
[More Information Needed]
|
| 50 |
+
|
| 51 |
+
### Downstream Use [optional]
|
| 52 |
+
|
| 53 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 54 |
+
|
| 55 |
+
[More Information Needed]
|
| 56 |
+
|
| 57 |
+
### Out-of-Scope Use
|
| 58 |
+
|
| 59 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 60 |
+
|
| 61 |
+
[More Information Needed]
|
| 62 |
+
|
| 63 |
+
## Bias, Risks, and Limitations
|
| 64 |
+
|
| 65 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 66 |
+
|
| 67 |
+
[More Information Needed]
|
| 68 |
+
|
| 69 |
+
### Recommendations
|
| 70 |
+
|
| 71 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 72 |
+
|
| 73 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 74 |
+
|
| 75 |
+
## How to Get Started with the Model
|
| 76 |
+
|
| 77 |
+
Use the code below to get started with the model.
|
| 78 |
+
|
| 79 |
+
[More Information Needed]
|
| 80 |
+
|
| 81 |
+
## Training Details
|
| 82 |
+
|
| 83 |
+
### Training Data
|
| 84 |
+
|
| 85 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 86 |
+
|
| 87 |
+
[More Information Needed]
|
| 88 |
+
|
| 89 |
+
### Training Procedure
|
| 90 |
+
|
| 91 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 92 |
+
|
| 93 |
+
#### Preprocessing [optional]
|
| 94 |
+
|
| 95 |
+
[More Information Needed]
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
#### Training Hyperparameters
|
| 99 |
+
|
| 100 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 101 |
+
|
| 102 |
+
#### Speeds, Sizes, Times [optional]
|
| 103 |
+
|
| 104 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 105 |
+
|
| 106 |
+
[More Information Needed]
|
| 107 |
+
|
| 108 |
+
## Evaluation
|
| 109 |
+
|
| 110 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 111 |
+
|
| 112 |
+
### Testing Data, Factors & Metrics
|
| 113 |
+
|
| 114 |
+
#### Testing Data
|
| 115 |
+
|
| 116 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 117 |
+
|
| 118 |
+
[More Information Needed]
|
| 119 |
+
|
| 120 |
+
#### Factors
|
| 121 |
+
|
| 122 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 123 |
+
|
| 124 |
+
[More Information Needed]
|
| 125 |
+
|
| 126 |
+
#### Metrics
|
| 127 |
+
|
| 128 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 129 |
+
|
| 130 |
+
[More Information Needed]
|
| 131 |
+
|
| 132 |
+
### Results
|
| 133 |
+
|
| 134 |
+
[More Information Needed]
|
| 135 |
+
|
| 136 |
+
#### Summary
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
## Model Examination [optional]
|
| 141 |
+
|
| 142 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 143 |
+
|
| 144 |
+
[More Information Needed]
|
| 145 |
+
|
| 146 |
+
## Environmental Impact
|
| 147 |
+
|
| 148 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 149 |
+
|
| 150 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 151 |
+
|
| 152 |
+
- **Hardware Type:** [More Information Needed]
|
| 153 |
+
- **Hours used:** [More Information Needed]
|
| 154 |
+
- **Cloud Provider:** [More Information Needed]
|
| 155 |
+
- **Compute Region:** [More Information Needed]
|
| 156 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 157 |
+
|
| 158 |
+
## Technical Specifications [optional]
|
| 159 |
+
|
| 160 |
+
### Model Architecture and Objective
|
| 161 |
+
|
| 162 |
+
[More Information Needed]
|
| 163 |
+
|
| 164 |
+
### Compute Infrastructure
|
| 165 |
+
|
| 166 |
+
[More Information Needed]
|
| 167 |
+
|
| 168 |
+
#### Hardware
|
| 169 |
+
|
| 170 |
+
[More Information Needed]
|
| 171 |
+
|
| 172 |
+
#### Software
|
| 173 |
+
|
| 174 |
+
[More Information Needed]
|
| 175 |
+
|
| 176 |
+
## Citation [optional]
|
| 177 |
+
|
| 178 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 179 |
+
|
| 180 |
+
**BibTeX:**
|
| 181 |
+
|
| 182 |
+
[More Information Needed]
|
| 183 |
+
|
| 184 |
+
**APA:**
|
| 185 |
+
|
| 186 |
+
[More Information Needed]
|
| 187 |
+
|
| 188 |
+
## Glossary [optional]
|
| 189 |
+
|
| 190 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 191 |
+
|
| 192 |
+
[More Information Needed]
|
| 193 |
+
|
| 194 |
+
## More Information [optional]
|
| 195 |
+
|
| 196 |
+
[More Information Needed]
|
| 197 |
+
|
| 198 |
+
## Model Card Authors [optional]
|
| 199 |
+
|
| 200 |
+
[More Information Needed]
|
| 201 |
+
|
| 202 |
+
## Model Card Contact
|
| 203 |
+
|
| 204 |
+
[More Information Needed]
|
| 205 |
+
### Framework versions
|
| 206 |
+
|
| 207 |
+
- PEFT 0.18.1
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/adapter_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 128,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.05,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": null,
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 16,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": [
|
| 32 |
+
"q_proj",
|
| 33 |
+
"gate_proj",
|
| 34 |
+
"down_proj",
|
| 35 |
+
"up_proj",
|
| 36 |
+
"v_proj",
|
| 37 |
+
"k_proj",
|
| 38 |
+
"o_proj"
|
| 39 |
+
],
|
| 40 |
+
"target_parameters": null,
|
| 41 |
+
"task_type": "CAUSAL_LM",
|
| 42 |
+
"trainable_token_indices": null,
|
| 43 |
+
"use_dora": false,
|
| 44 |
+
"use_qalora": false,
|
| 45 |
+
"use_rslora": false
|
| 46 |
+
}
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:757c433b9241ddd9c09d5aeeb342f38b8298d8a5f6287556aa81da1ed75da682
|
| 3 |
+
size 504133205
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/added_tokens.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</tool_call>": 151658,
|
| 3 |
+
"<tool_call>": 151657,
|
| 4 |
+
"<|box_end|>": 151649,
|
| 5 |
+
"<|box_start|>": 151648,
|
| 6 |
+
"<|endoftext|>": 151643,
|
| 7 |
+
"<|file_sep|>": 151664,
|
| 8 |
+
"<|fim_middle|>": 151660,
|
| 9 |
+
"<|fim_pad|>": 151662,
|
| 10 |
+
"<|fim_prefix|>": 151659,
|
| 11 |
+
"<|fim_suffix|>": 151661,
|
| 12 |
+
"<|im_end|>": 151645,
|
| 13 |
+
"<|im_start|>": 151644,
|
| 14 |
+
"<|image_pad|>": 151655,
|
| 15 |
+
"<|object_ref_end|>": 151647,
|
| 16 |
+
"<|object_ref_start|>": 151646,
|
| 17 |
+
"<|quad_end|>": 151651,
|
| 18 |
+
"<|quad_start|>": 151650,
|
| 19 |
+
"<|repo_name|>": 151663,
|
| 20 |
+
"<|video_pad|>": 151656,
|
| 21 |
+
"<|vision_end|>": 151653,
|
| 22 |
+
"<|vision_pad|>": 151654,
|
| 23 |
+
"<|vision_start|>": 151652
|
| 24 |
+
}
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/chat_template.jinja
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0]['role'] == 'system' %}
|
| 4 |
+
{{- messages[0]['content'] }}
|
| 5 |
+
{%- else %}
|
| 6 |
+
{{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
|
| 7 |
+
{%- endif %}
|
| 8 |
+
{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 9 |
+
{%- for tool in tools %}
|
| 10 |
+
{{- "\n" }}
|
| 11 |
+
{{- tool | tojson }}
|
| 12 |
+
{%- endfor %}
|
| 13 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 14 |
+
{%- else %}
|
| 15 |
+
{%- if messages[0]['role'] == 'system' %}
|
| 16 |
+
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
|
| 17 |
+
{%- else %}
|
| 18 |
+
{{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
|
| 19 |
+
{%- endif %}
|
| 20 |
+
{%- endif %}
|
| 21 |
+
{%- for message in messages %}
|
| 22 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
|
| 23 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 24 |
+
{%- elif message.role == "assistant" %}
|
| 25 |
+
{{- '<|im_start|>' + message.role }}
|
| 26 |
+
{%- if message.content %}
|
| 27 |
+
{{- '\n' + message.content }}
|
| 28 |
+
{%- endif %}
|
| 29 |
+
{%- for tool_call in message.tool_calls %}
|
| 30 |
+
{%- if tool_call.function is defined %}
|
| 31 |
+
{%- set tool_call = tool_call.function %}
|
| 32 |
+
{%- endif %}
|
| 33 |
+
{{- '\n<tool_call>\n{"name": "' }}
|
| 34 |
+
{{- tool_call.name }}
|
| 35 |
+
{{- '", "arguments": ' }}
|
| 36 |
+
{{- tool_call.arguments | tojson }}
|
| 37 |
+
{{- '}\n</tool_call>' }}
|
| 38 |
+
{%- endfor %}
|
| 39 |
+
{{- '<|im_end|>\n' }}
|
| 40 |
+
{%- elif message.role == "tool" %}
|
| 41 |
+
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
|
| 42 |
+
{{- '<|im_start|>user' }}
|
| 43 |
+
{%- endif %}
|
| 44 |
+
{{- '\n<tool_response>\n' }}
|
| 45 |
+
{{- message.content }}
|
| 46 |
+
{{- '\n</tool_response>' }}
|
| 47 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 48 |
+
{{- '<|im_end|>\n' }}
|
| 49 |
+
{%- endif %}
|
| 50 |
+
{%- endif %}
|
| 51 |
+
{%- endfor %}
|
| 52 |
+
{%- if add_generation_prompt %}
|
| 53 |
+
{{- '<|im_start|>assistant\n' }}
|
| 54 |
+
{%- endif %}
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/special_tokens_map.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>"
|
| 16 |
+
],
|
| 17 |
+
"eos_token": {
|
| 18 |
+
"content": "<|im_end|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
},
|
| 24 |
+
"pad_token": "<|im_end|>"
|
| 25 |
+
}
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
|
| 3 |
+
size 11421896
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer_config.json
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
}
|
| 181 |
+
},
|
| 182 |
+
"additional_special_tokens": [
|
| 183 |
+
"<|im_start|>",
|
| 184 |
+
"<|im_end|>",
|
| 185 |
+
"<|object_ref_start|>",
|
| 186 |
+
"<|object_ref_end|>",
|
| 187 |
+
"<|box_start|>",
|
| 188 |
+
"<|box_end|>",
|
| 189 |
+
"<|quad_start|>",
|
| 190 |
+
"<|quad_end|>",
|
| 191 |
+
"<|vision_start|>",
|
| 192 |
+
"<|vision_end|>",
|
| 193 |
+
"<|vision_pad|>",
|
| 194 |
+
"<|image_pad|>",
|
| 195 |
+
"<|video_pad|>"
|
| 196 |
+
],
|
| 197 |
+
"bos_token": null,
|
| 198 |
+
"clean_up_tokenization_spaces": false,
|
| 199 |
+
"eos_token": "<|im_end|>",
|
| 200 |
+
"errors": "replace",
|
| 201 |
+
"extra_special_tokens": {},
|
| 202 |
+
"model_max_length": 131072,
|
| 203 |
+
"pad_token": "<|im_end|>",
|
| 204 |
+
"split_special_tokens": false,
|
| 205 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 206 |
+
"unk_token": null
|
| 207 |
+
}
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/added_tokens.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</tool_call>": 151658,
|
| 3 |
+
"<tool_call>": 151657,
|
| 4 |
+
"<|box_end|>": 151649,
|
| 5 |
+
"<|box_start|>": 151648,
|
| 6 |
+
"<|endoftext|>": 151643,
|
| 7 |
+
"<|file_sep|>": 151664,
|
| 8 |
+
"<|fim_middle|>": 151660,
|
| 9 |
+
"<|fim_pad|>": 151662,
|
| 10 |
+
"<|fim_prefix|>": 151659,
|
| 11 |
+
"<|fim_suffix|>": 151661,
|
| 12 |
+
"<|im_end|>": 151645,
|
| 13 |
+
"<|im_start|>": 151644,
|
| 14 |
+
"<|image_pad|>": 151655,
|
| 15 |
+
"<|object_ref_end|>": 151647,
|
| 16 |
+
"<|object_ref_start|>": 151646,
|
| 17 |
+
"<|quad_end|>": 151651,
|
| 18 |
+
"<|quad_start|>": 151650,
|
| 19 |
+
"<|repo_name|>": 151663,
|
| 20 |
+
"<|video_pad|>": 151656,
|
| 21 |
+
"<|vision_end|>": 151653,
|
| 22 |
+
"<|vision_pad|>": 151654,
|
| 23 |
+
"<|vision_start|>": 151652
|
| 24 |
+
}
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/args.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"model_path": "Qwen/Qwen2.5-1.5B-Instruct", "ckpt_name": "qwen2.5-1.5B-Instruct", "model_type": "gpt2", "teacher_model_type": null, "n_gpu": 2, "n_nodes": 1, "teacher_model_path": "Qwen/Qwen2.5-14B-Instruct", "teacher_ckpt_name": "qwen2.5-14B-Instruct", "teacher_model_fp16": true, "model_parallel": false, "model_parallel_size": null, "no_value": false, "dropout_path_rate": null, "fp32": false, "type": "adaptive-csd", "do_train": true, "do_valid": true, "do_eval": false, "base_path": ".", "load": null, "save": "./results/qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4", "log_interval": 10, "mid_log_num": -1, "save_interval": -1, "eval_interval": -1, "local_rank": 0, "save_additional_suffix": "", "save_rollout": false, "eb_sample_times": 3, "data_dir": "./processed_data/ultraInteract/Qwen/Qwen2.5-14B-Instruct/", "processed_data_dir": null, "force_process": false, "force_process_demo": false, "data_process_workers": -1, "train_num": -1, "train_ratio": 1, "dev_num": -1, "dev_ratio": 1, "gen_num": -1, "data_names": null, "prompt_type": null, "num_workers": 4, "max_prompt_length": 512, "min_prompt_length": 128, "json_data": false, "bin_data": false, "txt_data": false, "prompt_data_dir": null, "lm_data_dir": null, "eval_ppl": false, "eval_rw": false, "eval_gen": true, "only_prompt": false, "batch_size": 4, "eval_batch_size": 16, "clip_grad": 1.0, "total_iters": null, "train_iters_per_epoch": -1, "max_length": 1024, "seed": 10, "seed_order": 42, "seed_data": 42, "seed_ppo": 42, "seed_lm": 7, "epochs": 3, "training_epochs": 10000, "gradient_accumulation_steps": 2, "gradient_checkpointing": false, "attn_dtype": null, "lr": 0.0001, "lr_min": 1e-07, "weight_decay": 0.01, "loss_scale": 65536, "kd_ratio": 1.0, "warmup_iters": 0, "lr_decay_iters": null, "lr_decay_style": "cosine", "scheduler_name": "constant_trm", "reward_scaling": null, "cliprange_reward": 1, "ppo_epochs": null, "num_rollouts": 256, "num_rollouts_per_device": null, "cliprange": 0.2, "chunk_size": null, "gamma": 0.95, "length_norm": false, "single_step_reg": false, "teacher_mixed_alpha": null, "lm_coef": 1, "skew_alpha": 0.1, "student_gen": true, "gen_top_p": 1.0, "gen_num_beams": 1, "mixed_alpha": 0.5, "loss_eps": 0.1, "init_threshold": 0.0, "capacity": 1000, "replay_ratio": "decreasing", "delta_threshold": 0.1, "top_k": 0, "top_p": 1.0, "do_sample": true, "no_repeat_ngram_size": 6, "repetition_penalty": null, "num_beams": 1, "temperature": 1.0, "peft": "lora", "peft_lora_r": 16, "peft_lora_alpha": 128, "peft_lora_dropout": 0.05, "peft_name": null, "peft_path": null, "teacher_peft_name": null, "teacher_peft_path": null, "deepspeed": true, "deepspeed_config": "./configs/deepspeed/ds_config_zero0_bf16.json", "deepscale": false, "deepscale_config": null, "ab_alpha": 0.5, "ab_beta": 0.5, "amid_div_name": "ab", "amid_div_order": "pr", "amid_alpha": 0.5, "amid_lam": 0.5, "nnm": true, "nnm_ratio": 0.1, "nnm_n_layers": 4, "nnm_K": 128, "nnm_eta": 0.05, "nnm_T_dead": 50, "nnm_centroid_batches": 500, "nnm_d_prime": 256, "nnm_ns_iters": 5, "nnm_warmup_steps": 0, "nnm_ramp_steps": 0, "rank": 0, "world_size": 2}
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/chat_template.jinja
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0]['role'] == 'system' %}
|
| 4 |
+
{{- messages[0]['content'] }}
|
| 5 |
+
{%- else %}
|
| 6 |
+
{{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
|
| 7 |
+
{%- endif %}
|
| 8 |
+
{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 9 |
+
{%- for tool in tools %}
|
| 10 |
+
{{- "\n" }}
|
| 11 |
+
{{- tool | tojson }}
|
| 12 |
+
{%- endfor %}
|
| 13 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 14 |
+
{%- else %}
|
| 15 |
+
{%- if messages[0]['role'] == 'system' %}
|
| 16 |
+
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
|
| 17 |
+
{%- else %}
|
| 18 |
+
{{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
|
| 19 |
+
{%- endif %}
|
| 20 |
+
{%- endif %}
|
| 21 |
+
{%- for message in messages %}
|
| 22 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
|
| 23 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 24 |
+
{%- elif message.role == "assistant" %}
|
| 25 |
+
{{- '<|im_start|>' + message.role }}
|
| 26 |
+
{%- if message.content %}
|
| 27 |
+
{{- '\n' + message.content }}
|
| 28 |
+
{%- endif %}
|
| 29 |
+
{%- for tool_call in message.tool_calls %}
|
| 30 |
+
{%- if tool_call.function is defined %}
|
| 31 |
+
{%- set tool_call = tool_call.function %}
|
| 32 |
+
{%- endif %}
|
| 33 |
+
{{- '\n<tool_call>\n{"name": "' }}
|
| 34 |
+
{{- tool_call.name }}
|
| 35 |
+
{{- '", "arguments": ' }}
|
| 36 |
+
{{- tool_call.arguments | tojson }}
|
| 37 |
+
{{- '}\n</tool_call>' }}
|
| 38 |
+
{%- endfor %}
|
| 39 |
+
{{- '<|im_end|>\n' }}
|
| 40 |
+
{%- elif message.role == "tool" %}
|
| 41 |
+
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
|
| 42 |
+
{{- '<|im_start|>user' }}
|
| 43 |
+
{%- endif %}
|
| 44 |
+
{{- '\n<tool_response>\n' }}
|
| 45 |
+
{{- message.content }}
|
| 46 |
+
{{- '\n</tool_response>' }}
|
| 47 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 48 |
+
{{- '<|im_end|>\n' }}
|
| 49 |
+
{%- endif %}
|
| 50 |
+
{%- endif %}
|
| 51 |
+
{%- endfor %}
|
| 52 |
+
{%- if add_generation_prompt %}
|
| 53 |
+
{{- '<|im_start|>assistant\n' }}
|
| 54 |
+
{%- endif %}
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/config.json
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"architectures": [
|
| 3 |
+
"Qwen2ForCausalLM"
|
| 4 |
+
],
|
| 5 |
+
"attention_dropout": 0.0,
|
| 6 |
+
"bos_token_id": 151643,
|
| 7 |
+
"dtype": "float16",
|
| 8 |
+
"eos_token_id": 151645,
|
| 9 |
+
"hidden_act": "silu",
|
| 10 |
+
"hidden_size": 1536,
|
| 11 |
+
"initializer_range": 0.02,
|
| 12 |
+
"intermediate_size": 8960,
|
| 13 |
+
"layer_types": [
|
| 14 |
+
"full_attention",
|
| 15 |
+
"full_attention",
|
| 16 |
+
"full_attention",
|
| 17 |
+
"full_attention",
|
| 18 |
+
"full_attention",
|
| 19 |
+
"full_attention",
|
| 20 |
+
"full_attention",
|
| 21 |
+
"full_attention",
|
| 22 |
+
"full_attention",
|
| 23 |
+
"full_attention",
|
| 24 |
+
"full_attention",
|
| 25 |
+
"full_attention",
|
| 26 |
+
"full_attention",
|
| 27 |
+
"full_attention",
|
| 28 |
+
"full_attention",
|
| 29 |
+
"full_attention",
|
| 30 |
+
"full_attention",
|
| 31 |
+
"full_attention",
|
| 32 |
+
"full_attention",
|
| 33 |
+
"full_attention",
|
| 34 |
+
"full_attention",
|
| 35 |
+
"full_attention",
|
| 36 |
+
"full_attention",
|
| 37 |
+
"full_attention",
|
| 38 |
+
"full_attention",
|
| 39 |
+
"full_attention",
|
| 40 |
+
"full_attention",
|
| 41 |
+
"full_attention"
|
| 42 |
+
],
|
| 43 |
+
"max_position_embeddings": 32768,
|
| 44 |
+
"max_window_layers": 21,
|
| 45 |
+
"model_type": "qwen2",
|
| 46 |
+
"num_attention_heads": 12,
|
| 47 |
+
"num_hidden_layers": 28,
|
| 48 |
+
"num_key_value_heads": 2,
|
| 49 |
+
"rms_norm_eps": 1e-06,
|
| 50 |
+
"rope_scaling": null,
|
| 51 |
+
"rope_theta": 1000000.0,
|
| 52 |
+
"sliding_window": null,
|
| 53 |
+
"tie_word_embeddings": true,
|
| 54 |
+
"transformers_version": "4.57.3",
|
| 55 |
+
"use_cache": true,
|
| 56 |
+
"use_sliding_window": false,
|
| 57 |
+
"vocab_size": 152064
|
| 58 |
+
}
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/eval/0/answers.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/generation_config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token_id": 151643,
|
| 3 |
+
"do_sample": true,
|
| 4 |
+
"eos_token_id": [
|
| 5 |
+
151645,
|
| 6 |
+
151643
|
| 7 |
+
],
|
| 8 |
+
"pad_token_id": 151643,
|
| 9 |
+
"repetition_penalty": 1.1,
|
| 10 |
+
"temperature": 0.7,
|
| 11 |
+
"top_k": 20,
|
| 12 |
+
"top_p": 0.8,
|
| 13 |
+
"transformers_version": "4.57.3"
|
| 14 |
+
}
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/log.txt
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
============================== EXP at 2026-05-17 08:31:39 ==============================
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
============================== EXP at 2026-05-17 08:32:21 ==============================
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
============================== EXP at 2026-05-17 08:45:44 ==============================
|
| 10 |
+
dev | avg_loss: 1.9095982142857142 | {'exact_match': 0.0, 'rougeL': 6.7578} | threshold: 0.0
|
| 11 |
+
train | epoch 0 | Iter: 18/ 29904 | global iter: 10/ 14952 | loss: -0.3565 | ds_loss: -0.3565 | lr: 1.0000e-04 | scale: 1.0000 | micro time: 0.457 | step time: 0.831
|
| 12 |
+
train | epoch 0 | Iter: 38/ 29904 | global iter: 20/ 14952 | loss: -0.1421 | ds_loss: -0.1421 | lr: 1.0000e-04 | scale: 1.0000 | micro time: 0.458 | step time: 0.890
|
| 13 |
+
train | epoch 0 | Iter: 58/ 29904 | global iter: 30/ 14952 | loss: -0.1540 | ds_loss: -0.1540 | lr: 9.9999e-05 | scale: 1.0000 | micro time: 0.461 | step time: 0.890
|
| 14 |
+
train | epoch 0 | Iter: 78/ 29904 | global iter: 40/ 14952 | loss: -0.0845 | ds_loss: -0.0845 | lr: 9.9998e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.887
|
| 15 |
+
train | epoch 0 | Iter: 98/ 29904 | global iter: 50/ 14952 | loss: -0.0781 | ds_loss: -0.0781 | lr: 9.9997e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.888
|
| 16 |
+
train | epoch 0 | Iter: 118/ 29904 | global iter: 60/ 14952 | loss: -0.0858 | ds_loss: -0.0858 | lr: 9.9996e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.888
|
| 17 |
+
train | epoch 0 | Iter: 138/ 29904 | global iter: 70/ 14952 | loss: -0.0648 | ds_loss: -0.0648 | lr: 9.9995e-05 | scale: 1.0000 | micro time: 0.459 | step time: 0.889
|
| 18 |
+
train | epoch 0 | Iter: 158/ 29904 | global iter: 80/ 14952 | loss: -0.0911 | ds_loss: -0.0911 | lr: 9.9993e-05 | scale: 1.0000 | micro time: 0.463 | step time: 0.891
|
| 19 |
+
train | epoch 0 | Iter: 178/ 29904 | global iter: 90/ 14952 | loss: -0.0619 | ds_loss: -0.0619 | lr: 9.9991e-05 | scale: 1.0000 | micro time: 0.465 | step time: 0.893
|
| 20 |
+
train | epoch 0 | Iter: 198/ 29904 | global iter: 100/ 14952 | loss: -0.0579 | ds_loss: -0.0579 | lr: 9.9989e-05 | scale: 1.0000 | micro time: 0.459 | step time: 0.889
|
| 21 |
+
train | epoch 0 | Iter: 218/ 29904 | global iter: 110/ 14952 | loss: -0.0713 | ds_loss: -0.0713 | lr: 9.9987e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.891
|
| 22 |
+
train | epoch 0 | Iter: 238/ 29904 | global iter: 120/ 14952 | loss: -0.0651 | ds_loss: -0.0651 | lr: 9.9984e-05 | scale: 1.0000 | micro time: 0.460 | step time: 0.890
|
| 23 |
+
train | epoch 0 | Iter: 258/ 29904 | global iter: 130/ 14952 | loss: -0.0634 | ds_loss: -0.0634 | lr: 9.9982e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.887
|
| 24 |
+
train | epoch 0 | Iter: 278/ 29904 | global iter: 140/ 14952 | loss: -0.0743 | ds_loss: -0.0743 | lr: 9.9979e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.886
|
| 25 |
+
train | epoch 0 | Iter: 298/ 29904 | global iter: 150/ 14952 | loss: -0.0756 | ds_loss: -0.0756 | lr: 9.9976e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.885
|
| 26 |
+
train | epoch 0 | Iter: 318/ 29904 | global iter: 160/ 14952 | loss: -0.0628 | ds_loss: -0.0628 | lr: 9.9972e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.885
|
| 27 |
+
train | epoch 0 | Iter: 338/ 29904 | global iter: 170/ 14952 | loss: -0.0577 | ds_loss: -0.0577 | lr: 9.9969e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.885
|
| 28 |
+
train | epoch 0 | Iter: 358/ 29904 | global iter: 180/ 14952 | loss: -0.0550 | ds_loss: -0.0550 | lr: 9.9965e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.887
|
| 29 |
+
train | epoch 0 | Iter: 378/ 29904 | global iter: 190/ 14952 | loss: -0.0704 | ds_loss: -0.0704 | lr: 9.9961e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.886
|
| 30 |
+
train | epoch 0 | Iter: 398/ 29904 | global iter: 200/ 14952 | loss: -0.0640 | ds_loss: -0.0640 | lr: 9.9956e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.886
|
| 31 |
+
train | epoch 0 | Iter: 418/ 29904 | global iter: 210/ 14952 | loss: -0.0508 | ds_loss: -0.0508 | lr: 9.9952e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.893
|
| 32 |
+
train | epoch 0 | Iter: 438/ 29904 | global iter: 220/ 14952 | loss: -0.0819 | ds_loss: -0.0819 | lr: 9.9947e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.886
|
| 33 |
+
train | epoch 0 | Iter: 458/ 29904 | global iter: 230/ 14952 | loss: -0.0545 | ds_loss: -0.0545 | lr: 9.9942e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.884
|
| 34 |
+
train | epoch 0 | Iter: 478/ 29904 | global iter: 240/ 14952 | loss: -0.0610 | ds_loss: -0.0610 | lr: 9.9937e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.884
|
| 35 |
+
train | epoch 0 | Iter: 498/ 29904 | global iter: 250/ 14952 | loss: -0.0512 | ds_loss: -0.0512 | lr: 9.9932e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.884
|
| 36 |
+
train | epoch 0 | Iter: 518/ 29904 | global iter: 260/ 14952 | loss: -0.0642 | ds_loss: -0.0642 | lr: 9.9926e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.882
|
| 37 |
+
train | epoch 0 | Iter: 538/ 29904 | global iter: 270/ 14952 | loss: -0.0743 | ds_loss: -0.0743 | lr: 9.9920e-05 | scale: 1.0000 | micro time: 0.463 | step time: 0.888
|
| 38 |
+
train | epoch 0 | Iter: 558/ 29904 | global iter: 280/ 14952 | loss: -0.0724 | ds_loss: -0.0724 | lr: 9.9914e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.887
|
| 39 |
+
train | epoch 0 | Iter: 578/ 29904 | global iter: 290/ 14952 | loss: -0.0440 | ds_loss: -0.0440 | lr: 9.9908e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.889
|
| 40 |
+
train | epoch 0 | Iter: 598/ 29904 | global iter: 300/ 14952 | loss: -0.0607 | ds_loss: -0.0607 | lr: 9.9901e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.889
|
| 41 |
+
train | epoch 0 | Iter: 618/ 29904 | global iter: 310/ 14952 | loss: -0.0571 | ds_loss: -0.0571 | lr: 9.9895e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.888
|
| 42 |
+
train | epoch 0 | Iter: 638/ 29904 | global iter: 320/ 14952 | loss: -0.0569 | ds_loss: -0.0569 | lr: 9.9888e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.887
|
| 43 |
+
train | epoch 0 | Iter: 658/ 29904 | global iter: 330/ 14952 | loss: -0.0508 | ds_loss: -0.0508 | lr: 9.9881e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.887
|
| 44 |
+
train | epoch 0 | Iter: 678/ 29904 | global iter: 340/ 14952 | loss: -0.0679 | ds_loss: -0.0679 | lr: 9.9873e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.886
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:0f62e5312da0d56c793167b890fd7cdc2e9eb01cc7533967bfcc1023e72067c9
|
| 3 |
+
size 3087860024
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/special_tokens_map.json
ADDED
|
@@ -0,0 +1,31 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>"
|
| 16 |
+
],
|
| 17 |
+
"eos_token": {
|
| 18 |
+
"content": "<|im_end|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
},
|
| 24 |
+
"pad_token": {
|
| 25 |
+
"content": "<|endoftext|>",
|
| 26 |
+
"lstrip": false,
|
| 27 |
+
"normalized": false,
|
| 28 |
+
"rstrip": false,
|
| 29 |
+
"single_word": false
|
| 30 |
+
}
|
| 31 |
+
}
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
|
| 3 |
+
size 11421896
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/tokenizer_config.json
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
}
|
| 181 |
+
},
|
| 182 |
+
"additional_special_tokens": [
|
| 183 |
+
"<|im_start|>",
|
| 184 |
+
"<|im_end|>",
|
| 185 |
+
"<|object_ref_start|>",
|
| 186 |
+
"<|object_ref_end|>",
|
| 187 |
+
"<|box_start|>",
|
| 188 |
+
"<|box_end|>",
|
| 189 |
+
"<|quad_start|>",
|
| 190 |
+
"<|quad_end|>",
|
| 191 |
+
"<|vision_start|>",
|
| 192 |
+
"<|vision_end|>",
|
| 193 |
+
"<|vision_pad|>",
|
| 194 |
+
"<|image_pad|>",
|
| 195 |
+
"<|video_pad|>"
|
| 196 |
+
],
|
| 197 |
+
"bos_token": null,
|
| 198 |
+
"clean_up_tokenization_spaces": false,
|
| 199 |
+
"eos_token": "<|im_end|>",
|
| 200 |
+
"errors": "replace",
|
| 201 |
+
"extra_special_tokens": {},
|
| 202 |
+
"model_max_length": 131072,
|
| 203 |
+
"pad_token": "<|endoftext|>",
|
| 204 |
+
"split_special_tokens": false,
|
| 205 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 206 |
+
"unk_token": null
|
| 207 |
+
}
|
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/args.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"model_path": "Qwen/Qwen2.5-1.5B-Instruct", "ckpt_name": "qwen2.5-1.5B-Instruct", "model_type": "gpt2", "teacher_model_type": null, "n_gpu": 2, "n_nodes": 1, "teacher_model_path": "Qwen/Qwen2.5-14B-Instruct", "teacher_ckpt_name": "qwen2.5-14B-Instruct", "teacher_model_fp16": true, "model_parallel": false, "model_parallel_size": null, "no_value": false, "dropout_path_rate": null, "fp32": false, "type": "adaptive-csd", "do_train": true, "do_valid": true, "do_eval": false, "base_path": ".", "load": null, "save": "./results/qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4", "log_interval": 10, "mid_log_num": -1, "save_interval": -1, "eval_interval": -1, "local_rank": 0, "save_additional_suffix": "", "save_rollout": false, "eb_sample_times": 3, "data_dir": "./processed_data/ultraInteract/Qwen/Qwen2.5-14B-Instruct/", "processed_data_dir": null, "force_process": false, "force_process_demo": false, "data_process_workers": -1, "train_num": -1, "train_ratio": 1, "dev_num": -1, "dev_ratio": 1, "gen_num": -1, "data_names": null, "prompt_type": null, "num_workers": 4, "max_prompt_length": 512, "min_prompt_length": 128, "json_data": false, "bin_data": false, "txt_data": false, "prompt_data_dir": null, "lm_data_dir": null, "eval_ppl": false, "eval_rw": false, "eval_gen": true, "only_prompt": false, "batch_size": 4, "eval_batch_size": 16, "clip_grad": 1.0, "total_iters": null, "train_iters_per_epoch": -1, "max_length": 1024, "seed": 10, "seed_order": 42, "seed_data": 42, "seed_ppo": 42, "seed_lm": 7, "epochs": 3, "training_epochs": 10000, "gradient_accumulation_steps": 2, "gradient_checkpointing": false, "attn_dtype": null, "lr": 0.0001, "lr_min": 1e-07, "weight_decay": 0.01, "loss_scale": 65536, "kd_ratio": 1.0, "warmup_iters": 0, "lr_decay_iters": null, "lr_decay_style": "cosine", "scheduler_name": "constant_trm", "reward_scaling": null, "cliprange_reward": 1, "ppo_epochs": null, "num_rollouts": 256, "num_rollouts_per_device": null, "cliprange": 0.2, "chunk_size": null, "gamma": 0.95, "length_norm": false, "single_step_reg": false, "teacher_mixed_alpha": null, "lm_coef": 1, "skew_alpha": 0.1, "student_gen": true, "gen_top_p": 1.0, "gen_num_beams": 1, "mixed_alpha": 0.5, "loss_eps": 0.1, "init_threshold": 0.0, "capacity": 1000, "replay_ratio": "decreasing", "delta_threshold": 0.1, "top_k": 0, "top_p": 1.0, "do_sample": true, "no_repeat_ngram_size": 6, "repetition_penalty": null, "num_beams": 1, "temperature": 1.0, "peft": "lora", "peft_lora_r": 16, "peft_lora_alpha": 128, "peft_lora_dropout": 0.05, "peft_name": null, "peft_path": null, "teacher_peft_name": null, "teacher_peft_path": null, "deepspeed": true, "deepspeed_config": "./configs/deepspeed/ds_config_zero0_bf16.json", "deepscale": false, "deepscale_config": null, "ab_alpha": 0.5, "ab_beta": 0.5, "amid_div_name": "ab", "amid_div_order": "pr", "amid_alpha": 0.5, "amid_lam": 0.5, "nnm": true, "nnm_ratio": 0.1, "nnm_n_layers": 4, "nnm_K": 128, "nnm_eta": 0.05, "nnm_T_dead": 50, "nnm_centroid_batches": 500, "nnm_d_prime": 256, "nnm_ns_iters": 5, "nnm_warmup_steps": 0, "nnm_ramp_steps": 0, "rank": 0, "world_size": 2}
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/eval/0/answers.jsonl
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/log.txt
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
============================== EXP at 2026-05-17 08:55:25 ==============================
|
| 4 |
+
dev | avg_loss: 1.9095982142857142 | {'exact_match': 0.0, 'rougeL': 6.7578} | threshold: 0.0
|
| 5 |
+
train | epoch 0 | Iter: 18/ 29904 | global iter: 10/ 14952 | loss: -0.3565 | ds_loss: -0.3565 | lr: 1.0000e-04 | scale: 1.0000 | micro time: 0.451 | step time: 0.823
|
| 6 |
+
train | epoch 0 | Iter: 38/ 29904 | global iter: 20/ 14952 | loss: -0.1421 | ds_loss: -0.1421 | lr: 1.0000e-04 | scale: 1.0000 | micro time: 0.447 | step time: 0.880
|
| 7 |
+
train | epoch 0 | Iter: 58/ 29904 | global iter: 30/ 14952 | loss: -0.1540 | ds_loss: -0.1540 | lr: 9.9999e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.881
|
| 8 |
+
train | epoch 0 | Iter: 78/ 29904 | global iter: 40/ 14952 | loss: -0.0845 | ds_loss: -0.0845 | lr: 9.9998e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.878
|
| 9 |
+
train | epoch 0 | Iter: 98/ 29904 | global iter: 50/ 14952 | loss: -0.0781 | ds_loss: -0.0781 | lr: 9.9997e-05 | scale: 1.0000 | micro time: 0.446 | step time: 0.874
|
| 10 |
+
train | epoch 0 | Iter: 118/ 29904 | global iter: 60/ 14952 | loss: -0.0858 | ds_loss: -0.0858 | lr: 9.9996e-05 | scale: 1.0000 | micro time: 0.447 | step time: 0.873
|
| 11 |
+
train | epoch 0 | Iter: 138/ 29904 | global iter: 70/ 14952 | loss: -0.0648 | ds_loss: -0.0648 | lr: 9.9995e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.875
|
| 12 |
+
train | epoch 0 | Iter: 158/ 29904 | global iter: 80/ 14952 | loss: -0.0911 | ds_loss: -0.0911 | lr: 9.9993e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.876
|
| 13 |
+
train | epoch 0 | Iter: 178/ 29904 | global iter: 90/ 14952 | loss: -0.0619 | ds_loss: -0.0619 | lr: 9.9991e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.875
|
| 14 |
+
train | epoch 0 | Iter: 198/ 29904 | global iter: 100/ 14952 | loss: -0.0579 | ds_loss: -0.0579 | lr: 9.9989e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.874
|
| 15 |
+
train | epoch 0 | Iter: 218/ 29904 | global iter: 110/ 14952 | loss: -0.0713 | ds_loss: -0.0713 | lr: 9.9987e-05 | scale: 1.0000 | micro time: 0.445 | step time: 0.875
|
| 16 |
+
train | epoch 0 | Iter: 238/ 29904 | global iter: 120/ 14952 | loss: -0.0651 | ds_loss: -0.0651 | lr: 9.9984e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.874
|
| 17 |
+
train | epoch 0 | Iter: 258/ 29904 | global iter: 130/ 14952 | loss: -0.0634 | ds_loss: -0.0634 | lr: 9.9982e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.874
|
| 18 |
+
train | epoch 0 | Iter: 278/ 29904 | global iter: 140/ 14952 | loss: -0.0743 | ds_loss: -0.0743 | lr: 9.9979e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.886
|
| 19 |
+
train | epoch 0 | Iter: 298/ 29904 | global iter: 150/ 14952 | loss: -0.0756 | ds_loss: -0.0756 | lr: 9.9976e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.888
|
| 20 |
+
train | epoch 0 | Iter: 318/ 29904 | global iter: 160/ 14952 | loss: -0.0628 | ds_loss: -0.0628 | lr: 9.9972e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.883
|
| 21 |
+
train | epoch 0 | Iter: 338/ 29904 | global iter: 170/ 14952 | loss: -0.0577 | ds_loss: -0.0577 | lr: 9.9969e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.884
|
| 22 |
+
train | epoch 0 | Iter: 358/ 29904 | global iter: 180/ 14952 | loss: -0.0550 | ds_loss: -0.0550 | lr: 9.9965e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.885
|
| 23 |
+
train | epoch 0 | Iter: 378/ 29904 | global iter: 190/ 14952 | loss: -0.0704 | ds_loss: -0.0704 | lr: 9.9961e-05 | scale: 1.0000 | micro time: 0.466 | step time: 0.888
|
| 24 |
+
train | epoch 0 | Iter: 398/ 29904 | global iter: 200/ 14952 | loss: -0.0640 | ds_loss: -0.0640 | lr: 9.9956e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.890
|
| 25 |
+
train | epoch 0 | Iter: 418/ 29904 | global iter: 210/ 14952 | loss: -0.0508 | ds_loss: -0.0508 | lr: 9.9952e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.890
|
| 26 |
+
train | epoch 0 | Iter: 438/ 29904 | global iter: 220/ 14952 | loss: -0.0819 | ds_loss: -0.0819 | lr: 9.9947e-05 | scale: 1.0000 | micro time: 0.460 | step time: 0.888
|
| 27 |
+
train | epoch 0 | Iter: 458/ 29904 | global iter: 230/ 14952 | loss: -0.0545 | ds_loss: -0.0545 | lr: 9.9942e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.887
|
| 28 |
+
train | epoch 0 | Iter: 478/ 29904 | global iter: 240/ 14952 | loss: -0.0610 | ds_loss: -0.0610 | lr: 9.9937e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.887
|
| 29 |
+
train | epoch 0 | Iter: 498/ 29904 | global iter: 250/ 14952 | loss: -0.0512 | ds_loss: -0.0512 | lr: 9.9932e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.884
|
| 30 |
+
train | epoch 0 | Iter: 518/ 29904 | global iter: 260/ 14952 | loss: -0.0642 | ds_loss: -0.0642 | lr: 9.9926e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.887
|
| 31 |
+
train | epoch 0 | Iter: 538/ 29904 | global iter: 270/ 14952 | loss: -0.0743 | ds_loss: -0.0743 | lr: 9.9920e-05 | scale: 1.0000 | micro time: 0.463 | step time: 0.885
|
| 32 |
+
train | epoch 0 | Iter: 558/ 29904 | global iter: 280/ 14952 | loss: -0.0724 | ds_loss: -0.0724 | lr: 9.9914e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.887
|
| 33 |
+
train | epoch 0 | Iter: 578/ 29904 | global iter: 290/ 14952 | loss: -0.0440 | ds_loss: -0.0440 | lr: 9.9908e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.889
|
| 34 |
+
train | epoch 0 | Iter: 598/ 29904 | global iter: 300/ 14952 | loss: -0.0607 | ds_loss: -0.0607 | lr: 9.9901e-05 | scale: 1.0000 | micro time: 0.462 | step time: 0.888
|
| 35 |
+
train | epoch 0 | Iter: 618/ 29904 | global iter: 310/ 14952 | loss: -0.0571 | ds_loss: -0.0571 | lr: 9.9895e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.890
|
| 36 |
+
train | epoch 0 | Iter: 638/ 29904 | global iter: 320/ 14952 | loss: -0.0569 | ds_loss: -0.0569 | lr: 9.9888e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.888
|
| 37 |
+
train | epoch 0 | Iter: 658/ 29904 | global iter: 330/ 14952 | loss: -0.0508 | ds_loss: -0.0508 | lr: 9.9881e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.889
|
| 38 |
+
train | epoch 0 | Iter: 678/ 29904 | global iter: 340/ 14952 | loss: -0.0679 | ds_loss: -0.0679 | lr: 9.9873e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.888
|
| 39 |
+
train | epoch 0 | Iter: 698/ 29904 | global iter: 350/ 14952 | loss: -0.0481 | ds_loss: -0.0481 | lr: 9.9866e-05 | scale: 1.0000 | micro time: 0.460 | step time: 0.891
|
| 40 |
+
train | epoch 0 | Iter: 718/ 29904 | global iter: 360/ 14952 | loss: -0.0652 | ds_loss: -0.0652 | lr: 9.9858e-05 | scale: 1.0000 | micro time: 0.459 | step time: 0.890
|
| 41 |
+
train | epoch 0 | Iter: 738/ 29904 | global iter: 370/ 14952 | loss: -0.0470 | ds_loss: -0.0470 | lr: 9.9850e-05 | scale: 1.0000 | micro time: 0.462 | step time: 0.893
|
| 42 |
+
train | epoch 0 | Iter: 758/ 29904 | global iter: 380/ 14952 | loss: -0.0438 | ds_loss: -0.0438 | lr: 9.9842e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.889
|
| 43 |
+
train | epoch 0 | Iter: 778/ 29904 | global iter: 390/ 14952 | loss: -0.0725 | ds_loss: -0.0725 | lr: 9.9833e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.890
|
| 44 |
+
train | epoch 0 | Iter: 798/ 29904 | global iter: 400/ 14952 | loss: -0.0466 | ds_loss: -0.0466 | lr: 9.9825e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.886
|
| 45 |
+
train | epoch 0 | Iter: 818/ 29904 | global iter: 410/ 14952 | loss: -0.0601 | ds_loss: -0.0601 | lr: 9.9816e-05 | scale: 1.0000 | micro time: 0.459 | step time: 0.885
|
| 46 |
+
train | epoch 0 | Iter: 838/ 29904 | global iter: 420/ 14952 | loss: -0.0512 | ds_loss: -0.0512 | lr: 9.9807e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.886
|
| 47 |
+
train | epoch 0 | Iter: 858/ 29904 | global iter: 430/ 14952 | loss: -0.0566 | ds_loss: -0.0566 | lr: 9.9797e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.888
|
| 48 |
+
train | epoch 0 | Iter: 878/ 29904 | global iter: 440/ 14952 | loss: -0.0621 | ds_loss: -0.0621 | lr: 9.9788e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.887
|
| 49 |
+
train | epoch 0 | Iter: 898/ 29904 | global iter: 450/ 14952 | loss: -0.0574 | ds_loss: -0.0574 | lr: 9.9778e-05 | scale: 1.0000 | micro time: 0.460 | step time: 0.888
|
| 50 |
+
train | epoch 0 | Iter: 918/ 29904 | global iter: 460/ 14952 | loss: -0.0430 | ds_loss: -0.0430 | lr: 9.9768e-05 | scale: 1.0000 | micro time: 0.460 | step time: 0.891
|
| 51 |
+
train | epoch 0 | Iter: 938/ 29904 | global iter: 470/ 14952 | loss: -0.0646 | ds_loss: -0.0646 | lr: 9.9758e-05 | scale: 1.0000 | micro time: 0.462 | step time: 0.893
|
| 52 |
+
train | epoch 0 | Iter: 958/ 29904 | global iter: 480/ 14952 | loss: -0.0583 | ds_loss: -0.0583 | lr: 9.9747e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.890
|
| 53 |
+
train | epoch 0 | Iter: 978/ 29904 | global iter: 490/ 14952 | loss: -0.0492 | ds_loss: -0.0492 | lr: 9.9737e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.886
|
| 54 |
+
train | epoch 0 | Iter: 998/ 29904 | global iter: 500/ 14952 | loss: -0.0456 | ds_loss: -0.0456 | lr: 9.9726e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.886
|
| 55 |
+
train | epoch 0 | Iter: 1018/ 29904 | global iter: 510/ 14952 | loss: -0.0575 | ds_loss: -0.0575 | lr: 9.9715e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.883
|
| 56 |
+
train | epoch 0 | Iter: 1038/ 29904 | global iter: 520/ 14952 | loss: -0.0596 | ds_loss: -0.0596 | lr: 9.9703e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.885
|
| 57 |
+
train | epoch 0 | Iter: 1058/ 29904 | global iter: 530/ 14952 | loss: -0.0477 | ds_loss: -0.0477 | lr: 9.9692e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.888
|
| 58 |
+
train | epoch 0 | Iter: 1078/ 29904 | global iter: 540/ 14952 | loss: -0.0459 | ds_loss: -0.0459 | lr: 9.9680e-05 | scale: 1.0000 | micro time: 0.476 | step time: 0.884
|
| 59 |
+
train | epoch 0 | Iter: 1098/ 29904 | global iter: 550/ 14952 | loss: -0.0659 | ds_loss: -0.0659 | lr: 9.9668e-05 | scale: 1.0000 | micro time: 0.463 | step time: 0.889
|
| 60 |
+
train | epoch 0 | Iter: 1118/ 29904 | global iter: 560/ 14952 | loss: -0.0538 | ds_loss: -0.0538 | lr: 9.9656e-05 | scale: 1.0000 | micro time: 0.464 | step time: 0.888
|
| 61 |
+
train | epoch 0 | Iter: 1138/ 29904 | global iter: 570/ 14952 | loss: -0.0471 | ds_loss: -0.0471 | lr: 9.9643e-05 | scale: 1.0000 | micro time: 0.460 | step time: 0.890
|
| 62 |
+
train | epoch 0 | Iter: 1158/ 29904 | global iter: 580/ 14952 | loss: -0.0619 | ds_loss: -0.0619 | lr: 9.9631e-05 | scale: 1.0000 | micro time: 0.462 | step time: 0.890
|
| 63 |
+
train | epoch 0 | Iter: 1178/ 29904 | global iter: 590/ 14952 | loss: -0.0542 | ds_loss: -0.0542 | lr: 9.9618e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.889
|
| 64 |
+
train | epoch 0 | Iter: 1198/ 29904 | global iter: 600/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.9605e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.886
|
| 65 |
+
train | epoch 0 | Iter: 1218/ 29904 | global iter: 610/ 14952 | loss: -0.0538 | ds_loss: -0.0538 | lr: 9.9592e-05 | scale: 1.0000 | micro time: 0.462 | step time: 0.888
|
| 66 |
+
train | epoch 0 | Iter: 1238/ 29904 | global iter: 620/ 14952 | loss: -0.0740 | ds_loss: -0.0740 | lr: 9.9578e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.886
|
| 67 |
+
train | epoch 0 | Iter: 1258/ 29904 | global iter: 630/ 14952 | loss: -0.0396 | ds_loss: -0.0396 | lr: 9.9564e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.887
|
| 68 |
+
train | epoch 0 | Iter: 1278/ 29904 | global iter: 640/ 14952 | loss: -0.0657 | ds_loss: -0.0657 | lr: 9.9550e-05 | scale: 1.0000 | micro time: 0.464 | step time: 0.888
|
| 69 |
+
train | epoch 0 | Iter: 1298/ 29904 | global iter: 650/ 14952 | loss: -0.0560 | ds_loss: -0.0560 | lr: 9.9536e-05 | scale: 1.0000 | micro time: 0.468 | step time: 0.900
|
| 70 |
+
train | epoch 0 | Iter: 1318/ 29904 | global iter: 660/ 14952 | loss: -0.0509 | ds_loss: -0.0509 | lr: 9.9522e-05 | scale: 1.0000 | micro time: 0.461 | step time: 0.891
|
| 71 |
+
train | epoch 0 | Iter: 1338/ 29904 | global iter: 670/ 14952 | loss: -0.0476 | ds_loss: -0.0476 | lr: 9.9507e-05 | scale: 1.0000 | micro time: 0.459 | step time: 0.894
|
| 72 |
+
train | epoch 0 | Iter: 1358/ 29904 | global iter: 680/ 14952 | loss: -0.0706 | ds_loss: -0.0706 | lr: 9.9493e-05 | scale: 1.0000 | micro time: 0.463 | step time: 0.895
|
| 73 |
+
train | epoch 0 | Iter: 1378/ 29904 | global iter: 690/ 14952 | loss: -0.0615 | ds_loss: -0.0615 | lr: 9.9477e-05 | scale: 1.0000 | micro time: 0.460 | step time: 0.890
|
| 74 |
+
train | epoch 0 | Iter: 1398/ 29904 | global iter: 700/ 14952 | loss: -0.0546 | ds_loss: -0.0546 | lr: 9.9462e-05 | scale: 1.0000 | micro time: 0.463 | step time: 0.892
|
| 75 |
+
train | epoch 0 | Iter: 1418/ 29904 | global iter: 710/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.9447e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.884
|
| 76 |
+
train | epoch 0 | Iter: 1438/ 29904 | global iter: 720/ 14952 | loss: -0.0502 | ds_loss: -0.0502 | lr: 9.9431e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.882
|
| 77 |
+
train | epoch 0 | Iter: 1458/ 29904 | global iter: 730/ 14952 | loss: -0.0760 | ds_loss: -0.0760 | lr: 9.9415e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.884
|
| 78 |
+
train | epoch 0 | Iter: 1478/ 29904 | global iter: 740/ 14952 | loss: -0.0612 | ds_loss: -0.0612 | lr: 9.9399e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.887
|
| 79 |
+
train | epoch 0 | Iter: 1498/ 29904 | global iter: 750/ 14952 | loss: -0.0541 | ds_loss: -0.0541 | lr: 9.9383e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.892
|
| 80 |
+
train | epoch 0 | Iter: 1518/ 29904 | global iter: 760/ 14952 | loss: -0.0560 | ds_loss: -0.0560 | lr: 9.9366e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.886
|
| 81 |
+
train | epoch 0 | Iter: 1538/ 29904 | global iter: 770/ 14952 | loss: -0.0575 | ds_loss: -0.0575 | lr: 9.9349e-05 | scale: 1.0000 | micro time: 0.466 | step time: 0.893
|
| 82 |
+
train | epoch 0 | Iter: 1558/ 29904 | global iter: 780/ 14952 | loss: -0.0710 | ds_loss: -0.0710 | lr: 9.9332e-05 | scale: 1.0000 | micro time: 0.465 | step time: 0.895
|
| 83 |
+
train | epoch 0 | Iter: 1578/ 29904 | global iter: 790/ 14952 | loss: -0.0541 | ds_loss: -0.0541 | lr: 9.9315e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.892
|
| 84 |
+
train | epoch 0 | Iter: 1598/ 29904 | global iter: 800/ 14952 | loss: -0.0679 | ds_loss: -0.0679 | lr: 9.9298e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.884
|
| 85 |
+
train | epoch 0 | Iter: 1618/ 29904 | global iter: 810/ 14952 | loss: -0.0500 | ds_loss: -0.0500 | lr: 9.9280e-05 | scale: 1.0000 | micro time: 0.459 | step time: 0.889
|
| 86 |
+
train | epoch 0 | Iter: 1638/ 29904 | global iter: 820/ 14952 | loss: -0.0550 | ds_loss: -0.0550 | lr: 9.9262e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.924
|
| 87 |
+
train | epoch 0 | Iter: 1658/ 29904 | global iter: 830/ 14952 | loss: -0.0624 | ds_loss: -0.0624 | lr: 9.9244e-05 | scale: 1.0000 | micro time: 0.461 | step time: 0.927
|
| 88 |
+
train | epoch 0 | Iter: 1678/ 29904 | global iter: 840/ 14952 | loss: -0.0722 | ds_loss: -0.0722 | lr: 9.9226e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.885
|
| 89 |
+
train | epoch 0 | Iter: 1698/ 29904 | global iter: 850/ 14952 | loss: -0.0666 | ds_loss: -0.0666 | lr: 9.9207e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.882
|
| 90 |
+
train | epoch 0 | Iter: 1718/ 29904 | global iter: 860/ 14952 | loss: -0.0451 | ds_loss: -0.0451 | lr: 9.9189e-05 | scale: 1.0000 | micro time: 0.469 | step time: 0.889
|
| 91 |
+
train | epoch 0 | Iter: 1738/ 29904 | global iter: 870/ 14952 | loss: -0.0571 | ds_loss: -0.0571 | lr: 9.9170e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.888
|
| 92 |
+
train | epoch 0 | Iter: 1758/ 29904 | global iter: 880/ 14952 | loss: -0.0484 | ds_loss: -0.0484 | lr: 9.9151e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.882
|
| 93 |
+
train | epoch 0 | Iter: 1778/ 29904 | global iter: 890/ 14952 | loss: -0.0524 | ds_loss: -0.0524 | lr: 9.9131e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.880
|
| 94 |
+
train | epoch 0 | Iter: 1798/ 29904 | global iter: 900/ 14952 | loss: -0.0598 | ds_loss: -0.0598 | lr: 9.9112e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.880
|
| 95 |
+
train | epoch 0 | Iter: 1818/ 29904 | global iter: 910/ 14952 | loss: -0.0606 | ds_loss: -0.0606 | lr: 9.9092e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.880
|
| 96 |
+
train | epoch 0 | Iter: 1838/ 29904 | global iter: 920/ 14952 | loss: -0.0505 | ds_loss: -0.0505 | lr: 9.9072e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.882
|
| 97 |
+
train | epoch 0 | Iter: 1858/ 29904 | global iter: 930/ 14952 | loss: -0.0525 | ds_loss: -0.0525 | lr: 9.9051e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.883
|
| 98 |
+
train | epoch 0 | Iter: 1878/ 29904 | global iter: 940/ 14952 | loss: -0.0719 | ds_loss: -0.0719 | lr: 9.9031e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.885
|
| 99 |
+
train | epoch 0 | Iter: 1898/ 29904 | global iter: 950/ 14952 | loss: -0.0622 | ds_loss: -0.0622 | lr: 9.9010e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.890
|
| 100 |
+
train | epoch 0 | Iter: 1918/ 29904 | global iter: 960/ 14952 | loss: -0.0499 | ds_loss: -0.0499 | lr: 9.8989e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.891
|
| 101 |
+
train | epoch 0 | Iter: 1938/ 29904 | global iter: 970/ 14952 | loss: -0.0839 | ds_loss: -0.0839 | lr: 9.8968e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.885
|
| 102 |
+
train | epoch 0 | Iter: 1958/ 29904 | global iter: 980/ 14952 | loss: -0.0293 | ds_loss: -0.0293 | lr: 9.8947e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.886
|
| 103 |
+
train | epoch 0 | Iter: 1978/ 29904 | global iter: 990/ 14952 | loss: -0.0557 | ds_loss: -0.0557 | lr: 9.8925e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.885
|
| 104 |
+
train | epoch 0 | Iter: 1998/ 29904 | global iter: 1000/ 14952 | loss: -0.0442 | ds_loss: -0.0442 | lr: 9.8904e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.883
|
| 105 |
+
train | epoch 0 | Iter: 2018/ 29904 | global iter: 1010/ 14952 | loss: -0.0851 | ds_loss: -0.0851 | lr: 9.8882e-05 | scale: 1.0000 | micro time: 0.459 | step time: 0.883
|
| 106 |
+
train | epoch 0 | Iter: 2038/ 29904 | global iter: 1020/ 14952 | loss: -0.0395 | ds_loss: -0.0395 | lr: 9.8859e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.884
|
| 107 |
+
train | epoch 0 | Iter: 2058/ 29904 | global iter: 1030/ 14952 | loss: -0.0579 | ds_loss: -0.0579 | lr: 9.8837e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.879
|
| 108 |
+
train | epoch 0 | Iter: 2078/ 29904 | global iter: 1040/ 14952 | loss: -0.0589 | ds_loss: -0.0589 | lr: 9.8814e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.881
|
| 109 |
+
train | epoch 0 | Iter: 2098/ 29904 | global iter: 1050/ 14952 | loss: -0.0519 | ds_loss: -0.0519 | lr: 9.8792e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.885
|
| 110 |
+
train | epoch 0 | Iter: 2118/ 29904 | global iter: 1060/ 14952 | loss: -0.0534 | ds_loss: -0.0534 | lr: 9.8769e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.880
|
| 111 |
+
train | epoch 0 | Iter: 2138/ 29904 | global iter: 1070/ 14952 | loss: -0.0557 | ds_loss: -0.0557 | lr: 9.8745e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.882
|
| 112 |
+
train | epoch 0 | Iter: 2158/ 29904 | global iter: 1080/ 14952 | loss: -0.0527 | ds_loss: -0.0527 | lr: 9.8722e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.883
|
| 113 |
+
train | epoch 0 | Iter: 2178/ 29904 | global iter: 1090/ 14952 | loss: -0.0354 | ds_loss: -0.0354 | lr: 9.8698e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.880
|
| 114 |
+
train | epoch 0 | Iter: 2198/ 29904 | global iter: 1100/ 14952 | loss: -0.0598 | ds_loss: -0.0598 | lr: 9.8674e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.881
|
| 115 |
+
train | epoch 0 | Iter: 2218/ 29904 | global iter: 1110/ 14952 | loss: -0.0343 | ds_loss: -0.0343 | lr: 9.8650e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.880
|
| 116 |
+
train | epoch 0 | Iter: 2238/ 29904 | global iter: 1120/ 14952 | loss: -0.0587 | ds_loss: -0.0587 | lr: 9.8626e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.883
|
| 117 |
+
train | epoch 0 | Iter: 2258/ 29904 | global iter: 1130/ 14952 | loss: -0.0361 | ds_loss: -0.0361 | lr: 9.8601e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.882
|
| 118 |
+
train | epoch 0 | Iter: 2278/ 29904 | global iter: 1140/ 14952 | loss: -0.0561 | ds_loss: -0.0561 | lr: 9.8576e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.884
|
| 119 |
+
train | epoch 0 | Iter: 2298/ 29904 | global iter: 1150/ 14952 | loss: -0.0560 | ds_loss: -0.0560 | lr: 9.8551e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.885
|
| 120 |
+
train | epoch 0 | Iter: 2318/ 29904 | global iter: 1160/ 14952 | loss: -0.0379 | ds_loss: -0.0379 | lr: 9.8526e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.888
|
| 121 |
+
train | epoch 0 | Iter: 2338/ 29904 | global iter: 1170/ 14952 | loss: -0.0728 | ds_loss: -0.0728 | lr: 9.8501e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.888
|
| 122 |
+
train | epoch 0 | Iter: 2358/ 29904 | global iter: 1180/ 14952 | loss: -0.0491 | ds_loss: -0.0491 | lr: 9.8475e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.894
|
| 123 |
+
train | epoch 0 | Iter: 2378/ 29904 | global iter: 1190/ 14952 | loss: -0.0578 | ds_loss: -0.0578 | lr: 9.8449e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.883
|
| 124 |
+
train | epoch 0 | Iter: 2398/ 29904 | global iter: 1200/ 14952 | loss: -0.0521 | ds_loss: -0.0521 | lr: 9.8423e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.882
|
| 125 |
+
train | epoch 0 | Iter: 2418/ 29904 | global iter: 1210/ 14952 | loss: -0.0544 | ds_loss: -0.0544 | lr: 9.8397e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.881
|
| 126 |
+
train | epoch 0 | Iter: 2438/ 29904 | global iter: 1220/ 14952 | loss: -0.0480 | ds_loss: -0.0480 | lr: 9.8371e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.880
|
| 127 |
+
train | epoch 0 | Iter: 2458/ 29904 | global iter: 1230/ 14952 | loss: -0.0493 | ds_loss: -0.0493 | lr: 9.8344e-05 | scale: 1.0000 | micro time: 0.459 | step time: 0.883
|
| 128 |
+
train | epoch 0 | Iter: 2478/ 29904 | global iter: 1240/ 14952 | loss: -0.0551 | ds_loss: -0.0551 | lr: 9.8317e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.887
|
| 129 |
+
train | epoch 0 | Iter: 2498/ 29904 | global iter: 1250/ 14952 | loss: -0.0418 | ds_loss: -0.0418 | lr: 9.8290e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.883
|
| 130 |
+
train | epoch 0 | Iter: 2518/ 29904 | global iter: 1260/ 14952 | loss: -0.0651 | ds_loss: -0.0651 | lr: 9.8263e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.886
|
| 131 |
+
train | epoch 0 | Iter: 2538/ 29904 | global iter: 1270/ 14952 | loss: -0.0486 | ds_loss: -0.0486 | lr: 9.8235e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.885
|
| 132 |
+
train | epoch 0 | Iter: 2558/ 29904 | global iter: 1280/ 14952 | loss: -0.0420 | ds_loss: -0.0420 | lr: 9.8207e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.885
|
| 133 |
+
train | epoch 0 | Iter: 2578/ 29904 | global iter: 1290/ 14952 | loss: -0.0428 | ds_loss: -0.0428 | lr: 9.8179e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.883
|
| 134 |
+
train | epoch 0 | Iter: 2598/ 29904 | global iter: 1300/ 14952 | loss: -0.0634 | ds_loss: -0.0634 | lr: 9.8151e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.885
|
| 135 |
+
train | epoch 0 | Iter: 2618/ 29904 | global iter: 1310/ 14952 | loss: -0.0669 | ds_loss: -0.0669 | lr: 9.8123e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.884
|
| 136 |
+
train | epoch 0 | Iter: 2638/ 29904 | global iter: 1320/ 14952 | loss: -0.0418 | ds_loss: -0.0418 | lr: 9.8094e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.914
|
| 137 |
+
train | epoch 0 | Iter: 2658/ 29904 | global iter: 1330/ 14952 | loss: -0.0651 | ds_loss: -0.0651 | lr: 9.8065e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.880
|
| 138 |
+
train | epoch 0 | Iter: 2678/ 29904 | global iter: 1340/ 14952 | loss: -0.0729 | ds_loss: -0.0729 | lr: 9.8036e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.881
|
| 139 |
+
train | epoch 0 | Iter: 2698/ 29904 | global iter: 1350/ 14952 | loss: -0.0489 | ds_loss: -0.0489 | lr: 9.8007e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.883
|
| 140 |
+
train | epoch 0 | Iter: 2718/ 29904 | global iter: 1360/ 14952 | loss: -0.0655 | ds_loss: -0.0655 | lr: 9.7977e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.884
|
| 141 |
+
train | epoch 0 | Iter: 2738/ 29904 | global iter: 1370/ 14952 | loss: -0.0538 | ds_loss: -0.0538 | lr: 9.7948e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.882
|
| 142 |
+
train | epoch 0 | Iter: 2758/ 29904 | global iter: 1380/ 14952 | loss: -0.0538 | ds_loss: -0.0538 | lr: 9.7918e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.883
|
| 143 |
+
train | epoch 0 | Iter: 2778/ 29904 | global iter: 1390/ 14952 | loss: -0.0534 | ds_loss: -0.0534 | lr: 9.7888e-05 | scale: 1.0000 | micro time: 0.447 | step time: 0.880
|
| 144 |
+
train | epoch 0 | Iter: 2798/ 29904 | global iter: 1400/ 14952 | loss: -0.0485 | ds_loss: -0.0485 | lr: 9.7858e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.874
|
| 145 |
+
train | epoch 0 | Iter: 2818/ 29904 | global iter: 1410/ 14952 | loss: -0.0466 | ds_loss: -0.0466 | lr: 9.7827e-05 | scale: 1.0000 | micro time: 0.463 | step time: 0.876
|
| 146 |
+
train | epoch 0 | Iter: 2838/ 29904 | global iter: 1420/ 14952 | loss: -0.0525 | ds_loss: -0.0525 | lr: 9.7796e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.883
|
| 147 |
+
train | epoch 0 | Iter: 2858/ 29904 | global iter: 1430/ 14952 | loss: -0.0612 | ds_loss: -0.0612 | lr: 9.7765e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.884
|
| 148 |
+
train | epoch 0 | Iter: 2878/ 29904 | global iter: 1440/ 14952 | loss: -0.0584 | ds_loss: -0.0584 | lr: 9.7734e-05 | scale: 1.0000 | micro time: 0.464 | step time: 0.892
|
| 149 |
+
train | epoch 0 | Iter: 2898/ 29904 | global iter: 1450/ 14952 | loss: -0.0365 | ds_loss: -0.0365 | lr: 9.7703e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.891
|
| 150 |
+
train | epoch 0 | Iter: 2918/ 29904 | global iter: 1460/ 14952 | loss: -0.0554 | ds_loss: -0.0554 | lr: 9.7671e-05 | scale: 1.0000 | micro time: 0.463 | step time: 0.892
|
| 151 |
+
train | epoch 0 | Iter: 2938/ 29904 | global iter: 1470/ 14952 | loss: -0.0541 | ds_loss: -0.0541 | lr: 9.7640e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.891
|
| 152 |
+
train | epoch 0 | Iter: 2958/ 29904 | global iter: 1480/ 14952 | loss: -0.0660 | ds_loss: -0.0660 | lr: 9.7608e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.887
|
| 153 |
+
train | epoch 0 | Iter: 2978/ 29904 | global iter: 1490/ 14952 | loss: -0.0480 | ds_loss: -0.0480 | lr: 9.7575e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.889
|
| 154 |
+
train | epoch 0 | Iter: 2998/ 29904 | global iter: 1500/ 14952 | loss: -0.0483 | ds_loss: -0.0483 | lr: 9.7543e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.888
|
| 155 |
+
train | epoch 0 | Iter: 3018/ 29904 | global iter: 1510/ 14952 | loss: -0.0529 | ds_loss: -0.0529 | lr: 9.7510e-05 | scale: 1.0000 | micro time: 0.458 | step time: 0.886
|
| 156 |
+
train | epoch 0 | Iter: 3038/ 29904 | global iter: 1520/ 14952 | loss: -0.0442 | ds_loss: -0.0442 | lr: 9.7477e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.890
|
| 157 |
+
train | epoch 0 | Iter: 3058/ 29904 | global iter: 1530/ 14952 | loss: -0.0594 | ds_loss: -0.0594 | lr: 9.7444e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.883
|
| 158 |
+
train | epoch 0 | Iter: 3078/ 29904 | global iter: 1540/ 14952 | loss: -0.0543 | ds_loss: -0.0543 | lr: 9.7411e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.883
|
| 159 |
+
train | epoch 0 | Iter: 3098/ 29904 | global iter: 1550/ 14952 | loss: -0.0598 | ds_loss: -0.0598 | lr: 9.7378e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.882
|
| 160 |
+
train | epoch 0 | Iter: 3118/ 29904 | global iter: 1560/ 14952 | loss: -0.0378 | ds_loss: -0.0378 | lr: 9.7344e-05 | scale: 1.0000 | micro time: 0.466 | step time: 0.888
|
| 161 |
+
train | epoch 0 | Iter: 3138/ 29904 | global iter: 1570/ 14952 | loss: -0.0614 | ds_loss: -0.0614 | lr: 9.7310e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.886
|
| 162 |
+
train | epoch 0 | Iter: 3158/ 29904 | global iter: 1580/ 14952 | loss: -0.0720 | ds_loss: -0.0720 | lr: 9.7276e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.879
|
| 163 |
+
train | epoch 0 | Iter: 3178/ 29904 | global iter: 1590/ 14952 | loss: -0.0702 | ds_loss: -0.0702 | lr: 9.7242e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.883
|
| 164 |
+
train | epoch 0 | Iter: 3198/ 29904 | global iter: 1600/ 14952 | loss: -0.0505 | ds_loss: -0.0505 | lr: 9.7207e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.885
|
| 165 |
+
train | epoch 0 | Iter: 3218/ 29904 | global iter: 1610/ 14952 | loss: -0.0585 | ds_loss: -0.0585 | lr: 9.7173e-05 | scale: 1.0000 | micro time: 0.445 | step time: 0.886
|
| 166 |
+
train | epoch 0 | Iter: 3238/ 29904 | global iter: 1620/ 14952 | loss: -0.0669 | ds_loss: -0.0669 | lr: 9.7138e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.876
|
| 167 |
+
train | epoch 0 | Iter: 3258/ 29904 | global iter: 1630/ 14952 | loss: -0.0503 | ds_loss: -0.0503 | lr: 9.7103e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.878
|
| 168 |
+
train | epoch 0 | Iter: 3278/ 29904 | global iter: 1640/ 14952 | loss: -0.0632 | ds_loss: -0.0632 | lr: 9.7067e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.884
|
| 169 |
+
train | epoch 0 | Iter: 3298/ 29904 | global iter: 1650/ 14952 | loss: -0.0476 | ds_loss: -0.0476 | lr: 9.7032e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.878
|
| 170 |
+
train | epoch 0 | Iter: 3318/ 29904 | global iter: 1660/ 14952 | loss: -0.0382 | ds_loss: -0.0382 | lr: 9.6996e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.881
|
| 171 |
+
train | epoch 0 | Iter: 3338/ 29904 | global iter: 1670/ 14952 | loss: -0.0440 | ds_loss: -0.0440 | lr: 9.6960e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.878
|
| 172 |
+
train | epoch 0 | Iter: 3358/ 29904 | global iter: 1680/ 14952 | loss: -0.0594 | ds_loss: -0.0594 | lr: 9.6924e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.881
|
| 173 |
+
train | epoch 0 | Iter: 3378/ 29904 | global iter: 1690/ 14952 | loss: -0.0522 | ds_loss: -0.0522 | lr: 9.6888e-05 | scale: 1.0000 | micro time: 0.447 | step time: 0.878
|
| 174 |
+
train | epoch 0 | Iter: 3398/ 29904 | global iter: 1700/ 14952 | loss: -0.0580 | ds_loss: -0.0580 | lr: 9.6851e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.876
|
| 175 |
+
train | epoch 0 | Iter: 3418/ 29904 | global iter: 1710/ 14952 | loss: -0.0551 | ds_loss: -0.0551 | lr: 9.6814e-05 | scale: 1.0000 | micro time: 0.447 | step time: 0.878
|
| 176 |
+
train | epoch 0 | Iter: 3438/ 29904 | global iter: 1720/ 14952 | loss: -0.0392 | ds_loss: -0.0392 | lr: 9.6777e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.878
|
| 177 |
+
train | epoch 0 | Iter: 3458/ 29904 | global iter: 1730/ 14952 | loss: -0.0474 | ds_loss: -0.0474 | lr: 9.6740e-05 | scale: 1.0000 | micro time: 0.447 | step time: 0.879
|
| 178 |
+
train | epoch 0 | Iter: 3478/ 29904 | global iter: 1740/ 14952 | loss: -0.0549 | ds_loss: -0.0549 | lr: 9.6703e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.879
|
| 179 |
+
train | epoch 0 | Iter: 3498/ 29904 | global iter: 1750/ 14952 | loss: -0.0474 | ds_loss: -0.0474 | lr: 9.6665e-05 | scale: 1.0000 | micro time: 0.460 | step time: 0.879
|
| 180 |
+
train | epoch 0 | Iter: 3518/ 29904 | global iter: 1760/ 14952 | loss: -0.0597 | ds_loss: -0.0597 | lr: 9.6627e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.886
|
| 181 |
+
train | epoch 0 | Iter: 3538/ 29904 | global iter: 1770/ 14952 | loss: -0.0563 | ds_loss: -0.0563 | lr: 9.6589e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.879
|
| 182 |
+
train | epoch 0 | Iter: 3558/ 29904 | global iter: 1780/ 14952 | loss: -0.0420 | ds_loss: -0.0420 | lr: 9.6551e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.879
|
| 183 |
+
train | epoch 0 | Iter: 3578/ 29904 | global iter: 1790/ 14952 | loss: -0.0351 | ds_loss: -0.0351 | lr: 9.6513e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.883
|
| 184 |
+
train | epoch 0 | Iter: 3598/ 29904 | global iter: 1800/ 14952 | loss: -0.0437 | ds_loss: -0.0437 | lr: 9.6474e-05 | scale: 1.0000 | micro time: 0.445 | step time: 0.876
|
| 185 |
+
train | epoch 0 | Iter: 3618/ 29904 | global iter: 1810/ 14952 | loss: -0.0732 | ds_loss: -0.0732 | lr: 9.6435e-05 | scale: 1.0000 | micro time: 0.445 | step time: 0.874
|
| 186 |
+
train | epoch 0 | Iter: 3638/ 29904 | global iter: 1820/ 14952 | loss: -0.0420 | ds_loss: -0.0420 | lr: 9.6396e-05 | scale: 1.0000 | micro time: 0.447 | step time: 0.907
|
| 187 |
+
train | epoch 0 | Iter: 3658/ 29904 | global iter: 1830/ 14952 | loss: -0.0709 | ds_loss: -0.0709 | lr: 9.6357e-05 | scale: 1.0000 | micro time: 0.461 | step time: 0.882
|
| 188 |
+
train | epoch 0 | Iter: 3678/ 29904 | global iter: 1840/ 14952 | loss: -0.0506 | ds_loss: -0.0506 | lr: 9.6317e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.882
|
| 189 |
+
train | epoch 0 | Iter: 3698/ 29904 | global iter: 1850/ 14952 | loss: -0.0676 | ds_loss: -0.0676 | lr: 9.6278e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.880
|
| 190 |
+
train | epoch 0 | Iter: 3718/ 29904 | global iter: 1860/ 14952 | loss: -0.0348 | ds_loss: -0.0348 | lr: 9.6238e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.881
|
| 191 |
+
train | epoch 0 | Iter: 3738/ 29904 | global iter: 1870/ 14952 | loss: -0.0747 | ds_loss: -0.0747 | lr: 9.6198e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.882
|
| 192 |
+
train | epoch 0 | Iter: 3758/ 29904 | global iter: 1880/ 14952 | loss: -0.0535 | ds_loss: -0.0535 | lr: 9.6158e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.884
|
| 193 |
+
train | epoch 0 | Iter: 3778/ 29904 | global iter: 1890/ 14952 | loss: -0.0345 | ds_loss: -0.0345 | lr: 9.6117e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.881
|
| 194 |
+
train | epoch 0 | Iter: 3798/ 29904 | global iter: 1900/ 14952 | loss: -0.0558 | ds_loss: -0.0558 | lr: 9.6076e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.882
|
| 195 |
+
train | epoch 0 | Iter: 3818/ 29904 | global iter: 1910/ 14952 | loss: -0.0464 | ds_loss: -0.0464 | lr: 9.6036e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.886
|
| 196 |
+
train | epoch 0 | Iter: 3838/ 29904 | global iter: 1920/ 14952 | loss: -0.0467 | ds_loss: -0.0467 | lr: 9.5994e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.885
|
| 197 |
+
train | epoch 0 | Iter: 3858/ 29904 | global iter: 1930/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.5953e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.886
|
| 198 |
+
train | epoch 0 | Iter: 3878/ 29904 | global iter: 1940/ 14952 | loss: -0.0551 | ds_loss: -0.0551 | lr: 9.5912e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.882
|
| 199 |
+
train | epoch 0 | Iter: 3898/ 29904 | global iter: 1950/ 14952 | loss: -0.0415 | ds_loss: -0.0415 | lr: 9.5870e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.882
|
| 200 |
+
train | epoch 0 | Iter: 3918/ 29904 | global iter: 1960/ 14952 | loss: -0.0493 | ds_loss: -0.0493 | lr: 9.5828e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.881
|
| 201 |
+
train | epoch 0 | Iter: 3938/ 29904 | global iter: 1970/ 14952 | loss: -0.0525 | ds_loss: -0.0525 | lr: 9.5786e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.879
|
| 202 |
+
train | epoch 0 | Iter: 3958/ 29904 | global iter: 1980/ 14952 | loss: -0.0581 | ds_loss: -0.0581 | lr: 9.5744e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.882
|
| 203 |
+
train | epoch 0 | Iter: 3978/ 29904 | global iter: 1990/ 14952 | loss: -0.0521 | ds_loss: -0.0521 | lr: 9.5701e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.878
|
| 204 |
+
train | epoch 0 | Iter: 3998/ 29904 | global iter: 2000/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.5659e-05 | scale: 1.0000 | micro time: 0.447 | step time: 0.879
|
| 205 |
+
train | epoch 0 | Iter: 4018/ 29904 | global iter: 2010/ 14952 | loss: -0.0667 | ds_loss: -0.0667 | lr: 9.5616e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.878
|
| 206 |
+
train | epoch 0 | Iter: 4038/ 29904 | global iter: 2020/ 14952 | loss: -0.0542 | ds_loss: -0.0542 | lr: 9.5573e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.877
|
| 207 |
+
train | epoch 0 | Iter: 4058/ 29904 | global iter: 2030/ 14952 | loss: -0.0474 | ds_loss: -0.0474 | lr: 9.5529e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.879
|
| 208 |
+
train | epoch 0 | Iter: 4078/ 29904 | global iter: 2040/ 14952 | loss: -0.0355 | ds_loss: -0.0355 | lr: 9.5486e-05 | scale: 1.0000 | micro time: 0.446 | step time: 0.879
|
| 209 |
+
train | epoch 0 | Iter: 4098/ 29904 | global iter: 2050/ 14952 | loss: -0.0495 | ds_loss: -0.0495 | lr: 9.5442e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.877
|
| 210 |
+
train | epoch 0 | Iter: 4118/ 29904 | global iter: 2060/ 14952 | loss: -0.0506 | ds_loss: -0.0506 | lr: 9.5398e-05 | scale: 1.0000 | micro time: 0.472 | step time: 0.878
|
| 211 |
+
train | epoch 0 | Iter: 4138/ 29904 | global iter: 2070/ 14952 | loss: -0.0441 | ds_loss: -0.0441 | lr: 9.5354e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.889
|
| 212 |
+
train | epoch 0 | Iter: 4158/ 29904 | global iter: 2080/ 14952 | loss: -0.0656 | ds_loss: -0.0656 | lr: 9.5310e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.884
|
| 213 |
+
train | epoch 0 | Iter: 4178/ 29904 | global iter: 2090/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.5265e-05 | scale: 1.0000 | micro time: 0.464 | step time: 0.880
|
| 214 |
+
train | epoch 0 | Iter: 4198/ 29904 | global iter: 2100/ 14952 | loss: -0.0569 | ds_loss: -0.0569 | lr: 9.5221e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.884
|
| 215 |
+
train | epoch 0 | Iter: 4218/ 29904 | global iter: 2110/ 14952 | loss: -0.0548 | ds_loss: -0.0548 | lr: 9.5176e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.880
|
| 216 |
+
train | epoch 0 | Iter: 4238/ 29904 | global iter: 2120/ 14952 | loss: -0.0414 | ds_loss: -0.0414 | lr: 9.5131e-05 | scale: 1.0000 | micro time: 0.448 | step time: 0.883
|
| 217 |
+
train | epoch 0 | Iter: 4258/ 29904 | global iter: 2130/ 14952 | loss: -0.0467 | ds_loss: -0.0467 | lr: 9.5085e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.880
|
| 218 |
+
train | epoch 0 | Iter: 4278/ 29904 | global iter: 2140/ 14952 | loss: -0.0833 | ds_loss: -0.0833 | lr: 9.5040e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.879
|
| 219 |
+
train | epoch 0 | Iter: 4298/ 29904 | global iter: 2150/ 14952 | loss: -0.0411 | ds_loss: -0.0411 | lr: 9.4994e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.882
|
| 220 |
+
train | epoch 0 | Iter: 4318/ 29904 | global iter: 2160/ 14952 | loss: -0.0652 | ds_loss: -0.0652 | lr: 9.4948e-05 | scale: 1.0000 | micro time: 0.449 | step time: 0.879
|
| 221 |
+
train | epoch 0 | Iter: 4338/ 29904 | global iter: 2170/ 14952 | loss: -0.0644 | ds_loss: -0.0644 | lr: 9.4902e-05 | scale: 1.0000 | micro time: 0.452 | step time: 0.884
|
| 222 |
+
train | epoch 0 | Iter: 4358/ 29904 | global iter: 2180/ 14952 | loss: -0.0412 | ds_loss: -0.0412 | lr: 9.4856e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.882
|
| 223 |
+
train | epoch 0 | Iter: 4378/ 29904 | global iter: 2190/ 14952 | loss: -0.0545 | ds_loss: -0.0545 | lr: 9.4809e-05 | scale: 1.0000 | micro time: 0.451 | step time: 0.884
|
| 224 |
+
train | epoch 0 | Iter: 4398/ 29904 | global iter: 2200/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.4763e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.886
|
| 225 |
+
train | epoch 0 | Iter: 4418/ 29904 | global iter: 2210/ 14952 | loss: -0.0536 | ds_loss: -0.0536 | lr: 9.4716e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.884
|
| 226 |
+
train | epoch 0 | Iter: 4438/ 29904 | global iter: 2220/ 14952 | loss: -0.0380 | ds_loss: -0.0380 | lr: 9.4669e-05 | scale: 1.0000 | micro time: 0.450 | step time: 0.884
|
| 227 |
+
train | epoch 0 | Iter: 4458/ 29904 | global iter: 2230/ 14952 | loss: -0.0560 | ds_loss: -0.0560 | lr: 9.4621e-05 | scale: 1.0000 | micro time: 0.457 | step time: 0.883
|
| 228 |
+
train | epoch 0 | Iter: 4478/ 29904 | global iter: 2240/ 14952 | loss: -0.0344 | ds_loss: -0.0344 | lr: 9.4574e-05 | scale: 1.0000 | micro time: 0.453 | step time: 0.881
|
| 229 |
+
train | epoch 0 | Iter: 4498/ 29904 | global iter: 2250/ 14952 | loss: -0.0490 | ds_loss: -0.0490 | lr: 9.4526e-05 | scale: 1.0000 | micro time: 0.456 | step time: 0.883
|
| 230 |
+
train | epoch 0 | Iter: 4518/ 29904 | global iter: 2260/ 14952 | loss: -0.0518 | ds_loss: -0.0518 | lr: 9.4478e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.884
|
| 231 |
+
train | epoch 0 | Iter: 4538/ 29904 | global iter: 2270/ 14952 | loss: -0.0617 | ds_loss: -0.0617 | lr: 9.4430e-05 | scale: 1.0000 | micro time: 0.454 | step time: 0.884
|
| 232 |
+
train | epoch 0 | Iter: 4558/ 29904 | global iter: 2280/ 14952 | loss: -0.0320 | ds_loss: -0.0320 | lr: 9.4382e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.884
|
| 233 |
+
train | epoch 0 | Iter: 4578/ 29904 | global iter: 2290/ 14952 | loss: -0.0521 | ds_loss: -0.0521 | lr: 9.4334e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.883
|
| 234 |
+
train | epoch 0 | Iter: 4598/ 29904 | global iter: 2300/ 14952 | loss: -0.0467 | ds_loss: -0.0467 | lr: 9.4285e-05 | scale: 1.0000 | micro time: 0.455 | step time: 0.883
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/README.md
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: Qwen/Qwen2.5-1.5B-Instruct
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:Qwen/Qwen2.5-1.5B-Instruct
|
| 7 |
+
- lora
|
| 8 |
+
- transformers
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Model Card for Model ID
|
| 12 |
+
|
| 13 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
## Model Details
|
| 18 |
+
|
| 19 |
+
### Model Description
|
| 20 |
+
|
| 21 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
- **Developed by:** [More Information Needed]
|
| 26 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 27 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 28 |
+
- **Model type:** [More Information Needed]
|
| 29 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 30 |
+
- **License:** [More Information Needed]
|
| 31 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 32 |
+
|
| 33 |
+
### Model Sources [optional]
|
| 34 |
+
|
| 35 |
+
<!-- Provide the basic links for the model. -->
|
| 36 |
+
|
| 37 |
+
- **Repository:** [More Information Needed]
|
| 38 |
+
- **Paper [optional]:** [More Information Needed]
|
| 39 |
+
- **Demo [optional]:** [More Information Needed]
|
| 40 |
+
|
| 41 |
+
## Uses
|
| 42 |
+
|
| 43 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 44 |
+
|
| 45 |
+
### Direct Use
|
| 46 |
+
|
| 47 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 48 |
+
|
| 49 |
+
[More Information Needed]
|
| 50 |
+
|
| 51 |
+
### Downstream Use [optional]
|
| 52 |
+
|
| 53 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 54 |
+
|
| 55 |
+
[More Information Needed]
|
| 56 |
+
|
| 57 |
+
### Out-of-Scope Use
|
| 58 |
+
|
| 59 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 60 |
+
|
| 61 |
+
[More Information Needed]
|
| 62 |
+
|
| 63 |
+
## Bias, Risks, and Limitations
|
| 64 |
+
|
| 65 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 66 |
+
|
| 67 |
+
[More Information Needed]
|
| 68 |
+
|
| 69 |
+
### Recommendations
|
| 70 |
+
|
| 71 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 72 |
+
|
| 73 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 74 |
+
|
| 75 |
+
## How to Get Started with the Model
|
| 76 |
+
|
| 77 |
+
Use the code below to get started with the model.
|
| 78 |
+
|
| 79 |
+
[More Information Needed]
|
| 80 |
+
|
| 81 |
+
## Training Details
|
| 82 |
+
|
| 83 |
+
### Training Data
|
| 84 |
+
|
| 85 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 86 |
+
|
| 87 |
+
[More Information Needed]
|
| 88 |
+
|
| 89 |
+
### Training Procedure
|
| 90 |
+
|
| 91 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 92 |
+
|
| 93 |
+
#### Preprocessing [optional]
|
| 94 |
+
|
| 95 |
+
[More Information Needed]
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
#### Training Hyperparameters
|
| 99 |
+
|
| 100 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 101 |
+
|
| 102 |
+
#### Speeds, Sizes, Times [optional]
|
| 103 |
+
|
| 104 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 105 |
+
|
| 106 |
+
[More Information Needed]
|
| 107 |
+
|
| 108 |
+
## Evaluation
|
| 109 |
+
|
| 110 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 111 |
+
|
| 112 |
+
### Testing Data, Factors & Metrics
|
| 113 |
+
|
| 114 |
+
#### Testing Data
|
| 115 |
+
|
| 116 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 117 |
+
|
| 118 |
+
[More Information Needed]
|
| 119 |
+
|
| 120 |
+
#### Factors
|
| 121 |
+
|
| 122 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 123 |
+
|
| 124 |
+
[More Information Needed]
|
| 125 |
+
|
| 126 |
+
#### Metrics
|
| 127 |
+
|
| 128 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 129 |
+
|
| 130 |
+
[More Information Needed]
|
| 131 |
+
|
| 132 |
+
### Results
|
| 133 |
+
|
| 134 |
+
[More Information Needed]
|
| 135 |
+
|
| 136 |
+
#### Summary
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
## Model Examination [optional]
|
| 141 |
+
|
| 142 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 143 |
+
|
| 144 |
+
[More Information Needed]
|
| 145 |
+
|
| 146 |
+
## Environmental Impact
|
| 147 |
+
|
| 148 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 149 |
+
|
| 150 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 151 |
+
|
| 152 |
+
- **Hardware Type:** [More Information Needed]
|
| 153 |
+
- **Hours used:** [More Information Needed]
|
| 154 |
+
- **Cloud Provider:** [More Information Needed]
|
| 155 |
+
- **Compute Region:** [More Information Needed]
|
| 156 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 157 |
+
|
| 158 |
+
## Technical Specifications [optional]
|
| 159 |
+
|
| 160 |
+
### Model Architecture and Objective
|
| 161 |
+
|
| 162 |
+
[More Information Needed]
|
| 163 |
+
|
| 164 |
+
### Compute Infrastructure
|
| 165 |
+
|
| 166 |
+
[More Information Needed]
|
| 167 |
+
|
| 168 |
+
#### Hardware
|
| 169 |
+
|
| 170 |
+
[More Information Needed]
|
| 171 |
+
|
| 172 |
+
#### Software
|
| 173 |
+
|
| 174 |
+
[More Information Needed]
|
| 175 |
+
|
| 176 |
+
## Citation [optional]
|
| 177 |
+
|
| 178 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 179 |
+
|
| 180 |
+
**BibTeX:**
|
| 181 |
+
|
| 182 |
+
[More Information Needed]
|
| 183 |
+
|
| 184 |
+
**APA:**
|
| 185 |
+
|
| 186 |
+
[More Information Needed]
|
| 187 |
+
|
| 188 |
+
## Glossary [optional]
|
| 189 |
+
|
| 190 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 191 |
+
|
| 192 |
+
[More Information Needed]
|
| 193 |
+
|
| 194 |
+
## More Information [optional]
|
| 195 |
+
|
| 196 |
+
[More Information Needed]
|
| 197 |
+
|
| 198 |
+
## Model Card Authors [optional]
|
| 199 |
+
|
| 200 |
+
[More Information Needed]
|
| 201 |
+
|
| 202 |
+
## Model Card Contact
|
| 203 |
+
|
| 204 |
+
[More Information Needed]
|
| 205 |
+
### Framework versions
|
| 206 |
+
|
| 207 |
+
- PEFT 0.18.1
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/adapter_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 128,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.05,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": null,
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 16,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": [
|
| 32 |
+
"q_proj",
|
| 33 |
+
"down_proj",
|
| 34 |
+
"up_proj",
|
| 35 |
+
"v_proj",
|
| 36 |
+
"gate_proj",
|
| 37 |
+
"o_proj",
|
| 38 |
+
"k_proj"
|
| 39 |
+
],
|
| 40 |
+
"target_parameters": null,
|
| 41 |
+
"task_type": "CAUSAL_LM",
|
| 42 |
+
"trainable_token_indices": null,
|
| 43 |
+
"use_dora": false,
|
| 44 |
+
"use_qalora": false,
|
| 45 |
+
"use_rslora": false
|
| 46 |
+
}
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:25ffab0951358a2a0ade91fcbb1c4d8f212b82bb0ba74234477d26554ea34c3e
|
| 3 |
+
size 504133205
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/added_tokens.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</tool_call>": 151658,
|
| 3 |
+
"<tool_call>": 151657,
|
| 4 |
+
"<|box_end|>": 151649,
|
| 5 |
+
"<|box_start|>": 151648,
|
| 6 |
+
"<|endoftext|>": 151643,
|
| 7 |
+
"<|file_sep|>": 151664,
|
| 8 |
+
"<|fim_middle|>": 151660,
|
| 9 |
+
"<|fim_pad|>": 151662,
|
| 10 |
+
"<|fim_prefix|>": 151659,
|
| 11 |
+
"<|fim_suffix|>": 151661,
|
| 12 |
+
"<|im_end|>": 151645,
|
| 13 |
+
"<|im_start|>": 151644,
|
| 14 |
+
"<|image_pad|>": 151655,
|
| 15 |
+
"<|object_ref_end|>": 151647,
|
| 16 |
+
"<|object_ref_start|>": 151646,
|
| 17 |
+
"<|quad_end|>": 151651,
|
| 18 |
+
"<|quad_start|>": 151650,
|
| 19 |
+
"<|repo_name|>": 151663,
|
| 20 |
+
"<|video_pad|>": 151656,
|
| 21 |
+
"<|vision_end|>": 151653,
|
| 22 |
+
"<|vision_pad|>": 151654,
|
| 23 |
+
"<|vision_start|>": 151652
|
| 24 |
+
}
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/chat_template.jinja
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0]['role'] == 'system' %}
|
| 4 |
+
{{- messages[0]['content'] }}
|
| 5 |
+
{%- else %}
|
| 6 |
+
{{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
|
| 7 |
+
{%- endif %}
|
| 8 |
+
{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 9 |
+
{%- for tool in tools %}
|
| 10 |
+
{{- "\n" }}
|
| 11 |
+
{{- tool | tojson }}
|
| 12 |
+
{%- endfor %}
|
| 13 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 14 |
+
{%- else %}
|
| 15 |
+
{%- if messages[0]['role'] == 'system' %}
|
| 16 |
+
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
|
| 17 |
+
{%- else %}
|
| 18 |
+
{{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
|
| 19 |
+
{%- endif %}
|
| 20 |
+
{%- endif %}
|
| 21 |
+
{%- for message in messages %}
|
| 22 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
|
| 23 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 24 |
+
{%- elif message.role == "assistant" %}
|
| 25 |
+
{{- '<|im_start|>' + message.role }}
|
| 26 |
+
{%- if message.content %}
|
| 27 |
+
{{- '\n' + message.content }}
|
| 28 |
+
{%- endif %}
|
| 29 |
+
{%- for tool_call in message.tool_calls %}
|
| 30 |
+
{%- if tool_call.function is defined %}
|
| 31 |
+
{%- set tool_call = tool_call.function %}
|
| 32 |
+
{%- endif %}
|
| 33 |
+
{{- '\n<tool_call>\n{"name": "' }}
|
| 34 |
+
{{- tool_call.name }}
|
| 35 |
+
{{- '", "arguments": ' }}
|
| 36 |
+
{{- tool_call.arguments | tojson }}
|
| 37 |
+
{{- '}\n</tool_call>' }}
|
| 38 |
+
{%- endfor %}
|
| 39 |
+
{{- '<|im_end|>\n' }}
|
| 40 |
+
{%- elif message.role == "tool" %}
|
| 41 |
+
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
|
| 42 |
+
{{- '<|im_start|>user' }}
|
| 43 |
+
{%- endif %}
|
| 44 |
+
{{- '\n<tool_response>\n' }}
|
| 45 |
+
{{- message.content }}
|
| 46 |
+
{{- '\n</tool_response>' }}
|
| 47 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 48 |
+
{{- '<|im_end|>\n' }}
|
| 49 |
+
{%- endif %}
|
| 50 |
+
{%- endif %}
|
| 51 |
+
{%- endfor %}
|
| 52 |
+
{%- if add_generation_prompt %}
|
| 53 |
+
{{- '<|im_start|>assistant\n' }}
|
| 54 |
+
{%- endif %}
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/special_tokens_map.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>"
|
| 16 |
+
],
|
| 17 |
+
"eos_token": {
|
| 18 |
+
"content": "<|im_end|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
},
|
| 24 |
+
"pad_token": "<|im_end|>"
|
| 25 |
+
}
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
|
| 3 |
+
size 11421896
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/tokenizer_config.json
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
}
|
| 181 |
+
},
|
| 182 |
+
"additional_special_tokens": [
|
| 183 |
+
"<|im_start|>",
|
| 184 |
+
"<|im_end|>",
|
| 185 |
+
"<|object_ref_start|>",
|
| 186 |
+
"<|object_ref_end|>",
|
| 187 |
+
"<|box_start|>",
|
| 188 |
+
"<|box_end|>",
|
| 189 |
+
"<|quad_start|>",
|
| 190 |
+
"<|quad_end|>",
|
| 191 |
+
"<|vision_start|>",
|
| 192 |
+
"<|vision_end|>",
|
| 193 |
+
"<|vision_pad|>",
|
| 194 |
+
"<|image_pad|>",
|
| 195 |
+
"<|video_pad|>"
|
| 196 |
+
],
|
| 197 |
+
"bos_token": null,
|
| 198 |
+
"clean_up_tokenization_spaces": false,
|
| 199 |
+
"eos_token": "<|im_end|>",
|
| 200 |
+
"errors": "replace",
|
| 201 |
+
"extra_special_tokens": {},
|
| 202 |
+
"model_max_length": 131072,
|
| 203 |
+
"pad_token": "<|im_end|>",
|
| 204 |
+
"split_special_tokens": false,
|
| 205 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 206 |
+
"unk_token": null
|
| 207 |
+
}
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/README.md
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: Qwen/Qwen2.5-1.5B-Instruct
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:Qwen/Qwen2.5-1.5B-Instruct
|
| 7 |
+
- lora
|
| 8 |
+
- transformers
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Model Card for Model ID
|
| 12 |
+
|
| 13 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
## Model Details
|
| 18 |
+
|
| 19 |
+
### Model Description
|
| 20 |
+
|
| 21 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
- **Developed by:** [More Information Needed]
|
| 26 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 27 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 28 |
+
- **Model type:** [More Information Needed]
|
| 29 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 30 |
+
- **License:** [More Information Needed]
|
| 31 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 32 |
+
|
| 33 |
+
### Model Sources [optional]
|
| 34 |
+
|
| 35 |
+
<!-- Provide the basic links for the model. -->
|
| 36 |
+
|
| 37 |
+
- **Repository:** [More Information Needed]
|
| 38 |
+
- **Paper [optional]:** [More Information Needed]
|
| 39 |
+
- **Demo [optional]:** [More Information Needed]
|
| 40 |
+
|
| 41 |
+
## Uses
|
| 42 |
+
|
| 43 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 44 |
+
|
| 45 |
+
### Direct Use
|
| 46 |
+
|
| 47 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 48 |
+
|
| 49 |
+
[More Information Needed]
|
| 50 |
+
|
| 51 |
+
### Downstream Use [optional]
|
| 52 |
+
|
| 53 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 54 |
+
|
| 55 |
+
[More Information Needed]
|
| 56 |
+
|
| 57 |
+
### Out-of-Scope Use
|
| 58 |
+
|
| 59 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 60 |
+
|
| 61 |
+
[More Information Needed]
|
| 62 |
+
|
| 63 |
+
## Bias, Risks, and Limitations
|
| 64 |
+
|
| 65 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 66 |
+
|
| 67 |
+
[More Information Needed]
|
| 68 |
+
|
| 69 |
+
### Recommendations
|
| 70 |
+
|
| 71 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 72 |
+
|
| 73 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 74 |
+
|
| 75 |
+
## How to Get Started with the Model
|
| 76 |
+
|
| 77 |
+
Use the code below to get started with the model.
|
| 78 |
+
|
| 79 |
+
[More Information Needed]
|
| 80 |
+
|
| 81 |
+
## Training Details
|
| 82 |
+
|
| 83 |
+
### Training Data
|
| 84 |
+
|
| 85 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 86 |
+
|
| 87 |
+
[More Information Needed]
|
| 88 |
+
|
| 89 |
+
### Training Procedure
|
| 90 |
+
|
| 91 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 92 |
+
|
| 93 |
+
#### Preprocessing [optional]
|
| 94 |
+
|
| 95 |
+
[More Information Needed]
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
#### Training Hyperparameters
|
| 99 |
+
|
| 100 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 101 |
+
|
| 102 |
+
#### Speeds, Sizes, Times [optional]
|
| 103 |
+
|
| 104 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 105 |
+
|
| 106 |
+
[More Information Needed]
|
| 107 |
+
|
| 108 |
+
## Evaluation
|
| 109 |
+
|
| 110 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 111 |
+
|
| 112 |
+
### Testing Data, Factors & Metrics
|
| 113 |
+
|
| 114 |
+
#### Testing Data
|
| 115 |
+
|
| 116 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 117 |
+
|
| 118 |
+
[More Information Needed]
|
| 119 |
+
|
| 120 |
+
#### Factors
|
| 121 |
+
|
| 122 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 123 |
+
|
| 124 |
+
[More Information Needed]
|
| 125 |
+
|
| 126 |
+
#### Metrics
|
| 127 |
+
|
| 128 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 129 |
+
|
| 130 |
+
[More Information Needed]
|
| 131 |
+
|
| 132 |
+
### Results
|
| 133 |
+
|
| 134 |
+
[More Information Needed]
|
| 135 |
+
|
| 136 |
+
#### Summary
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
## Model Examination [optional]
|
| 141 |
+
|
| 142 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 143 |
+
|
| 144 |
+
[More Information Needed]
|
| 145 |
+
|
| 146 |
+
## Environmental Impact
|
| 147 |
+
|
| 148 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 149 |
+
|
| 150 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 151 |
+
|
| 152 |
+
- **Hardware Type:** [More Information Needed]
|
| 153 |
+
- **Hours used:** [More Information Needed]
|
| 154 |
+
- **Cloud Provider:** [More Information Needed]
|
| 155 |
+
- **Compute Region:** [More Information Needed]
|
| 156 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 157 |
+
|
| 158 |
+
## Technical Specifications [optional]
|
| 159 |
+
|
| 160 |
+
### Model Architecture and Objective
|
| 161 |
+
|
| 162 |
+
[More Information Needed]
|
| 163 |
+
|
| 164 |
+
### Compute Infrastructure
|
| 165 |
+
|
| 166 |
+
[More Information Needed]
|
| 167 |
+
|
| 168 |
+
#### Hardware
|
| 169 |
+
|
| 170 |
+
[More Information Needed]
|
| 171 |
+
|
| 172 |
+
#### Software
|
| 173 |
+
|
| 174 |
+
[More Information Needed]
|
| 175 |
+
|
| 176 |
+
## Citation [optional]
|
| 177 |
+
|
| 178 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 179 |
+
|
| 180 |
+
**BibTeX:**
|
| 181 |
+
|
| 182 |
+
[More Information Needed]
|
| 183 |
+
|
| 184 |
+
**APA:**
|
| 185 |
+
|
| 186 |
+
[More Information Needed]
|
| 187 |
+
|
| 188 |
+
## Glossary [optional]
|
| 189 |
+
|
| 190 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 191 |
+
|
| 192 |
+
[More Information Needed]
|
| 193 |
+
|
| 194 |
+
## More Information [optional]
|
| 195 |
+
|
| 196 |
+
[More Information Needed]
|
| 197 |
+
|
| 198 |
+
## Model Card Authors [optional]
|
| 199 |
+
|
| 200 |
+
[More Information Needed]
|
| 201 |
+
|
| 202 |
+
## Model Card Contact
|
| 203 |
+
|
| 204 |
+
[More Information Needed]
|
| 205 |
+
### Framework versions
|
| 206 |
+
|
| 207 |
+
- PEFT 0.18.1
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/adapter_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 128,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.05,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": null,
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 16,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": [
|
| 32 |
+
"q_proj",
|
| 33 |
+
"down_proj",
|
| 34 |
+
"up_proj",
|
| 35 |
+
"v_proj",
|
| 36 |
+
"gate_proj",
|
| 37 |
+
"o_proj",
|
| 38 |
+
"k_proj"
|
| 39 |
+
],
|
| 40 |
+
"target_parameters": null,
|
| 41 |
+
"task_type": "CAUSAL_LM",
|
| 42 |
+
"trainable_token_indices": null,
|
| 43 |
+
"use_dora": false,
|
| 44 |
+
"use_qalora": false,
|
| 45 |
+
"use_rslora": false
|
| 46 |
+
}
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f403f53d9b3384b42396cdef5cb104c4bcabf4efed72c8a737769d426f7f17ac
|
| 3 |
+
size 504133205
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/added_tokens.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"</tool_call>": 151658,
|
| 3 |
+
"<tool_call>": 151657,
|
| 4 |
+
"<|box_end|>": 151649,
|
| 5 |
+
"<|box_start|>": 151648,
|
| 6 |
+
"<|endoftext|>": 151643,
|
| 7 |
+
"<|file_sep|>": 151664,
|
| 8 |
+
"<|fim_middle|>": 151660,
|
| 9 |
+
"<|fim_pad|>": 151662,
|
| 10 |
+
"<|fim_prefix|>": 151659,
|
| 11 |
+
"<|fim_suffix|>": 151661,
|
| 12 |
+
"<|im_end|>": 151645,
|
| 13 |
+
"<|im_start|>": 151644,
|
| 14 |
+
"<|image_pad|>": 151655,
|
| 15 |
+
"<|object_ref_end|>": 151647,
|
| 16 |
+
"<|object_ref_start|>": 151646,
|
| 17 |
+
"<|quad_end|>": 151651,
|
| 18 |
+
"<|quad_start|>": 151650,
|
| 19 |
+
"<|repo_name|>": 151663,
|
| 20 |
+
"<|video_pad|>": 151656,
|
| 21 |
+
"<|vision_end|>": 151653,
|
| 22 |
+
"<|vision_pad|>": 151654,
|
| 23 |
+
"<|vision_start|>": 151652
|
| 24 |
+
}
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/chat_template.jinja
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{%- if tools %}
|
| 2 |
+
{{- '<|im_start|>system\n' }}
|
| 3 |
+
{%- if messages[0]['role'] == 'system' %}
|
| 4 |
+
{{- messages[0]['content'] }}
|
| 5 |
+
{%- else %}
|
| 6 |
+
{{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
|
| 7 |
+
{%- endif %}
|
| 8 |
+
{{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
|
| 9 |
+
{%- for tool in tools %}
|
| 10 |
+
{{- "\n" }}
|
| 11 |
+
{{- tool | tojson }}
|
| 12 |
+
{%- endfor %}
|
| 13 |
+
{{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
|
| 14 |
+
{%- else %}
|
| 15 |
+
{%- if messages[0]['role'] == 'system' %}
|
| 16 |
+
{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
|
| 17 |
+
{%- else %}
|
| 18 |
+
{{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
|
| 19 |
+
{%- endif %}
|
| 20 |
+
{%- endif %}
|
| 21 |
+
{%- for message in messages %}
|
| 22 |
+
{%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
|
| 23 |
+
{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
|
| 24 |
+
{%- elif message.role == "assistant" %}
|
| 25 |
+
{{- '<|im_start|>' + message.role }}
|
| 26 |
+
{%- if message.content %}
|
| 27 |
+
{{- '\n' + message.content }}
|
| 28 |
+
{%- endif %}
|
| 29 |
+
{%- for tool_call in message.tool_calls %}
|
| 30 |
+
{%- if tool_call.function is defined %}
|
| 31 |
+
{%- set tool_call = tool_call.function %}
|
| 32 |
+
{%- endif %}
|
| 33 |
+
{{- '\n<tool_call>\n{"name": "' }}
|
| 34 |
+
{{- tool_call.name }}
|
| 35 |
+
{{- '", "arguments": ' }}
|
| 36 |
+
{{- tool_call.arguments | tojson }}
|
| 37 |
+
{{- '}\n</tool_call>' }}
|
| 38 |
+
{%- endfor %}
|
| 39 |
+
{{- '<|im_end|>\n' }}
|
| 40 |
+
{%- elif message.role == "tool" %}
|
| 41 |
+
{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
|
| 42 |
+
{{- '<|im_start|>user' }}
|
| 43 |
+
{%- endif %}
|
| 44 |
+
{{- '\n<tool_response>\n' }}
|
| 45 |
+
{{- message.content }}
|
| 46 |
+
{{- '\n</tool_response>' }}
|
| 47 |
+
{%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
|
| 48 |
+
{{- '<|im_end|>\n' }}
|
| 49 |
+
{%- endif %}
|
| 50 |
+
{%- endif %}
|
| 51 |
+
{%- endfor %}
|
| 52 |
+
{%- if add_generation_prompt %}
|
| 53 |
+
{{- '<|im_start|>assistant\n' }}
|
| 54 |
+
{%- endif %}
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/special_tokens_map.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"additional_special_tokens": [
|
| 3 |
+
"<|im_start|>",
|
| 4 |
+
"<|im_end|>",
|
| 5 |
+
"<|object_ref_start|>",
|
| 6 |
+
"<|object_ref_end|>",
|
| 7 |
+
"<|box_start|>",
|
| 8 |
+
"<|box_end|>",
|
| 9 |
+
"<|quad_start|>",
|
| 10 |
+
"<|quad_end|>",
|
| 11 |
+
"<|vision_start|>",
|
| 12 |
+
"<|vision_end|>",
|
| 13 |
+
"<|vision_pad|>",
|
| 14 |
+
"<|image_pad|>",
|
| 15 |
+
"<|video_pad|>"
|
| 16 |
+
],
|
| 17 |
+
"eos_token": {
|
| 18 |
+
"content": "<|im_end|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": false,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
},
|
| 24 |
+
"pad_token": "<|im_end|>"
|
| 25 |
+
}
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
|
| 3 |
+
size 11421896
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/tokenizer_config.json
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
}
|
| 181 |
+
},
|
| 182 |
+
"additional_special_tokens": [
|
| 183 |
+
"<|im_start|>",
|
| 184 |
+
"<|im_end|>",
|
| 185 |
+
"<|object_ref_start|>",
|
| 186 |
+
"<|object_ref_end|>",
|
| 187 |
+
"<|box_start|>",
|
| 188 |
+
"<|box_end|>",
|
| 189 |
+
"<|quad_start|>",
|
| 190 |
+
"<|quad_end|>",
|
| 191 |
+
"<|vision_start|>",
|
| 192 |
+
"<|vision_end|>",
|
| 193 |
+
"<|vision_pad|>",
|
| 194 |
+
"<|image_pad|>",
|
| 195 |
+
"<|video_pad|>"
|
| 196 |
+
],
|
| 197 |
+
"bos_token": null,
|
| 198 |
+
"clean_up_tokenization_spaces": false,
|
| 199 |
+
"eos_token": "<|im_end|>",
|
| 200 |
+
"errors": "replace",
|
| 201 |
+
"extra_special_tokens": {},
|
| 202 |
+
"model_max_length": 131072,
|
| 203 |
+
"pad_token": "<|im_end|>",
|
| 204 |
+
"split_special_tokens": false,
|
| 205 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 206 |
+
"unk_token": null
|
| 207 |
+
}
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/README.md
ADDED
|
@@ -0,0 +1,207 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
base_model: Qwen/Qwen2.5-1.5B-Instruct
|
| 3 |
+
library_name: peft
|
| 4 |
+
pipeline_tag: text-generation
|
| 5 |
+
tags:
|
| 6 |
+
- base_model:adapter:Qwen/Qwen2.5-1.5B-Instruct
|
| 7 |
+
- lora
|
| 8 |
+
- transformers
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
# Model Card for Model ID
|
| 12 |
+
|
| 13 |
+
<!-- Provide a quick summary of what the model is/does. -->
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
## Model Details
|
| 18 |
+
|
| 19 |
+
### Model Description
|
| 20 |
+
|
| 21 |
+
<!-- Provide a longer summary of what this model is. -->
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
- **Developed by:** [More Information Needed]
|
| 26 |
+
- **Funded by [optional]:** [More Information Needed]
|
| 27 |
+
- **Shared by [optional]:** [More Information Needed]
|
| 28 |
+
- **Model type:** [More Information Needed]
|
| 29 |
+
- **Language(s) (NLP):** [More Information Needed]
|
| 30 |
+
- **License:** [More Information Needed]
|
| 31 |
+
- **Finetuned from model [optional]:** [More Information Needed]
|
| 32 |
+
|
| 33 |
+
### Model Sources [optional]
|
| 34 |
+
|
| 35 |
+
<!-- Provide the basic links for the model. -->
|
| 36 |
+
|
| 37 |
+
- **Repository:** [More Information Needed]
|
| 38 |
+
- **Paper [optional]:** [More Information Needed]
|
| 39 |
+
- **Demo [optional]:** [More Information Needed]
|
| 40 |
+
|
| 41 |
+
## Uses
|
| 42 |
+
|
| 43 |
+
<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
|
| 44 |
+
|
| 45 |
+
### Direct Use
|
| 46 |
+
|
| 47 |
+
<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
|
| 48 |
+
|
| 49 |
+
[More Information Needed]
|
| 50 |
+
|
| 51 |
+
### Downstream Use [optional]
|
| 52 |
+
|
| 53 |
+
<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
|
| 54 |
+
|
| 55 |
+
[More Information Needed]
|
| 56 |
+
|
| 57 |
+
### Out-of-Scope Use
|
| 58 |
+
|
| 59 |
+
<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
|
| 60 |
+
|
| 61 |
+
[More Information Needed]
|
| 62 |
+
|
| 63 |
+
## Bias, Risks, and Limitations
|
| 64 |
+
|
| 65 |
+
<!-- This section is meant to convey both technical and sociotechnical limitations. -->
|
| 66 |
+
|
| 67 |
+
[More Information Needed]
|
| 68 |
+
|
| 69 |
+
### Recommendations
|
| 70 |
+
|
| 71 |
+
<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
|
| 72 |
+
|
| 73 |
+
Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
|
| 74 |
+
|
| 75 |
+
## How to Get Started with the Model
|
| 76 |
+
|
| 77 |
+
Use the code below to get started with the model.
|
| 78 |
+
|
| 79 |
+
[More Information Needed]
|
| 80 |
+
|
| 81 |
+
## Training Details
|
| 82 |
+
|
| 83 |
+
### Training Data
|
| 84 |
+
|
| 85 |
+
<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
|
| 86 |
+
|
| 87 |
+
[More Information Needed]
|
| 88 |
+
|
| 89 |
+
### Training Procedure
|
| 90 |
+
|
| 91 |
+
<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
|
| 92 |
+
|
| 93 |
+
#### Preprocessing [optional]
|
| 94 |
+
|
| 95 |
+
[More Information Needed]
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
#### Training Hyperparameters
|
| 99 |
+
|
| 100 |
+
- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
|
| 101 |
+
|
| 102 |
+
#### Speeds, Sizes, Times [optional]
|
| 103 |
+
|
| 104 |
+
<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
|
| 105 |
+
|
| 106 |
+
[More Information Needed]
|
| 107 |
+
|
| 108 |
+
## Evaluation
|
| 109 |
+
|
| 110 |
+
<!-- This section describes the evaluation protocols and provides the results. -->
|
| 111 |
+
|
| 112 |
+
### Testing Data, Factors & Metrics
|
| 113 |
+
|
| 114 |
+
#### Testing Data
|
| 115 |
+
|
| 116 |
+
<!-- This should link to a Dataset Card if possible. -->
|
| 117 |
+
|
| 118 |
+
[More Information Needed]
|
| 119 |
+
|
| 120 |
+
#### Factors
|
| 121 |
+
|
| 122 |
+
<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
|
| 123 |
+
|
| 124 |
+
[More Information Needed]
|
| 125 |
+
|
| 126 |
+
#### Metrics
|
| 127 |
+
|
| 128 |
+
<!-- These are the evaluation metrics being used, ideally with a description of why. -->
|
| 129 |
+
|
| 130 |
+
[More Information Needed]
|
| 131 |
+
|
| 132 |
+
### Results
|
| 133 |
+
|
| 134 |
+
[More Information Needed]
|
| 135 |
+
|
| 136 |
+
#### Summary
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
|
| 140 |
+
## Model Examination [optional]
|
| 141 |
+
|
| 142 |
+
<!-- Relevant interpretability work for the model goes here -->
|
| 143 |
+
|
| 144 |
+
[More Information Needed]
|
| 145 |
+
|
| 146 |
+
## Environmental Impact
|
| 147 |
+
|
| 148 |
+
<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
|
| 149 |
+
|
| 150 |
+
Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
|
| 151 |
+
|
| 152 |
+
- **Hardware Type:** [More Information Needed]
|
| 153 |
+
- **Hours used:** [More Information Needed]
|
| 154 |
+
- **Cloud Provider:** [More Information Needed]
|
| 155 |
+
- **Compute Region:** [More Information Needed]
|
| 156 |
+
- **Carbon Emitted:** [More Information Needed]
|
| 157 |
+
|
| 158 |
+
## Technical Specifications [optional]
|
| 159 |
+
|
| 160 |
+
### Model Architecture and Objective
|
| 161 |
+
|
| 162 |
+
[More Information Needed]
|
| 163 |
+
|
| 164 |
+
### Compute Infrastructure
|
| 165 |
+
|
| 166 |
+
[More Information Needed]
|
| 167 |
+
|
| 168 |
+
#### Hardware
|
| 169 |
+
|
| 170 |
+
[More Information Needed]
|
| 171 |
+
|
| 172 |
+
#### Software
|
| 173 |
+
|
| 174 |
+
[More Information Needed]
|
| 175 |
+
|
| 176 |
+
## Citation [optional]
|
| 177 |
+
|
| 178 |
+
<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
|
| 179 |
+
|
| 180 |
+
**BibTeX:**
|
| 181 |
+
|
| 182 |
+
[More Information Needed]
|
| 183 |
+
|
| 184 |
+
**APA:**
|
| 185 |
+
|
| 186 |
+
[More Information Needed]
|
| 187 |
+
|
| 188 |
+
## Glossary [optional]
|
| 189 |
+
|
| 190 |
+
<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
|
| 191 |
+
|
| 192 |
+
[More Information Needed]
|
| 193 |
+
|
| 194 |
+
## More Information [optional]
|
| 195 |
+
|
| 196 |
+
[More Information Needed]
|
| 197 |
+
|
| 198 |
+
## Model Card Authors [optional]
|
| 199 |
+
|
| 200 |
+
[More Information Needed]
|
| 201 |
+
|
| 202 |
+
## Model Card Contact
|
| 203 |
+
|
| 204 |
+
[More Information Needed]
|
| 205 |
+
### Framework versions
|
| 206 |
+
|
| 207 |
+
- PEFT 0.18.1
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/adapter_config.json
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"alora_invocation_tokens": null,
|
| 3 |
+
"alpha_pattern": {},
|
| 4 |
+
"arrow_config": null,
|
| 5 |
+
"auto_mapping": null,
|
| 6 |
+
"base_model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
|
| 7 |
+
"bias": "none",
|
| 8 |
+
"corda_config": null,
|
| 9 |
+
"ensure_weight_tying": false,
|
| 10 |
+
"eva_config": null,
|
| 11 |
+
"exclude_modules": null,
|
| 12 |
+
"fan_in_fan_out": false,
|
| 13 |
+
"inference_mode": true,
|
| 14 |
+
"init_lora_weights": true,
|
| 15 |
+
"layer_replication": null,
|
| 16 |
+
"layers_pattern": null,
|
| 17 |
+
"layers_to_transform": null,
|
| 18 |
+
"loftq_config": {},
|
| 19 |
+
"lora_alpha": 128,
|
| 20 |
+
"lora_bias": false,
|
| 21 |
+
"lora_dropout": 0.05,
|
| 22 |
+
"megatron_config": null,
|
| 23 |
+
"megatron_core": "megatron.core",
|
| 24 |
+
"modules_to_save": null,
|
| 25 |
+
"peft_type": "LORA",
|
| 26 |
+
"peft_version": "0.18.1",
|
| 27 |
+
"qalora_group_size": 16,
|
| 28 |
+
"r": 16,
|
| 29 |
+
"rank_pattern": {},
|
| 30 |
+
"revision": null,
|
| 31 |
+
"target_modules": [
|
| 32 |
+
"q_proj",
|
| 33 |
+
"down_proj",
|
| 34 |
+
"up_proj",
|
| 35 |
+
"v_proj",
|
| 36 |
+
"gate_proj",
|
| 37 |
+
"o_proj",
|
| 38 |
+
"k_proj"
|
| 39 |
+
],
|
| 40 |
+
"target_parameters": null,
|
| 41 |
+
"task_type": "CAUSAL_LM",
|
| 42 |
+
"trainable_token_indices": null,
|
| 43 |
+
"use_dora": false,
|
| 44 |
+
"use_qalora": false,
|
| 45 |
+
"use_rslora": false
|
| 46 |
+
}
|
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/adapter_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cfd4d760f29b46310a656dda6324b650093e68aa899d807abd642782734615a9
|
| 3 |
+
size 504133205
|