VoCuc committed on 15 days ago

Commit

3abaef4

verified ·

1 Parent(s): 15febb1

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +24 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/README.md +207 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/adapter_config.json +46 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/adapter_model.bin +3 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/added_tokens.json +24 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/chat_template.jinja +54 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/merges.txt +0 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/special_tokens_map.json +25 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer.json +3 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer_config.json +207 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/vocab.json +0 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/added_tokens.json +24 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/args.json +1 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/chat_template.jinja +54 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/config.json +58 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/eval/0/answers.jsonl +0 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/generation_config.json +14 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/log.txt +44 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/merges.txt +0 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/model.safetensors +3 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/special_tokens_map.json +31 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/tokenizer.json +3 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/tokenizer_config.json +207 -0
qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/vocab.json +0 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/args.json +1 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/eval/0/answers.jsonl +0 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/log.txt +234 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/README.md +207 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/adapter_config.json +46 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/adapter_model.bin +3 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/added_tokens.json +24 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/chat_template.jinja +54 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/merges.txt +0 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/special_tokens_map.json +25 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/tokenizer.json +3 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/tokenizer_config.json +207 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/vocab.json +0 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/README.md +207 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/adapter_config.json +46 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/adapter_model.bin +3 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/added_tokens.json +24 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/chat_template.jinja +54 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/merges.txt +0 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/special_tokens_map.json +25 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/tokenizer.json +3 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/tokenizer_config.json +207 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/vocab.json +0 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/README.md +207 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/adapter_config.json +46 -0
qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/adapter_model.bin +3 -0

.gitattributes CHANGED Viewed

@@ -100,3 +100,27 @@ eval_results/vllm/qwen2.5-1.5B-it-nnm0.1_K128_L4_epoch1_lr1e-4_kdr1.0-1246/resul
 qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch1_lr1e-4_kdr1.0/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 layer_analysis/combined.png filter=lfs diff=lfs merge=lfs -text
 layer_analysis/curvature.png filter=lfs diff=lfs merge=lfs -text

 qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch1_lr1e-4_kdr1.0/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 layer_analysis/combined.png filter=lfs diff=lfs merge=lfs -text
 layer_analysis/curvature.png filter=lfs diff=lfs merge=lfs -text
+qwen2.5-1.5B-Instruct\#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen2.5-1.5B-Instruct\#amid/ab_pr_0.5_0.5_4_1e-4/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen2.5-1.5B-Instruct\#csd/ab_pr_0.5_0.5_8_1e-4/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen2.5-1.5B-Instruct\#csd/ab_pr_0.5_0.5_8_1e-4/4984/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen2.5-1.5B-Instruct\#csd/ab_pr_0.5_0.5_8_1e-4/7476/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch1_lr1e-4_kdr1.0/1246_full/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch2_lr1e-4_kdr1.0/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch2_lr1e-4_kdr1.0/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.9_K128_L4_epoch2_lr1e-4_kdr0.75/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm0.9_K128_L4_epoch2_lr1e-4_kdr0.75/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm1.0_K128_L4_epoch1_lr1e-4_kdr1.0/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm1.0_K128_L4_epoch2_lr1e-4_kdr0.75/1246/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm1.0_K128_L4_epoch2_lr1e-4_kdr0.75/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen2.5-1.5B-Instruct\#sfkl_nnm_lora/nnm1.0_K128_L4_epoch2_lr1e-4_kdr0.75/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen3-1.7B\#amid/ab_pr_0.5_0.5_4_1e-4/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen3-1.7B\#amid/ab_pr_0.5_0.5_4_1e-4/4984/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen3-1.7B\#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen3-1.7B\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch1_lr1e-4_kdr0.75/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen3-1.7B\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch1_lr1e-4_kdr1.0/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen3-1.7B\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch2_lr1e-4_kdr1.0/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen3-1.7B\#sfkl_nnm_lora/nnm0.1_K128_L4_epoch2_lr1e-4_kdr1.0/4984/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen3-1.7B\#sfkl_nnm_lora/nnm0.9_K128_L4_epoch2_lr1e-4_kdr0.75/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen3-1.7B\#sfkl_nnm_lora/nnm0.9_K128_L4_epoch2_lr1e-4_kdr0.75/4984/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+qwen3-1.7B\#sfkl_nnm_lora/nnm1.0_K128_L4_epoch1_lr1e-4_kdr1.0/2492/tokenizer.json filter=lfs diff=lfs merge=lfs -text

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-1.5B-Instruct
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "gate_proj",
+    "down_proj",
+    "up_proj",
+    "v_proj",
+    "k_proj",
+    "o_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:757c433b9241ddd9c09d5aeeb342f38b8298d8a5f6287556aa81da1ed75da682
+size 504133205

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|im_end|>"
+}

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|im_end|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/7476/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/args.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"model_path": "Qwen/Qwen2.5-1.5B-Instruct", "ckpt_name": "qwen2.5-1.5B-Instruct", "model_type": "gpt2", "teacher_model_type": null, "n_gpu": 2, "n_nodes": 1, "teacher_model_path": "Qwen/Qwen2.5-14B-Instruct", "teacher_ckpt_name": "qwen2.5-14B-Instruct", "teacher_model_fp16": true, "model_parallel": false, "model_parallel_size": null, "no_value": false, "dropout_path_rate": null, "fp32": false, "type": "adaptive-csd", "do_train": true, "do_valid": true, "do_eval": false, "base_path": ".", "load": null, "save": "./results/qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4", "log_interval": 10, "mid_log_num": -1, "save_interval": -1, "eval_interval": -1, "local_rank": 0, "save_additional_suffix": "", "save_rollout": false, "eb_sample_times": 3, "data_dir": "./processed_data/ultraInteract/Qwen/Qwen2.5-14B-Instruct/", "processed_data_dir": null, "force_process": false, "force_process_demo": false, "data_process_workers": -1, "train_num": -1, "train_ratio": 1, "dev_num": -1, "dev_ratio": 1, "gen_num": -1, "data_names": null, "prompt_type": null, "num_workers": 4, "max_prompt_length": 512, "min_prompt_length": 128, "json_data": false, "bin_data": false, "txt_data": false, "prompt_data_dir": null, "lm_data_dir": null, "eval_ppl": false, "eval_rw": false, "eval_gen": true, "only_prompt": false, "batch_size": 4, "eval_batch_size": 16, "clip_grad": 1.0, "total_iters": null, "train_iters_per_epoch": -1, "max_length": 1024, "seed": 10, "seed_order": 42, "seed_data": 42, "seed_ppo": 42, "seed_lm": 7, "epochs": 3, "training_epochs": 10000, "gradient_accumulation_steps": 2, "gradient_checkpointing": false, "attn_dtype": null, "lr": 0.0001, "lr_min": 1e-07, "weight_decay": 0.01, "loss_scale": 65536, "kd_ratio": 1.0, "warmup_iters": 0, "lr_decay_iters": null, "lr_decay_style": "cosine", "scheduler_name": "constant_trm", "reward_scaling": null, "cliprange_reward": 1, "ppo_epochs": null, "num_rollouts": 256, "num_rollouts_per_device": null, "cliprange": 0.2, "chunk_size": null, 
"gamma": 0.95, "length_norm": false, "single_step_reg": false, "teacher_mixed_alpha": null, "lm_coef": 1, "skew_alpha": 0.1, "student_gen": true, "gen_top_p": 1.0, "gen_num_beams": 1, "mixed_alpha": 0.5, "loss_eps": 0.1, "init_threshold": 0.0, "capacity": 1000, "replay_ratio": "decreasing", "delta_threshold": 0.1, "top_k": 0, "top_p": 1.0, "do_sample": true, "no_repeat_ngram_size": 6, "repetition_penalty": null, "num_beams": 1, "temperature": 1.0, "peft": "lora", "peft_lora_r": 16, "peft_lora_alpha": 128, "peft_lora_dropout": 0.05, "peft_name": null, "peft_path": null, "teacher_peft_name": null, "teacher_peft_path": null, "deepspeed": true, "deepspeed_config": "./configs/deepspeed/ds_config_zero0_bf16.json", "deepscale": false, "deepscale_config": null, "ab_alpha": 0.5, "ab_beta": 0.5, "amid_div_name": "ab", "amid_div_order": "pr", "amid_alpha": 0.5, "amid_lam": 0.5, "nnm": true, "nnm_ratio": 0.1, "nnm_n_layers": 4, "nnm_K": 128, "nnm_eta": 0.05, "nnm_T_dead": 50, "nnm_centroid_batches": 500, "nnm_d_prime": 256, "nnm_ns_iters": 5, "nnm_warmup_steps": 0, "nnm_ramp_steps": 0, "rank": 0, "world_size": 2}

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/config.json ADDED Viewed

	@@ -0,0 +1,58 @@

+{
+  "architectures": [
+    "Qwen2ForCausalLM"
+  ],
+  "attention_dropout": 0.0,
+  "bos_token_id": 151643,
+  "dtype": "float16",
+  "eos_token_id": 151645,
+  "hidden_act": "silu",
+  "hidden_size": 1536,
+  "initializer_range": 0.02,
+  "intermediate_size": 8960,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 32768,
+  "max_window_layers": 21,
+  "model_type": "qwen2",
+  "num_attention_heads": 12,
+  "num_hidden_layers": 28,
+  "num_key_value_heads": 2,
+  "rms_norm_eps": 1e-06,
+  "rope_scaling": null,
+  "rope_theta": 1000000.0,
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "4.57.3",
+  "use_cache": true,
+  "use_sliding_window": false,
+  "vocab_size": 152064
+}

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/eval/0/answers.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/generation_config.json ADDED Viewed

	@@ -0,0 +1,14 @@

+{
+  "bos_token_id": 151643,
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "repetition_penalty": 1.1,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "4.57.3"
+}

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/log.txt ADDED Viewed

	@@ -0,0 +1,44 @@

+============================== EXP at 2026-05-17 08:31:39 ==============================
+============================== EXP at 2026-05-17 08:32:21 ==============================
+============================== EXP at 2026-05-17 08:45:44 ==============================
+dev | avg_loss: 1.9095982142857142 | {'exact_match': 0.0, 'rougeL': 6.7578} | threshold: 0.0
+train | epoch   0 | Iter:     18/ 29904 | global iter:     10/ 14952 | loss: -0.3565 | ds_loss: -0.3565 | lr: 1.0000e-04 | scale:     1.0000 | micro time: 0.457 | step time: 0.831
+train | epoch   0 | Iter:     38/ 29904 | global iter:     20/ 14952 | loss: -0.1421 | ds_loss: -0.1421 | lr: 1.0000e-04 | scale:     1.0000 | micro time: 0.458 | step time: 0.890
+train | epoch   0 | Iter:     58/ 29904 | global iter:     30/ 14952 | loss: -0.1540 | ds_loss: -0.1540 | lr: 9.9999e-05 | scale:     1.0000 | micro time: 0.461 | step time: 0.890
+train | epoch   0 | Iter:     78/ 29904 | global iter:     40/ 14952 | loss: -0.0845 | ds_loss: -0.0845 | lr: 9.9998e-05 | scale:     1.0000 | micro time: 0.457 | step time: 0.887
+train | epoch   0 | Iter:     98/ 29904 | global iter:     50/ 14952 | loss: -0.0781 | ds_loss: -0.0781 | lr: 9.9997e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.888
+train | epoch   0 | Iter:    118/ 29904 | global iter:     60/ 14952 | loss: -0.0858 | ds_loss: -0.0858 | lr: 9.9996e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.888
+train | epoch   0 | Iter:    138/ 29904 | global iter:     70/ 14952 | loss: -0.0648 | ds_loss: -0.0648 | lr: 9.9995e-05 | scale:     1.0000 | micro time: 0.459 | step time: 0.889
+train | epoch   0 | Iter:    158/ 29904 | global iter:     80/ 14952 | loss: -0.0911 | ds_loss: -0.0911 | lr: 9.9993e-05 | scale:     1.0000 | micro time: 0.463 | step time: 0.891
+train | epoch   0 | Iter:    178/ 29904 | global iter:     90/ 14952 | loss: -0.0619 | ds_loss: -0.0619 | lr: 9.9991e-05 | scale:     1.0000 | micro time: 0.465 | step time: 0.893
+train | epoch   0 | Iter:    198/ 29904 | global iter:    100/ 14952 | loss: -0.0579 | ds_loss: -0.0579 | lr: 9.9989e-05 | scale:     1.0000 | micro time: 0.459 | step time: 0.889
+train | epoch   0 | Iter:    218/ 29904 | global iter:    110/ 14952 | loss: -0.0713 | ds_loss: -0.0713 | lr: 9.9987e-05 | scale:     1.0000 | micro time: 0.457 | step time: 0.891
+train | epoch   0 | Iter:    238/ 29904 | global iter:    120/ 14952 | loss: -0.0651 | ds_loss: -0.0651 | lr: 9.9984e-05 | scale:     1.0000 | micro time: 0.460 | step time: 0.890
+train | epoch   0 | Iter:    258/ 29904 | global iter:    130/ 14952 | loss: -0.0634 | ds_loss: -0.0634 | lr: 9.9982e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.887
+train | epoch   0 | Iter:    278/ 29904 | global iter:    140/ 14952 | loss: -0.0743 | ds_loss: -0.0743 | lr: 9.9979e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.886
+train | epoch   0 | Iter:    298/ 29904 | global iter:    150/ 14952 | loss: -0.0756 | ds_loss: -0.0756 | lr: 9.9976e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.885
+train | epoch   0 | Iter:    318/ 29904 | global iter:    160/ 14952 | loss: -0.0628 | ds_loss: -0.0628 | lr: 9.9972e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.885
+train | epoch   0 | Iter:    338/ 29904 | global iter:    170/ 14952 | loss: -0.0577 | ds_loss: -0.0577 | lr: 9.9969e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.885
+train | epoch   0 | Iter:    358/ 29904 | global iter:    180/ 14952 | loss: -0.0550 | ds_loss: -0.0550 | lr: 9.9965e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.887
+train | epoch   0 | Iter:    378/ 29904 | global iter:    190/ 14952 | loss: -0.0704 | ds_loss: -0.0704 | lr: 9.9961e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.886
+train | epoch   0 | Iter:    398/ 29904 | global iter:    200/ 14952 | loss: -0.0640 | ds_loss: -0.0640 | lr: 9.9956e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.886
+train | epoch   0 | Iter:    418/ 29904 | global iter:    210/ 14952 | loss: -0.0508 | ds_loss: -0.0508 | lr: 9.9952e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.893
+train | epoch   0 | Iter:    438/ 29904 | global iter:    220/ 14952 | loss: -0.0819 | ds_loss: -0.0819 | lr: 9.9947e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.886
+train | epoch   0 | Iter:    458/ 29904 | global iter:    230/ 14952 | loss: -0.0545 | ds_loss: -0.0545 | lr: 9.9942e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.884
+train | epoch   0 | Iter:    478/ 29904 | global iter:    240/ 14952 | loss: -0.0610 | ds_loss: -0.0610 | lr: 9.9937e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.884
+train | epoch   0 | Iter:    498/ 29904 | global iter:    250/ 14952 | loss: -0.0512 | ds_loss: -0.0512 | lr: 9.9932e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.884
+train | epoch   0 | Iter:    518/ 29904 | global iter:    260/ 14952 | loss: -0.0642 | ds_loss: -0.0642 | lr: 9.9926e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.882
+train | epoch   0 | Iter:    538/ 29904 | global iter:    270/ 14952 | loss: -0.0743 | ds_loss: -0.0743 | lr: 9.9920e-05 | scale:     1.0000 | micro time: 0.463 | step time: 0.888
+train | epoch   0 | Iter:    558/ 29904 | global iter:    280/ 14952 | loss: -0.0724 | ds_loss: -0.0724 | lr: 9.9914e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.887
+train | epoch   0 | Iter:    578/ 29904 | global iter:    290/ 14952 | loss: -0.0440 | ds_loss: -0.0440 | lr: 9.9908e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.889
+train | epoch   0 | Iter:    598/ 29904 | global iter:    300/ 14952 | loss: -0.0607 | ds_loss: -0.0607 | lr: 9.9901e-05 | scale:     1.0000 | micro time: 0.457 | step time: 0.889
+train | epoch   0 | Iter:    618/ 29904 | global iter:    310/ 14952 | loss: -0.0571 | ds_loss: -0.0571 | lr: 9.9895e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.888
+train | epoch   0 | Iter:    638/ 29904 | global iter:    320/ 14952 | loss: -0.0569 | ds_loss: -0.0569 | lr: 9.9888e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.887
+train | epoch   0 | Iter:    658/ 29904 | global iter:    330/ 14952 | loss: -0.0508 | ds_loss: -0.0508 | lr: 9.9881e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.887
+train | epoch   0 | Iter:    678/ 29904 | global iter:    340/ 14952 | loss: -0.0679 | ds_loss: -0.0679 | lr: 9.9873e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.886

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0f62e5312da0d56c793167b890fd7cdc2e9eb01cc7533967bfcc1023e72067c9
+size 3087860024

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,31 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": {
+    "content": "<|endoftext|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  }
+}

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

qwen2.5-1.5B-Instruct#amid/ab_pr_0.5_0.5_4_1e-4/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/args.json ADDED Viewed

	@@ -0,0 +1 @@

+ {"model_path": "Qwen/Qwen2.5-1.5B-Instruct", "ckpt_name": "qwen2.5-1.5B-Instruct", "model_type": "gpt2", "teacher_model_type": null, "n_gpu": 2, "n_nodes": 1, "teacher_model_path": "Qwen/Qwen2.5-14B-Instruct", "teacher_ckpt_name": "qwen2.5-14B-Instruct", "teacher_model_fp16": true, "model_parallel": false, "model_parallel_size": null, "no_value": false, "dropout_path_rate": null, "fp32": false, "type": "adaptive-csd", "do_train": true, "do_valid": true, "do_eval": false, "base_path": ".", "load": null, "save": "./results/qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4", "log_interval": 10, "mid_log_num": -1, "save_interval": -1, "eval_interval": -1, "local_rank": 0, "save_additional_suffix": "", "save_rollout": false, "eb_sample_times": 3, "data_dir": "./processed_data/ultraInteract/Qwen/Qwen2.5-14B-Instruct/", "processed_data_dir": null, "force_process": false, "force_process_demo": false, "data_process_workers": -1, "train_num": -1, "train_ratio": 1, "dev_num": -1, "dev_ratio": 1, "gen_num": -1, "data_names": null, "prompt_type": null, "num_workers": 4, "max_prompt_length": 512, "min_prompt_length": 128, "json_data": false, "bin_data": false, "txt_data": false, "prompt_data_dir": null, "lm_data_dir": null, "eval_ppl": false, "eval_rw": false, "eval_gen": true, "only_prompt": false, "batch_size": 4, "eval_batch_size": 16, "clip_grad": 1.0, "total_iters": null, "train_iters_per_epoch": -1, "max_length": 1024, "seed": 10, "seed_order": 42, "seed_data": 42, "seed_ppo": 42, "seed_lm": 7, "epochs": 3, "training_epochs": 10000, "gradient_accumulation_steps": 2, "gradient_checkpointing": false, "attn_dtype": null, "lr": 0.0001, "lr_min": 1e-07, "weight_decay": 0.01, "loss_scale": 65536, "kd_ratio": 1.0, "warmup_iters": 0, "lr_decay_iters": null, "lr_decay_style": "cosine", "scheduler_name": "constant_trm", "reward_scaling": null, "cliprange_reward": 1, "ppo_epochs": null, "num_rollouts": 256, "num_rollouts_per_device": null, "cliprange": 0.2, "chunk_size": null, 
"gamma": 0.95, "length_norm": false, "single_step_reg": false, "teacher_mixed_alpha": null, "lm_coef": 1, "skew_alpha": 0.1, "student_gen": true, "gen_top_p": 1.0, "gen_num_beams": 1, "mixed_alpha": 0.5, "loss_eps": 0.1, "init_threshold": 0.0, "capacity": 1000, "replay_ratio": "decreasing", "delta_threshold": 0.1, "top_k": 0, "top_p": 1.0, "do_sample": true, "no_repeat_ngram_size": 6, "repetition_penalty": null, "num_beams": 1, "temperature": 1.0, "peft": "lora", "peft_lora_r": 16, "peft_lora_alpha": 128, "peft_lora_dropout": 0.05, "peft_name": null, "peft_path": null, "teacher_peft_name": null, "teacher_peft_path": null, "deepspeed": true, "deepspeed_config": "./configs/deepspeed/ds_config_zero0_bf16.json", "deepscale": false, "deepscale_config": null, "ab_alpha": 0.5, "ab_beta": 0.5, "amid_div_name": "ab", "amid_div_order": "pr", "amid_alpha": 0.5, "amid_lam": 0.5, "nnm": true, "nnm_ratio": 0.1, "nnm_n_layers": 4, "nnm_K": 128, "nnm_eta": 0.05, "nnm_T_dead": 50, "nnm_centroid_batches": 500, "nnm_d_prime": 256, "nnm_ns_iters": 5, "nnm_warmup_steps": 0, "nnm_ramp_steps": 0, "rank": 0, "world_size": 2}

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/eval/0/answers.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_4_1e-4/log.txt ADDED Viewed

	@@ -0,0 +1,234 @@

+============================== EXP at 2026-05-17 08:55:25 ==============================
+dev | avg_loss: 1.9095982142857142 | {'exact_match': 0.0, 'rougeL': 6.7578} | threshold: 0.0
+train | epoch   0 | Iter:     18/ 29904 | global iter:     10/ 14952 | loss: -0.3565 | ds_loss: -0.3565 | lr: 1.0000e-04 | scale:     1.0000 | micro time: 0.451 | step time: 0.823
+train | epoch   0 | Iter:     38/ 29904 | global iter:     20/ 14952 | loss: -0.1421 | ds_loss: -0.1421 | lr: 1.0000e-04 | scale:     1.0000 | micro time: 0.447 | step time: 0.880
+train | epoch   0 | Iter:     58/ 29904 | global iter:     30/ 14952 | loss: -0.1540 | ds_loss: -0.1540 | lr: 9.9999e-05 | scale:     1.0000 | micro time: 0.451 | step time: 0.881
+train | epoch   0 | Iter:     78/ 29904 | global iter:     40/ 14952 | loss: -0.0845 | ds_loss: -0.0845 | lr: 9.9998e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.878
+train | epoch   0 | Iter:     98/ 29904 | global iter:     50/ 14952 | loss: -0.0781 | ds_loss: -0.0781 | lr: 9.9997e-05 | scale:     1.0000 | micro time: 0.446 | step time: 0.874
+train | epoch   0 | Iter:    118/ 29904 | global iter:     60/ 14952 | loss: -0.0858 | ds_loss: -0.0858 | lr: 9.9996e-05 | scale:     1.0000 | micro time: 0.447 | step time: 0.873
+train | epoch   0 | Iter:    138/ 29904 | global iter:     70/ 14952 | loss: -0.0648 | ds_loss: -0.0648 | lr: 9.9995e-05 | scale:     1.0000 | micro time: 0.448 | step time: 0.875
+train | epoch   0 | Iter:    158/ 29904 | global iter:     80/ 14952 | loss: -0.0911 | ds_loss: -0.0911 | lr: 9.9993e-05 | scale:     1.0000 | micro time: 0.450 | step time: 0.876
+train | epoch   0 | Iter:    178/ 29904 | global iter:     90/ 14952 | loss: -0.0619 | ds_loss: -0.0619 | lr: 9.9991e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.875
+train | epoch   0 | Iter:    198/ 29904 | global iter:    100/ 14952 | loss: -0.0579 | ds_loss: -0.0579 | lr: 9.9989e-05 | scale:     1.0000 | micro time: 0.448 | step time: 0.874
+train | epoch   0 | Iter:    218/ 29904 | global iter:    110/ 14952 | loss: -0.0713 | ds_loss: -0.0713 | lr: 9.9987e-05 | scale:     1.0000 | micro time: 0.445 | step time: 0.875
+train | epoch   0 | Iter:    238/ 29904 | global iter:    120/ 14952 | loss: -0.0651 | ds_loss: -0.0651 | lr: 9.9984e-05 | scale:     1.0000 | micro time: 0.450 | step time: 0.874
+train | epoch   0 | Iter:    258/ 29904 | global iter:    130/ 14952 | loss: -0.0634 | ds_loss: -0.0634 | lr: 9.9982e-05 | scale:     1.0000 | micro time: 0.448 | step time: 0.874
+train | epoch   0 | Iter:    278/ 29904 | global iter:    140/ 14952 | loss: -0.0743 | ds_loss: -0.0743 | lr: 9.9979e-05 | scale:     1.0000 | micro time: 0.451 | step time: 0.886
+train | epoch   0 | Iter:    298/ 29904 | global iter:    150/ 14952 | loss: -0.0756 | ds_loss: -0.0756 | lr: 9.9976e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.888
+train | epoch   0 | Iter:    318/ 29904 | global iter:    160/ 14952 | loss: -0.0628 | ds_loss: -0.0628 | lr: 9.9972e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.883
+train | epoch   0 | Iter:    338/ 29904 | global iter:    170/ 14952 | loss: -0.0577 | ds_loss: -0.0577 | lr: 9.9969e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.884
+train | epoch   0 | Iter:    358/ 29904 | global iter:    180/ 14952 | loss: -0.0550 | ds_loss: -0.0550 | lr: 9.9965e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.885
+train | epoch   0 | Iter:    378/ 29904 | global iter:    190/ 14952 | loss: -0.0704 | ds_loss: -0.0704 | lr: 9.9961e-05 | scale:     1.0000 | micro time: 0.466 | step time: 0.888
+train | epoch   0 | Iter:    398/ 29904 | global iter:    200/ 14952 | loss: -0.0640 | ds_loss: -0.0640 | lr: 9.9956e-05 | scale:     1.0000 | micro time: 0.458 | step time: 0.890
+train | epoch   0 | Iter:    418/ 29904 | global iter:    210/ 14952 | loss: -0.0508 | ds_loss: -0.0508 | lr: 9.9952e-05 | scale:     1.0000 | micro time: 0.458 | step time: 0.890
+train | epoch   0 | Iter:    438/ 29904 | global iter:    220/ 14952 | loss: -0.0819 | ds_loss: -0.0819 | lr: 9.9947e-05 | scale:     1.0000 | micro time: 0.460 | step time: 0.888
+train | epoch   0 | Iter:    458/ 29904 | global iter:    230/ 14952 | loss: -0.0545 | ds_loss: -0.0545 | lr: 9.9942e-05 | scale:     1.0000 | micro time: 0.457 | step time: 0.887
+train | epoch   0 | Iter:    478/ 29904 | global iter:    240/ 14952 | loss: -0.0610 | ds_loss: -0.0610 | lr: 9.9937e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.887
+train | epoch   0 | Iter:    498/ 29904 | global iter:    250/ 14952 | loss: -0.0512 | ds_loss: -0.0512 | lr: 9.9932e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.884
+train | epoch   0 | Iter:    518/ 29904 | global iter:    260/ 14952 | loss: -0.0642 | ds_loss: -0.0642 | lr: 9.9926e-05 | scale:     1.0000 | micro time: 0.458 | step time: 0.887
+train | epoch   0 | Iter:    538/ 29904 | global iter:    270/ 14952 | loss: -0.0743 | ds_loss: -0.0743 | lr: 9.9920e-05 | scale:     1.0000 | micro time: 0.463 | step time: 0.885
+train | epoch   0 | Iter:    558/ 29904 | global iter:    280/ 14952 | loss: -0.0724 | ds_loss: -0.0724 | lr: 9.9914e-05 | scale:     1.0000 | micro time: 0.458 | step time: 0.887
+train | epoch   0 | Iter:    578/ 29904 | global iter:    290/ 14952 | loss: -0.0440 | ds_loss: -0.0440 | lr: 9.9908e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.889
+train | epoch   0 | Iter:    598/ 29904 | global iter:    300/ 14952 | loss: -0.0607 | ds_loss: -0.0607 | lr: 9.9901e-05 | scale:     1.0000 | micro time: 0.462 | step time: 0.888
+train | epoch   0 | Iter:    618/ 29904 | global iter:    310/ 14952 | loss: -0.0571 | ds_loss: -0.0571 | lr: 9.9895e-05 | scale:     1.0000 | micro time: 0.458 | step time: 0.890
+train | epoch   0 | Iter:    638/ 29904 | global iter:    320/ 14952 | loss: -0.0569 | ds_loss: -0.0569 | lr: 9.9888e-05 | scale:     1.0000 | micro time: 0.457 | step time: 0.888
+train | epoch   0 | Iter:    658/ 29904 | global iter:    330/ 14952 | loss: -0.0508 | ds_loss: -0.0508 | lr: 9.9881e-05 | scale:     1.0000 | micro time: 0.457 | step time: 0.889
+train | epoch   0 | Iter:    678/ 29904 | global iter:    340/ 14952 | loss: -0.0679 | ds_loss: -0.0679 | lr: 9.9873e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.888
+train | epoch   0 | Iter:    698/ 29904 | global iter:    350/ 14952 | loss: -0.0481 | ds_loss: -0.0481 | lr: 9.9866e-05 | scale:     1.0000 | micro time: 0.460 | step time: 0.891
+train | epoch   0 | Iter:    718/ 29904 | global iter:    360/ 14952 | loss: -0.0652 | ds_loss: -0.0652 | lr: 9.9858e-05 | scale:     1.0000 | micro time: 0.459 | step time: 0.890
+train | epoch   0 | Iter:    738/ 29904 | global iter:    370/ 14952 | loss: -0.0470 | ds_loss: -0.0470 | lr: 9.9850e-05 | scale:     1.0000 | micro time: 0.462 | step time: 0.893
+train | epoch   0 | Iter:    758/ 29904 | global iter:    380/ 14952 | loss: -0.0438 | ds_loss: -0.0438 | lr: 9.9842e-05 | scale:     1.0000 | micro time: 0.457 | step time: 0.889
+train | epoch   0 | Iter:    778/ 29904 | global iter:    390/ 14952 | loss: -0.0725 | ds_loss: -0.0725 | lr: 9.9833e-05 | scale:     1.0000 | micro time: 0.458 | step time: 0.890
+train | epoch   0 | Iter:    798/ 29904 | global iter:    400/ 14952 | loss: -0.0466 | ds_loss: -0.0466 | lr: 9.9825e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.886
+train | epoch   0 | Iter:    818/ 29904 | global iter:    410/ 14952 | loss: -0.0601 | ds_loss: -0.0601 | lr: 9.9816e-05 | scale:     1.0000 | micro time: 0.459 | step time: 0.885
+train | epoch   0 | Iter:    838/ 29904 | global iter:    420/ 14952 | loss: -0.0512 | ds_loss: -0.0512 | lr: 9.9807e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.886
+train | epoch   0 | Iter:    858/ 29904 | global iter:    430/ 14952 | loss: -0.0566 | ds_loss: -0.0566 | lr: 9.9797e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.888
+train | epoch   0 | Iter:    878/ 29904 | global iter:    440/ 14952 | loss: -0.0621 | ds_loss: -0.0621 | lr: 9.9788e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.887
+train | epoch   0 | Iter:    898/ 29904 | global iter:    450/ 14952 | loss: -0.0574 | ds_loss: -0.0574 | lr: 9.9778e-05 | scale:     1.0000 | micro time: 0.460 | step time: 0.888
+train | epoch   0 | Iter:    918/ 29904 | global iter:    460/ 14952 | loss: -0.0430 | ds_loss: -0.0430 | lr: 9.9768e-05 | scale:     1.0000 | micro time: 0.460 | step time: 0.891
+train | epoch   0 | Iter:    938/ 29904 | global iter:    470/ 14952 | loss: -0.0646 | ds_loss: -0.0646 | lr: 9.9758e-05 | scale:     1.0000 | micro time: 0.462 | step time: 0.893
+train | epoch   0 | Iter:    958/ 29904 | global iter:    480/ 14952 | loss: -0.0583 | ds_loss: -0.0583 | lr: 9.9747e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.890
+train | epoch   0 | Iter:    978/ 29904 | global iter:    490/ 14952 | loss: -0.0492 | ds_loss: -0.0492 | lr: 9.9737e-05 | scale:     1.0000 | micro time: 0.457 | step time: 0.886
+train | epoch   0 | Iter:    998/ 29904 | global iter:    500/ 14952 | loss: -0.0456 | ds_loss: -0.0456 | lr: 9.9726e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.886
+train | epoch   0 | Iter:   1018/ 29904 | global iter:    510/ 14952 | loss: -0.0575 | ds_loss: -0.0575 | lr: 9.9715e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.883
+train | epoch   0 | Iter:   1038/ 29904 | global iter:    520/ 14952 | loss: -0.0596 | ds_loss: -0.0596 | lr: 9.9703e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.885
+train | epoch   0 | Iter:   1058/ 29904 | global iter:    530/ 14952 | loss: -0.0477 | ds_loss: -0.0477 | lr: 9.9692e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.888
+train | epoch   0 | Iter:   1078/ 29904 | global iter:    540/ 14952 | loss: -0.0459 | ds_loss: -0.0459 | lr: 9.9680e-05 | scale:     1.0000 | micro time: 0.476 | step time: 0.884
+train | epoch   0 | Iter:   1098/ 29904 | global iter:    550/ 14952 | loss: -0.0659 | ds_loss: -0.0659 | lr: 9.9668e-05 | scale:     1.0000 | micro time: 0.463 | step time: 0.889
+train | epoch   0 | Iter:   1118/ 29904 | global iter:    560/ 14952 | loss: -0.0538 | ds_loss: -0.0538 | lr: 9.9656e-05 | scale:     1.0000 | micro time: 0.464 | step time: 0.888
+train | epoch   0 | Iter:   1138/ 29904 | global iter:    570/ 14952 | loss: -0.0471 | ds_loss: -0.0471 | lr: 9.9643e-05 | scale:     1.0000 | micro time: 0.460 | step time: 0.890
+train | epoch   0 | Iter:   1158/ 29904 | global iter:    580/ 14952 | loss: -0.0619 | ds_loss: -0.0619 | lr: 9.9631e-05 | scale:     1.0000 | micro time: 0.462 | step time: 0.890
+train | epoch   0 | Iter:   1178/ 29904 | global iter:    590/ 14952 | loss: -0.0542 | ds_loss: -0.0542 | lr: 9.9618e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.889
+train | epoch   0 | Iter:   1198/ 29904 | global iter:    600/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.9605e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.886
+train | epoch   0 | Iter:   1218/ 29904 | global iter:    610/ 14952 | loss: -0.0538 | ds_loss: -0.0538 | lr: 9.9592e-05 | scale:     1.0000 | micro time: 0.462 | step time: 0.888
+train | epoch   0 | Iter:   1238/ 29904 | global iter:    620/ 14952 | loss: -0.0740 | ds_loss: -0.0740 | lr: 9.9578e-05 | scale:     1.0000 | micro time: 0.457 | step time: 0.886
+train | epoch   0 | Iter:   1258/ 29904 | global iter:    630/ 14952 | loss: -0.0396 | ds_loss: -0.0396 | lr: 9.9564e-05 | scale:     1.0000 | micro time: 0.458 | step time: 0.887
+train | epoch   0 | Iter:   1278/ 29904 | global iter:    640/ 14952 | loss: -0.0657 | ds_loss: -0.0657 | lr: 9.9550e-05 | scale:     1.0000 | micro time: 0.464 | step time: 0.888
+train | epoch   0 | Iter:   1298/ 29904 | global iter:    650/ 14952 | loss: -0.0560 | ds_loss: -0.0560 | lr: 9.9536e-05 | scale:     1.0000 | micro time: 0.468 | step time: 0.900
+train | epoch   0 | Iter:   1318/ 29904 | global iter:    660/ 14952 | loss: -0.0509 | ds_loss: -0.0509 | lr: 9.9522e-05 | scale:     1.0000 | micro time: 0.461 | step time: 0.891
+train | epoch   0 | Iter:   1338/ 29904 | global iter:    670/ 14952 | loss: -0.0476 | ds_loss: -0.0476 | lr: 9.9507e-05 | scale:     1.0000 | micro time: 0.459 | step time: 0.894
+train | epoch   0 | Iter:   1358/ 29904 | global iter:    680/ 14952 | loss: -0.0706 | ds_loss: -0.0706 | lr: 9.9493e-05 | scale:     1.0000 | micro time: 0.463 | step time: 0.895
+train | epoch   0 | Iter:   1378/ 29904 | global iter:    690/ 14952 | loss: -0.0615 | ds_loss: -0.0615 | lr: 9.9477e-05 | scale:     1.0000 | micro time: 0.460 | step time: 0.890
+train | epoch   0 | Iter:   1398/ 29904 | global iter:    700/ 14952 | loss: -0.0546 | ds_loss: -0.0546 | lr: 9.9462e-05 | scale:     1.0000 | micro time: 0.463 | step time: 0.892
+train | epoch   0 | Iter:   1418/ 29904 | global iter:    710/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.9447e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.884
+train | epoch   0 | Iter:   1438/ 29904 | global iter:    720/ 14952 | loss: -0.0502 | ds_loss: -0.0502 | lr: 9.9431e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.882
+train | epoch   0 | Iter:   1458/ 29904 | global iter:    730/ 14952 | loss: -0.0760 | ds_loss: -0.0760 | lr: 9.9415e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.884
+train | epoch   0 | Iter:   1478/ 29904 | global iter:    740/ 14952 | loss: -0.0612 | ds_loss: -0.0612 | lr: 9.9399e-05 | scale:     1.0000 | micro time: 0.458 | step time: 0.887
+train | epoch   0 | Iter:   1498/ 29904 | global iter:    750/ 14952 | loss: -0.0541 | ds_loss: -0.0541 | lr: 9.9383e-05 | scale:     1.0000 | micro time: 0.458 | step time: 0.892
+train | epoch   0 | Iter:   1518/ 29904 | global iter:    760/ 14952 | loss: -0.0560 | ds_loss: -0.0560 | lr: 9.9366e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.886
+train | epoch   0 | Iter:   1538/ 29904 | global iter:    770/ 14952 | loss: -0.0575 | ds_loss: -0.0575 | lr: 9.9349e-05 | scale:     1.0000 | micro time: 0.466 | step time: 0.893
+train | epoch   0 | Iter:   1558/ 29904 | global iter:    780/ 14952 | loss: -0.0710 | ds_loss: -0.0710 | lr: 9.9332e-05 | scale:     1.0000 | micro time: 0.465 | step time: 0.895
+train | epoch   0 | Iter:   1578/ 29904 | global iter:    790/ 14952 | loss: -0.0541 | ds_loss: -0.0541 | lr: 9.9315e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.892
+train | epoch   0 | Iter:   1598/ 29904 | global iter:    800/ 14952 | loss: -0.0679 | ds_loss: -0.0679 | lr: 9.9298e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.884
+train | epoch   0 | Iter:   1618/ 29904 | global iter:    810/ 14952 | loss: -0.0500 | ds_loss: -0.0500 | lr: 9.9280e-05 | scale:     1.0000 | micro time: 0.459 | step time: 0.889
+train | epoch   0 | Iter:   1638/ 29904 | global iter:    820/ 14952 | loss: -0.0550 | ds_loss: -0.0550 | lr: 9.9262e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.924
+train | epoch   0 | Iter:   1658/ 29904 | global iter:    830/ 14952 | loss: -0.0624 | ds_loss: -0.0624 | lr: 9.9244e-05 | scale:     1.0000 | micro time: 0.461 | step time: 0.927
+train | epoch   0 | Iter:   1678/ 29904 | global iter:    840/ 14952 | loss: -0.0722 | ds_loss: -0.0722 | lr: 9.9226e-05 | scale:     1.0000 | micro time: 0.457 | step time: 0.885
+train | epoch   0 | Iter:   1698/ 29904 | global iter:    850/ 14952 | loss: -0.0666 | ds_loss: -0.0666 | lr: 9.9207e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.882
+train | epoch   0 | Iter:   1718/ 29904 | global iter:    860/ 14952 | loss: -0.0451 | ds_loss: -0.0451 | lr: 9.9189e-05 | scale:     1.0000 | micro time: 0.469 | step time: 0.889
+train | epoch   0 | Iter:   1738/ 29904 | global iter:    870/ 14952 | loss: -0.0571 | ds_loss: -0.0571 | lr: 9.9170e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.888
+train | epoch   0 | Iter:   1758/ 29904 | global iter:    880/ 14952 | loss: -0.0484 | ds_loss: -0.0484 | lr: 9.9151e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.882
+train | epoch   0 | Iter:   1778/ 29904 | global iter:    890/ 14952 | loss: -0.0524 | ds_loss: -0.0524 | lr: 9.9131e-05 | scale:     1.0000 | micro time: 0.451 | step time: 0.880
+train | epoch   0 | Iter:   1798/ 29904 | global iter:    900/ 14952 | loss: -0.0598 | ds_loss: -0.0598 | lr: 9.9112e-05 | scale:     1.0000 | micro time: 0.451 | step time: 0.880
+train | epoch   0 | Iter:   1818/ 29904 | global iter:    910/ 14952 | loss: -0.0606 | ds_loss: -0.0606 | lr: 9.9092e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.880
+train | epoch   0 | Iter:   1838/ 29904 | global iter:    920/ 14952 | loss: -0.0505 | ds_loss: -0.0505 | lr: 9.9072e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.882
+train | epoch   0 | Iter:   1858/ 29904 | global iter:    930/ 14952 | loss: -0.0525 | ds_loss: -0.0525 | lr: 9.9051e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.883
+train | epoch   0 | Iter:   1878/ 29904 | global iter:    940/ 14952 | loss: -0.0719 | ds_loss: -0.0719 | lr: 9.9031e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.885
+train | epoch   0 | Iter:   1898/ 29904 | global iter:    950/ 14952 | loss: -0.0622 | ds_loss: -0.0622 | lr: 9.9010e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.890
+train | epoch   0 | Iter:   1918/ 29904 | global iter:    960/ 14952 | loss: -0.0499 | ds_loss: -0.0499 | lr: 9.8989e-05 | scale:     1.0000 | micro time: 0.457 | step time: 0.891
+train | epoch   0 | Iter:   1938/ 29904 | global iter:    970/ 14952 | loss: -0.0839 | ds_loss: -0.0839 | lr: 9.8968e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.885
+train | epoch   0 | Iter:   1958/ 29904 | global iter:    980/ 14952 | loss: -0.0293 | ds_loss: -0.0293 | lr: 9.8947e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.886
+train | epoch   0 | Iter:   1978/ 29904 | global iter:    990/ 14952 | loss: -0.0557 | ds_loss: -0.0557 | lr: 9.8925e-05 | scale:     1.0000 | micro time: 0.451 | step time: 0.885
+train | epoch   0 | Iter:   1998/ 29904 | global iter:   1000/ 14952 | loss: -0.0442 | ds_loss: -0.0442 | lr: 9.8904e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.883
+train | epoch   0 | Iter:   2018/ 29904 | global iter:   1010/ 14952 | loss: -0.0851 | ds_loss: -0.0851 | lr: 9.8882e-05 | scale:     1.0000 | micro time: 0.459 | step time: 0.883
+train | epoch   0 | Iter:   2038/ 29904 | global iter:   1020/ 14952 | loss: -0.0395 | ds_loss: -0.0395 | lr: 9.8859e-05 | scale:     1.0000 | micro time: 0.450 | step time: 0.884
+train | epoch   0 | Iter:   2058/ 29904 | global iter:   1030/ 14952 | loss: -0.0579 | ds_loss: -0.0579 | lr: 9.8837e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.879
+train | epoch   0 | Iter:   2078/ 29904 | global iter:   1040/ 14952 | loss: -0.0589 | ds_loss: -0.0589 | lr: 9.8814e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.881
+train | epoch   0 | Iter:   2098/ 29904 | global iter:   1050/ 14952 | loss: -0.0519 | ds_loss: -0.0519 | lr: 9.8792e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.885
+train | epoch   0 | Iter:   2118/ 29904 | global iter:   1060/ 14952 | loss: -0.0534 | ds_loss: -0.0534 | lr: 9.8769e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.880
+train | epoch   0 | Iter:   2138/ 29904 | global iter:   1070/ 14952 | loss: -0.0557 | ds_loss: -0.0557 | lr: 9.8745e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.882
+train | epoch   0 | Iter:   2158/ 29904 | global iter:   1080/ 14952 | loss: -0.0527 | ds_loss: -0.0527 | lr: 9.8722e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.883
+train | epoch   0 | Iter:   2178/ 29904 | global iter:   1090/ 14952 | loss: -0.0354 | ds_loss: -0.0354 | lr: 9.8698e-05 | scale:     1.0000 | micro time: 0.451 | step time: 0.880
+train | epoch   0 | Iter:   2198/ 29904 | global iter:   1100/ 14952 | loss: -0.0598 | ds_loss: -0.0598 | lr: 9.8674e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.881
+train | epoch   0 | Iter:   2218/ 29904 | global iter:   1110/ 14952 | loss: -0.0343 | ds_loss: -0.0343 | lr: 9.8650e-05 | scale:     1.0000 | micro time: 0.450 | step time: 0.880
+train | epoch   0 | Iter:   2238/ 29904 | global iter:   1120/ 14952 | loss: -0.0587 | ds_loss: -0.0587 | lr: 9.8626e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.883
+train | epoch   0 | Iter:   2258/ 29904 | global iter:   1130/ 14952 | loss: -0.0361 | ds_loss: -0.0361 | lr: 9.8601e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.882
+train | epoch   0 | Iter:   2278/ 29904 | global iter:   1140/ 14952 | loss: -0.0561 | ds_loss: -0.0561 | lr: 9.8576e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.884
+train | epoch   0 | Iter:   2298/ 29904 | global iter:   1150/ 14952 | loss: -0.0560 | ds_loss: -0.0560 | lr: 9.8551e-05 | scale:     1.0000 | micro time: 0.450 | step time: 0.885
+train | epoch   0 | Iter:   2318/ 29904 | global iter:   1160/ 14952 | loss: -0.0379 | ds_loss: -0.0379 | lr: 9.8526e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.888
+train | epoch   0 | Iter:   2338/ 29904 | global iter:   1170/ 14952 | loss: -0.0728 | ds_loss: -0.0728 | lr: 9.8501e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.888
+train | epoch   0 | Iter:   2358/ 29904 | global iter:   1180/ 14952 | loss: -0.0491 | ds_loss: -0.0491 | lr: 9.8475e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.894
+train | epoch   0 | Iter:   2378/ 29904 | global iter:   1190/ 14952 | loss: -0.0578 | ds_loss: -0.0578 | lr: 9.8449e-05 | scale:     1.0000 | micro time: 0.450 | step time: 0.883
+train | epoch   0 | Iter:   2398/ 29904 | global iter:   1200/ 14952 | loss: -0.0521 | ds_loss: -0.0521 | lr: 9.8423e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.882
+train | epoch   0 | Iter:   2418/ 29904 | global iter:   1210/ 14952 | loss: -0.0544 | ds_loss: -0.0544 | lr: 9.8397e-05 | scale:     1.0000 | micro time: 0.450 | step time: 0.881
+train | epoch   0 | Iter:   2438/ 29904 | global iter:   1220/ 14952 | loss: -0.0480 | ds_loss: -0.0480 | lr: 9.8371e-05 | scale:     1.0000 | micro time: 0.450 | step time: 0.880
+train | epoch   0 | Iter:   2458/ 29904 | global iter:   1230/ 14952 | loss: -0.0493 | ds_loss: -0.0493 | lr: 9.8344e-05 | scale:     1.0000 | micro time: 0.459 | step time: 0.883
+train | epoch   0 | Iter:   2478/ 29904 | global iter:   1240/ 14952 | loss: -0.0551 | ds_loss: -0.0551 | lr: 9.8317e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.887
+train | epoch   0 | Iter:   2498/ 29904 | global iter:   1250/ 14952 | loss: -0.0418 | ds_loss: -0.0418 | lr: 9.8290e-05 | scale:     1.0000 | micro time: 0.448 | step time: 0.883
+train | epoch   0 | Iter:   2518/ 29904 | global iter:   1260/ 14952 | loss: -0.0651 | ds_loss: -0.0651 | lr: 9.8263e-05 | scale:     1.0000 | micro time: 0.450 | step time: 0.886
+train | epoch   0 | Iter:   2538/ 29904 | global iter:   1270/ 14952 | loss: -0.0486 | ds_loss: -0.0486 | lr: 9.8235e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.885
+train | epoch   0 | Iter:   2558/ 29904 | global iter:   1280/ 14952 | loss: -0.0420 | ds_loss: -0.0420 | lr: 9.8207e-05 | scale:     1.0000 | micro time: 0.457 | step time: 0.885
+train | epoch   0 | Iter:   2578/ 29904 | global iter:   1290/ 14952 | loss: -0.0428 | ds_loss: -0.0428 | lr: 9.8179e-05 | scale:     1.0000 | micro time: 0.457 | step time: 0.883
+train | epoch   0 | Iter:   2598/ 29904 | global iter:   1300/ 14952 | loss: -0.0634 | ds_loss: -0.0634 | lr: 9.8151e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.885
+train | epoch   0 | Iter:   2618/ 29904 | global iter:   1310/ 14952 | loss: -0.0669 | ds_loss: -0.0669 | lr: 9.8123e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.884
+train | epoch   0 | Iter:   2638/ 29904 | global iter:   1320/ 14952 | loss: -0.0418 | ds_loss: -0.0418 | lr: 9.8094e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.914
+train | epoch   0 | Iter:   2658/ 29904 | global iter:   1330/ 14952 | loss: -0.0651 | ds_loss: -0.0651 | lr: 9.8065e-05 | scale:     1.0000 | micro time: 0.451 | step time: 0.880
+train | epoch   0 | Iter:   2678/ 29904 | global iter:   1340/ 14952 | loss: -0.0729 | ds_loss: -0.0729 | lr: 9.8036e-05 | scale:     1.0000 | micro time: 0.451 | step time: 0.881
+train | epoch   0 | Iter:   2698/ 29904 | global iter:   1350/ 14952 | loss: -0.0489 | ds_loss: -0.0489 | lr: 9.8007e-05 | scale:     1.0000 | micro time: 0.457 | step time: 0.883
+train | epoch   0 | Iter:   2718/ 29904 | global iter:   1360/ 14952 | loss: -0.0655 | ds_loss: -0.0655 | lr: 9.7977e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.884
+train | epoch   0 | Iter:   2738/ 29904 | global iter:   1370/ 14952 | loss: -0.0538 | ds_loss: -0.0538 | lr: 9.7948e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.882
+train | epoch   0 | Iter:   2758/ 29904 | global iter:   1380/ 14952 | loss: -0.0538 | ds_loss: -0.0538 | lr: 9.7918e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.883
+train | epoch   0 | Iter:   2778/ 29904 | global iter:   1390/ 14952 | loss: -0.0534 | ds_loss: -0.0534 | lr: 9.7888e-05 | scale:     1.0000 | micro time: 0.447 | step time: 0.880
+train | epoch   0 | Iter:   2798/ 29904 | global iter:   1400/ 14952 | loss: -0.0485 | ds_loss: -0.0485 | lr: 9.7858e-05 | scale:     1.0000 | micro time: 0.448 | step time: 0.874
+train | epoch   0 | Iter:   2818/ 29904 | global iter:   1410/ 14952 | loss: -0.0466 | ds_loss: -0.0466 | lr: 9.7827e-05 | scale:     1.0000 | micro time: 0.463 | step time: 0.876
+train | epoch   0 | Iter:   2838/ 29904 | global iter:   1420/ 14952 | loss: -0.0525 | ds_loss: -0.0525 | lr: 9.7796e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.883
+train | epoch   0 | Iter:   2858/ 29904 | global iter:   1430/ 14952 | loss: -0.0612 | ds_loss: -0.0612 | lr: 9.7765e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.884
+train | epoch   0 | Iter:   2878/ 29904 | global iter:   1440/ 14952 | loss: -0.0584 | ds_loss: -0.0584 | lr: 9.7734e-05 | scale:     1.0000 | micro time: 0.464 | step time: 0.892
+train | epoch   0 | Iter:   2898/ 29904 | global iter:   1450/ 14952 | loss: -0.0365 | ds_loss: -0.0365 | lr: 9.7703e-05 | scale:     1.0000 | micro time: 0.457 | step time: 0.891
+train | epoch   0 | Iter:   2918/ 29904 | global iter:   1460/ 14952 | loss: -0.0554 | ds_loss: -0.0554 | lr: 9.7671e-05 | scale:     1.0000 | micro time: 0.463 | step time: 0.892
+train | epoch   0 | Iter:   2938/ 29904 | global iter:   1470/ 14952 | loss: -0.0541 | ds_loss: -0.0541 | lr: 9.7640e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.891
+train | epoch   0 | Iter:   2958/ 29904 | global iter:   1480/ 14952 | loss: -0.0660 | ds_loss: -0.0660 | lr: 9.7608e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.887
+train | epoch   0 | Iter:   2978/ 29904 | global iter:   1490/ 14952 | loss: -0.0480 | ds_loss: -0.0480 | lr: 9.7575e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.889
+train | epoch   0 | Iter:   2998/ 29904 | global iter:   1500/ 14952 | loss: -0.0483 | ds_loss: -0.0483 | lr: 9.7543e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.888
+train | epoch   0 | Iter:   3018/ 29904 | global iter:   1510/ 14952 | loss: -0.0529 | ds_loss: -0.0529 | lr: 9.7510e-05 | scale:     1.0000 | micro time: 0.458 | step time: 0.886
+train | epoch   0 | Iter:   3038/ 29904 | global iter:   1520/ 14952 | loss: -0.0442 | ds_loss: -0.0442 | lr: 9.7477e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.890
+train | epoch   0 | Iter:   3058/ 29904 | global iter:   1530/ 14952 | loss: -0.0594 | ds_loss: -0.0594 | lr: 9.7444e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.883
+train | epoch   0 | Iter:   3078/ 29904 | global iter:   1540/ 14952 | loss: -0.0543 | ds_loss: -0.0543 | lr: 9.7411e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.883
+train | epoch   0 | Iter:   3098/ 29904 | global iter:   1550/ 14952 | loss: -0.0598 | ds_loss: -0.0598 | lr: 9.7378e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.882
+train | epoch   0 | Iter:   3118/ 29904 | global iter:   1560/ 14952 | loss: -0.0378 | ds_loss: -0.0378 | lr: 9.7344e-05 | scale:     1.0000 | micro time: 0.466 | step time: 0.888
+train | epoch   0 | Iter:   3138/ 29904 | global iter:   1570/ 14952 | loss: -0.0614 | ds_loss: -0.0614 | lr: 9.7310e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.886
+train | epoch   0 | Iter:   3158/ 29904 | global iter:   1580/ 14952 | loss: -0.0720 | ds_loss: -0.0720 | lr: 9.7276e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.879
+train | epoch   0 | Iter:   3178/ 29904 | global iter:   1590/ 14952 | loss: -0.0702 | ds_loss: -0.0702 | lr: 9.7242e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.883
+train | epoch   0 | Iter:   3198/ 29904 | global iter:   1600/ 14952 | loss: -0.0505 | ds_loss: -0.0505 | lr: 9.7207e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.885
+train | epoch   0 | Iter:   3218/ 29904 | global iter:   1610/ 14952 | loss: -0.0585 | ds_loss: -0.0585 | lr: 9.7173e-05 | scale:     1.0000 | micro time: 0.445 | step time: 0.886
+train | epoch   0 | Iter:   3238/ 29904 | global iter:   1620/ 14952 | loss: -0.0669 | ds_loss: -0.0669 | lr: 9.7138e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.876
+train | epoch   0 | Iter:   3258/ 29904 | global iter:   1630/ 14952 | loss: -0.0503 | ds_loss: -0.0503 | lr: 9.7103e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.878
+train | epoch   0 | Iter:   3278/ 29904 | global iter:   1640/ 14952 | loss: -0.0632 | ds_loss: -0.0632 | lr: 9.7067e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.884
+train | epoch   0 | Iter:   3298/ 29904 | global iter:   1650/ 14952 | loss: -0.0476 | ds_loss: -0.0476 | lr: 9.7032e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.878
+train | epoch   0 | Iter:   3318/ 29904 | global iter:   1660/ 14952 | loss: -0.0382 | ds_loss: -0.0382 | lr: 9.6996e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.881
+train | epoch   0 | Iter:   3338/ 29904 | global iter:   1670/ 14952 | loss: -0.0440 | ds_loss: -0.0440 | lr: 9.6960e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.878
+train | epoch   0 | Iter:   3358/ 29904 | global iter:   1680/ 14952 | loss: -0.0594 | ds_loss: -0.0594 | lr: 9.6924e-05 | scale:     1.0000 | micro time: 0.451 | step time: 0.881
+train | epoch   0 | Iter:   3378/ 29904 | global iter:   1690/ 14952 | loss: -0.0522 | ds_loss: -0.0522 | lr: 9.6888e-05 | scale:     1.0000 | micro time: 0.447 | step time: 0.878
+train | epoch   0 | Iter:   3398/ 29904 | global iter:   1700/ 14952 | loss: -0.0580 | ds_loss: -0.0580 | lr: 9.6851e-05 | scale:     1.0000 | micro time: 0.451 | step time: 0.876
+train | epoch   0 | Iter:   3418/ 29904 | global iter:   1710/ 14952 | loss: -0.0551 | ds_loss: -0.0551 | lr: 9.6814e-05 | scale:     1.0000 | micro time: 0.447 | step time: 0.878
+train | epoch   0 | Iter:   3438/ 29904 | global iter:   1720/ 14952 | loss: -0.0392 | ds_loss: -0.0392 | lr: 9.6777e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.878
+train | epoch   0 | Iter:   3458/ 29904 | global iter:   1730/ 14952 | loss: -0.0474 | ds_loss: -0.0474 | lr: 9.6740e-05 | scale:     1.0000 | micro time: 0.447 | step time: 0.879
+train | epoch   0 | Iter:   3478/ 29904 | global iter:   1740/ 14952 | loss: -0.0549 | ds_loss: -0.0549 | lr: 9.6703e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.879
+train | epoch   0 | Iter:   3498/ 29904 | global iter:   1750/ 14952 | loss: -0.0474 | ds_loss: -0.0474 | lr: 9.6665e-05 | scale:     1.0000 | micro time: 0.460 | step time: 0.879
+train | epoch   0 | Iter:   3518/ 29904 | global iter:   1760/ 14952 | loss: -0.0597 | ds_loss: -0.0597 | lr: 9.6627e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.886
+train | epoch   0 | Iter:   3538/ 29904 | global iter:   1770/ 14952 | loss: -0.0563 | ds_loss: -0.0563 | lr: 9.6589e-05 | scale:     1.0000 | micro time: 0.450 | step time: 0.879
+train | epoch   0 | Iter:   3558/ 29904 | global iter:   1780/ 14952 | loss: -0.0420 | ds_loss: -0.0420 | lr: 9.6551e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.879
+train | epoch   0 | Iter:   3578/ 29904 | global iter:   1790/ 14952 | loss: -0.0351 | ds_loss: -0.0351 | lr: 9.6513e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.883
+train | epoch   0 | Iter:   3598/ 29904 | global iter:   1800/ 14952 | loss: -0.0437 | ds_loss: -0.0437 | lr: 9.6474e-05 | scale:     1.0000 | micro time: 0.445 | step time: 0.876
+train | epoch   0 | Iter:   3618/ 29904 | global iter:   1810/ 14952 | loss: -0.0732 | ds_loss: -0.0732 | lr: 9.6435e-05 | scale:     1.0000 | micro time: 0.445 | step time: 0.874
+train | epoch   0 | Iter:   3638/ 29904 | global iter:   1820/ 14952 | loss: -0.0420 | ds_loss: -0.0420 | lr: 9.6396e-05 | scale:     1.0000 | micro time: 0.447 | step time: 0.907
+train | epoch   0 | Iter:   3658/ 29904 | global iter:   1830/ 14952 | loss: -0.0709 | ds_loss: -0.0709 | lr: 9.6357e-05 | scale:     1.0000 | micro time: 0.461 | step time: 0.882
+train | epoch   0 | Iter:   3678/ 29904 | global iter:   1840/ 14952 | loss: -0.0506 | ds_loss: -0.0506 | lr: 9.6317e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.882
+train | epoch   0 | Iter:   3698/ 29904 | global iter:   1850/ 14952 | loss: -0.0676 | ds_loss: -0.0676 | lr: 9.6278e-05 | scale:     1.0000 | micro time: 0.451 | step time: 0.880
+train | epoch   0 | Iter:   3718/ 29904 | global iter:   1860/ 14952 | loss: -0.0348 | ds_loss: -0.0348 | lr: 9.6238e-05 | scale:     1.0000 | micro time: 0.450 | step time: 0.881
+train | epoch   0 | Iter:   3738/ 29904 | global iter:   1870/ 14952 | loss: -0.0747 | ds_loss: -0.0747 | lr: 9.6198e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.882
+train | epoch   0 | Iter:   3758/ 29904 | global iter:   1880/ 14952 | loss: -0.0535 | ds_loss: -0.0535 | lr: 9.6158e-05 | scale:     1.0000 | micro time: 0.450 | step time: 0.884
+train | epoch   0 | Iter:   3778/ 29904 | global iter:   1890/ 14952 | loss: -0.0345 | ds_loss: -0.0345 | lr: 9.6117e-05 | scale:     1.0000 | micro time: 0.448 | step time: 0.881
+train | epoch   0 | Iter:   3798/ 29904 | global iter:   1900/ 14952 | loss: -0.0558 | ds_loss: -0.0558 | lr: 9.6076e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.882
+train | epoch   0 | Iter:   3818/ 29904 | global iter:   1910/ 14952 | loss: -0.0464 | ds_loss: -0.0464 | lr: 9.6036e-05 | scale:     1.0000 | micro time: 0.457 | step time: 0.886
+train | epoch   0 | Iter:   3838/ 29904 | global iter:   1920/ 14952 | loss: -0.0467 | ds_loss: -0.0467 | lr: 9.5994e-05 | scale:     1.0000 | micro time: 0.448 | step time: 0.885
+train | epoch   0 | Iter:   3858/ 29904 | global iter:   1930/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.5953e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.886
+train | epoch   0 | Iter:   3878/ 29904 | global iter:   1940/ 14952 | loss: -0.0551 | ds_loss: -0.0551 | lr: 9.5912e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.882
+train | epoch   0 | Iter:   3898/ 29904 | global iter:   1950/ 14952 | loss: -0.0415 | ds_loss: -0.0415 | lr: 9.5870e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.882
+train | epoch   0 | Iter:   3918/ 29904 | global iter:   1960/ 14952 | loss: -0.0493 | ds_loss: -0.0493 | lr: 9.5828e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.881
+train | epoch   0 | Iter:   3938/ 29904 | global iter:   1970/ 14952 | loss: -0.0525 | ds_loss: -0.0525 | lr: 9.5786e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.879
+train | epoch   0 | Iter:   3958/ 29904 | global iter:   1980/ 14952 | loss: -0.0581 | ds_loss: -0.0581 | lr: 9.5744e-05 | scale:     1.0000 | micro time: 0.448 | step time: 0.882
+train | epoch   0 | Iter:   3978/ 29904 | global iter:   1990/ 14952 | loss: -0.0521 | ds_loss: -0.0521 | lr: 9.5701e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.878
+train | epoch   0 | Iter:   3998/ 29904 | global iter:   2000/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.5659e-05 | scale:     1.0000 | micro time: 0.447 | step time: 0.879
+train | epoch   0 | Iter:   4018/ 29904 | global iter:   2010/ 14952 | loss: -0.0667 | ds_loss: -0.0667 | lr: 9.5616e-05 | scale:     1.0000 | micro time: 0.451 | step time: 0.878
+train | epoch   0 | Iter:   4038/ 29904 | global iter:   2020/ 14952 | loss: -0.0542 | ds_loss: -0.0542 | lr: 9.5573e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.877
+train | epoch   0 | Iter:   4058/ 29904 | global iter:   2030/ 14952 | loss: -0.0474 | ds_loss: -0.0474 | lr: 9.5529e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.879
+train | epoch   0 | Iter:   4078/ 29904 | global iter:   2040/ 14952 | loss: -0.0355 | ds_loss: -0.0355 | lr: 9.5486e-05 | scale:     1.0000 | micro time: 0.446 | step time: 0.879
+train | epoch   0 | Iter:   4098/ 29904 | global iter:   2050/ 14952 | loss: -0.0495 | ds_loss: -0.0495 | lr: 9.5442e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.877
+train | epoch   0 | Iter:   4118/ 29904 | global iter:   2060/ 14952 | loss: -0.0506 | ds_loss: -0.0506 | lr: 9.5398e-05 | scale:     1.0000 | micro time: 0.472 | step time: 0.878
+train | epoch   0 | Iter:   4138/ 29904 | global iter:   2070/ 14952 | loss: -0.0441 | ds_loss: -0.0441 | lr: 9.5354e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.889
+train | epoch   0 | Iter:   4158/ 29904 | global iter:   2080/ 14952 | loss: -0.0656 | ds_loss: -0.0656 | lr: 9.5310e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.884
+train | epoch   0 | Iter:   4178/ 29904 | global iter:   2090/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.5265e-05 | scale:     1.0000 | micro time: 0.464 | step time: 0.880
+train | epoch   0 | Iter:   4198/ 29904 | global iter:   2100/ 14952 | loss: -0.0569 | ds_loss: -0.0569 | lr: 9.5221e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.884
+train | epoch   0 | Iter:   4218/ 29904 | global iter:   2110/ 14952 | loss: -0.0548 | ds_loss: -0.0548 | lr: 9.5176e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.880
+train | epoch   0 | Iter:   4238/ 29904 | global iter:   2120/ 14952 | loss: -0.0414 | ds_loss: -0.0414 | lr: 9.5131e-05 | scale:     1.0000 | micro time: 0.448 | step time: 0.883
+train | epoch   0 | Iter:   4258/ 29904 | global iter:   2130/ 14952 | loss: -0.0467 | ds_loss: -0.0467 | lr: 9.5085e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.880
+train | epoch   0 | Iter:   4278/ 29904 | global iter:   2140/ 14952 | loss: -0.0833 | ds_loss: -0.0833 | lr: 9.5040e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.879
+train | epoch   0 | Iter:   4298/ 29904 | global iter:   2150/ 14952 | loss: -0.0411 | ds_loss: -0.0411 | lr: 9.4994e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.882
+train | epoch   0 | Iter:   4318/ 29904 | global iter:   2160/ 14952 | loss: -0.0652 | ds_loss: -0.0652 | lr: 9.4948e-05 | scale:     1.0000 | micro time: 0.449 | step time: 0.879
+train | epoch   0 | Iter:   4338/ 29904 | global iter:   2170/ 14952 | loss: -0.0644 | ds_loss: -0.0644 | lr: 9.4902e-05 | scale:     1.0000 | micro time: 0.452 | step time: 0.884
+train | epoch   0 | Iter:   4358/ 29904 | global iter:   2180/ 14952 | loss: -0.0412 | ds_loss: -0.0412 | lr: 9.4856e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.882
+train | epoch   0 | Iter:   4378/ 29904 | global iter:   2190/ 14952 | loss: -0.0545 | ds_loss: -0.0545 | lr: 9.4809e-05 | scale:     1.0000 | micro time: 0.451 | step time: 0.884
+train | epoch   0 | Iter:   4398/ 29904 | global iter:   2200/ 14952 | loss: -0.0479 | ds_loss: -0.0479 | lr: 9.4763e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.886
+train | epoch   0 | Iter:   4418/ 29904 | global iter:   2210/ 14952 | loss: -0.0536 | ds_loss: -0.0536 | lr: 9.4716e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.884
+train | epoch   0 | Iter:   4438/ 29904 | global iter:   2220/ 14952 | loss: -0.0380 | ds_loss: -0.0380 | lr: 9.4669e-05 | scale:     1.0000 | micro time: 0.450 | step time: 0.884
+train | epoch   0 | Iter:   4458/ 29904 | global iter:   2230/ 14952 | loss: -0.0560 | ds_loss: -0.0560 | lr: 9.4621e-05 | scale:     1.0000 | micro time: 0.457 | step time: 0.883
+train | epoch   0 | Iter:   4478/ 29904 | global iter:   2240/ 14952 | loss: -0.0344 | ds_loss: -0.0344 | lr: 9.4574e-05 | scale:     1.0000 | micro time: 0.453 | step time: 0.881
+train | epoch   0 | Iter:   4498/ 29904 | global iter:   2250/ 14952 | loss: -0.0490 | ds_loss: -0.0490 | lr: 9.4526e-05 | scale:     1.0000 | micro time: 0.456 | step time: 0.883
+train | epoch   0 | Iter:   4518/ 29904 | global iter:   2260/ 14952 | loss: -0.0518 | ds_loss: -0.0518 | lr: 9.4478e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.884
+train | epoch   0 | Iter:   4538/ 29904 | global iter:   2270/ 14952 | loss: -0.0617 | ds_loss: -0.0617 | lr: 9.4430e-05 | scale:     1.0000 | micro time: 0.454 | step time: 0.884
+train | epoch   0 | Iter:   4558/ 29904 | global iter:   2280/ 14952 | loss: -0.0320 | ds_loss: -0.0320 | lr: 9.4382e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.884
+train | epoch   0 | Iter:   4578/ 29904 | global iter:   2290/ 14952 | loss: -0.0521 | ds_loss: -0.0521 | lr: 9.4334e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.883
+train | epoch   0 | Iter:   4598/ 29904 | global iter:   2300/ 14952 | loss: -0.0467 | ds_loss: -0.0467 | lr: 9.4285e-05 | scale:     1.0000 | micro time: 0.455 | step time: 0.883

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-1.5B-Instruct
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "down_proj",
+    "up_proj",
+    "v_proj",
+    "gate_proj",
+    "o_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:25ffab0951358a2a0ade91fcbb1c4d8f212b82bb0ba74234477d26554ea34c3e
+size 504133205

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|im_end|>"
+}

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|im_end|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/2492/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-1.5B-Instruct
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "down_proj",
+    "up_proj",
+    "v_proj",
+    "gate_proj",
+    "o_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f403f53d9b3384b42396cdef5cb104c4bcabf4efed72c8a737769d426f7f17ac
+size 504133205

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/added_tokens.json ADDED Viewed

	@@ -0,0 +1,24 @@

+{
+  "</tool_call>": 151658,
+  "<tool_call>": 151657,
+  "<|box_end|>": 151649,
+  "<|box_start|>": 151648,
+  "<|endoftext|>": 151643,
+  "<|file_sep|>": 151664,
+  "<|fim_middle|>": 151660,
+  "<|fim_pad|>": 151662,
+  "<|fim_prefix|>": 151659,
+  "<|fim_suffix|>": 151661,
+  "<|im_end|>": 151645,
+  "<|im_start|>": 151644,
+  "<|image_pad|>": 151655,
+  "<|object_ref_end|>": 151647,
+  "<|object_ref_start|>": 151646,
+  "<|quad_end|>": 151651,
+  "<|quad_start|>": 151650,
+  "<|repo_name|>": 151663,
+  "<|video_pad|>": 151656,
+  "<|vision_end|>": 151653,
+  "<|vision_pad|>": 151654,
+  "<|vision_start|>": 151652
+}

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,54 @@

+{%- if tools %}
+    {{- '<|im_start|>system\n' }}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- messages[0]['content'] }}
+    {%- else %}
+        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
+    {%- endif %}
+    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
+    {%- for tool in tools %}
+        {{- "\n" }}
+        {{- tool | tojson }}
+    {%- endfor %}
+    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
+{%- else %}
+    {%- if messages[0]['role'] == 'system' %}
+        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
+    {%- else %}
+        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
+    {%- endif %}
+{%- endif %}
+{%- for message in messages %}
+    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
+        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
+    {%- elif message.role == "assistant" %}
+        {{- '<|im_start|>' + message.role }}
+        {%- if message.content %}
+            {{- '\n' + message.content }}
+        {%- endif %}
+        {%- for tool_call in message.tool_calls %}
+            {%- if tool_call.function is defined %}
+                {%- set tool_call = tool_call.function %}
+            {%- endif %}
+            {{- '\n<tool_call>\n{"name": "' }}
+            {{- tool_call.name }}
+            {{- '", "arguments": ' }}
+            {{- tool_call.arguments | tojson }}
+            {{- '}\n</tool_call>' }}
+        {%- endfor %}
+        {{- '<|im_end|>\n' }}
+    {%- elif message.role == "tool" %}
+        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
+            {{- '<|im_start|>user' }}
+        {%- endif %}
+        {{- '\n<tool_response>\n' }}
+        {{- message.content }}
+        {{- '\n</tool_response>' }}
+        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
+            {{- '<|im_end|>\n' }}
+        {%- endif %}
+    {%- endif %}
+{%- endfor %}
+{%- if add_generation_prompt %}
+    {{- '<|im_start|>assistant\n' }}
+{%- endif %}

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/merges.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,25 @@

+{
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "eos_token": {
+    "content": "<|im_end|>",
+    "lstrip": false,
+    "normalized": false,
+    "rstrip": false,
+    "single_word": false
+  },
+  "pad_token": "<|im_end|>"
+}

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9c5ae00e602b8860cbd784ba82a8aa14e8feecec692e7076590d014d7b7fdafa
+size 11421896

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,207 @@

+{
+  "add_bos_token": false,
+  "add_prefix_space": false,
+  "added_tokens_decoder": {
+    "151643": {
+      "content": "<|endoftext|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151644": {
+      "content": "<|im_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151645": {
+      "content": "<|im_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151646": {
+      "content": "<|object_ref_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151647": {
+      "content": "<|object_ref_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151648": {
+      "content": "<|box_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151649": {
+      "content": "<|box_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151650": {
+      "content": "<|quad_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151651": {
+      "content": "<|quad_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151652": {
+      "content": "<|vision_start|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151653": {
+      "content": "<|vision_end|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151654": {
+      "content": "<|vision_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151655": {
+      "content": "<|image_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151656": {
+      "content": "<|video_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": true
+    },
+    "151657": {
+      "content": "<tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151658": {
+      "content": "</tool_call>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151659": {
+      "content": "<|fim_prefix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151660": {
+      "content": "<|fim_middle|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151661": {
+      "content": "<|fim_suffix|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151662": {
+      "content": "<|fim_pad|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151663": {
+      "content": "<|repo_name|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    },
+    "151664": {
+      "content": "<|file_sep|>",
+      "lstrip": false,
+      "normalized": false,
+      "rstrip": false,
+      "single_word": false,
+      "special": false
+    }
+  },
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": {},
+  "model_max_length": 131072,
+  "pad_token": "<|im_end|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/4984/vocab.json ADDED Viewed

The diff for this file is too large to render. See raw diff

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/README.md ADDED Viewed

	@@ -0,0 +1,207 @@

+---
+base_model: Qwen/Qwen2.5-1.5B-Instruct
+library_name: peft
+pipeline_tag: text-generation
+tags:
+- base_model:adapter:Qwen/Qwen2.5-1.5B-Instruct
+- lora
+- transformers
+---
+# Model Card for Model ID
+<!-- Provide a quick summary of what the model is/does. -->
+## Model Details
+### Model Description
+<!-- Provide a longer summary of what this model is. -->
+- **Developed by:** [More Information Needed]
+- **Funded by [optional]:** [More Information Needed]
+- **Shared by [optional]:** [More Information Needed]
+- **Model type:** [More Information Needed]
+- **Language(s) (NLP):** [More Information Needed]
+- **License:** [More Information Needed]
+- **Finetuned from model [optional]:** [More Information Needed]
+### Model Sources [optional]
+<!-- Provide the basic links for the model. -->
+- **Repository:** [More Information Needed]
+- **Paper [optional]:** [More Information Needed]
+- **Demo [optional]:** [More Information Needed]
+## Uses
+<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
+### Direct Use
+<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->
+[More Information Needed]
+### Downstream Use [optional]
+<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->
+[More Information Needed]
+### Out-of-Scope Use
+<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->
+[More Information Needed]
+## Bias, Risks, and Limitations
+<!-- This section is meant to convey both technical and sociotechnical limitations. -->
+[More Information Needed]
+### Recommendations
+<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->
+Users (both direct and downstream) should be made aware of the risks, biases and limitations of the model. More information needed for further recommendations.
+## How to Get Started with the Model
+Use the code below to get started with the model.
+[More Information Needed]
+## Training Details
+### Training Data
+<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->
+[More Information Needed]
+### Training Procedure
+<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->
+#### Preprocessing [optional]
+[More Information Needed]
+#### Training Hyperparameters
+- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->
+#### Speeds, Sizes, Times [optional]
+<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->
+[More Information Needed]
+## Evaluation
+<!-- This section describes the evaluation protocols and provides the results. -->
+### Testing Data, Factors & Metrics
+#### Testing Data
+<!-- This should link to a Dataset Card if possible. -->
+[More Information Needed]
+#### Factors
+<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->
+[More Information Needed]
+#### Metrics
+<!-- These are the evaluation metrics being used, ideally with a description of why. -->
+[More Information Needed]
+### Results
+[More Information Needed]
+#### Summary
+## Model Examination [optional]
+<!-- Relevant interpretability work for the model goes here -->
+[More Information Needed]
+## Environmental Impact
+<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
+Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
+- **Hardware Type:** [More Information Needed]
+- **Hours used:** [More Information Needed]
+- **Cloud Provider:** [More Information Needed]
+- **Compute Region:** [More Information Needed]
+- **Carbon Emitted:** [More Information Needed]
+## Technical Specifications [optional]
+### Model Architecture and Objective
+[More Information Needed]
+### Compute Infrastructure
+[More Information Needed]
+#### Hardware
+[More Information Needed]
+#### Software
+[More Information Needed]
+## Citation [optional]
+<!-- If there is a paper or blog post introducing the model, the APA and BibTeX information for that should go in this section. -->
+**BibTeX:**
+[More Information Needed]
+**APA:**
+[More Information Needed]
+## Glossary [optional]
+<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->
+[More Information Needed]
+## More Information [optional]
+[More Information Needed]
+## Model Card Authors [optional]
+[More Information Needed]
+## Model Card Contact
+[More Information Needed]
+### Framework versions
+- PEFT 0.18.1

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": false,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 128,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 16,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "q_proj",
+    "down_proj",
+    "up_proj",
+    "v_proj",
+    "gate_proj",
+    "o_proj",
+    "k_proj"
+  ],
+  "target_parameters": null,
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

qwen2.5-1.5B-Instruct#csd/ab_pr_0.5_0.5_8_1e-4/7476/adapter_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cfd4d760f29b46310a656dda6324b650093e68aa899d807abd642782734615a9
+size 504133205