Jerry999 committed 2 days ago

Commit

e2d07a4

verified ·

1 Parent(s): b4ed115

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +4 -0
checkpoints/math_operations/compositional_full_sft_n_steps_2/chat_template.jinja +4 -0
checkpoints/math_operations/compositional_full_sft_n_steps_2/config.json +71 -0
checkpoints/math_operations/compositional_full_sft_n_steps_2/tokenizer.json +3 -0
checkpoints/math_operations/compositional_full_sft_n_steps_2/tokenizer_config.json +29 -0
checkpoints/math_operations/full_sft_50k_lr5e5/README.md +132 -0
checkpoints/math_operations/full_sft_50k_lr5e5/chat_template.jinja +4 -0
checkpoints/math_operations/full_sft_50k_lr5e5/config.json +71 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/balanced_test_alpaca_converted.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/balanced_test_alpaca_results.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/eval_results.csv +12 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/eval_summary.json +19 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_A_test_alpaca_converted.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_A_test_alpaca_results.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_B_test_alpaca_converted.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_B_test_alpaca_results.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_C_test_alpaca_converted.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_C_test_alpaca_results.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_D_test_alpaca_converted.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_D_test_alpaca_results.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_E_test_alpaca_converted.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_E_test_alpaca_results.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_F_test_alpaca_converted.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_F_test_alpaca_results.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_G_test_alpaca_converted.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_G_test_alpaca_results.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_H_test_alpaca_converted.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_H_test_alpaca_results.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/test_alpaca_converted.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/test_alpaca_results.jsonl +0 -0
checkpoints/math_operations/full_sft_50k_lr5e5/generation_config.json +12 -0
checkpoints/math_operations/full_sft_50k_lr5e5/model.safetensors +3 -0
checkpoints/math_operations/full_sft_50k_lr5e5/tokenizer.json +3 -0
checkpoints/math_operations/full_sft_50k_lr5e5/tokenizer_config.json +29 -0
checkpoints/math_operations/primitive_atomic_balanced_sft_50k/README.md +157 -0
checkpoints/math_operations/primitive_atomic_balanced_sft_50k/adapter_config.json +46 -0
checkpoints/math_operations/primitive_atomic_balanced_sft_50k/adapter_model.safetensors +3 -0
checkpoints/math_operations/primitive_atomic_balanced_sft_50k/chat_template.jinja +4 -0
checkpoints/math_operations/primitive_atomic_balanced_sft_50k/config.json +71 -0
checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/balanced_test_alpaca_converted.jsonl +0 -0
checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/balanced_test_alpaca_results.jsonl +0 -0
checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/eval_results.csv +11 -0
checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/eval_summary.json +19 -0
checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/op_A_test_alpaca_converted.jsonl +0 -0
checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/op_A_test_alpaca_results.jsonl +0 -0
checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/op_B_test_alpaca_converted.jsonl +0 -0
checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/op_B_test_alpaca_results.jsonl +0 -0
checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/op_C_test_alpaca_converted.jsonl +0 -0
checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/op_C_test_alpaca_results.jsonl +0 -0
checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/op_D_test_alpaca_converted.jsonl +0 -0

.gitattributes CHANGED Viewed

@@ -37,3 +37,7 @@ checkpoints/knowledge/atomic_full_sft_50ep/tokenizer.json filter=lfs diff=lfs me
 checkpoints/knowledge/atomic_full_then_2step_full_sft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/knowledge/atomic_sft_lora_50ep/merged/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/knowledge/atomic_sft_lora_50ep/tokenizer.json filter=lfs diff=lfs merge=lfs -text

 checkpoints/knowledge/atomic_full_then_2step_full_sft/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/knowledge/atomic_sft_lora_50ep/merged/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 checkpoints/knowledge/atomic_sft_lora_50ep/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/math_operations/compositional_full_sft_n_steps_2/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/math_operations/full_sft_50k_lr5e5/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/math_operations/primitive_atomic_balanced_sft_50k/merged/tokenizer.json filter=lfs diff=lfs merge=lfs -text
+checkpoints/math_operations/primitive_atomic_balanced_sft_50k/tokenizer.json filter=lfs diff=lfs merge=lfs -text

checkpoints/math_operations/compositional_full_sft_n_steps_2/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,4 @@

+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
+' + message['content'] + '<|im_end|>' + '
+'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+' }}{% endif %}

checkpoints/math_operations/compositional_full_sft_n_steps_2/config.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 262144,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 5000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.0.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

checkpoints/math_operations/compositional_full_sft_n_steps_2/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650

checkpoints/math_operations/compositional_full_sft_n_steps_2/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ],
+  "is_local": true,
+  "model_max_length": 1010000,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null
+}

checkpoints/math_operations/full_sft_50k_lr5e5/README.md ADDED Viewed

	@@ -0,0 +1,132 @@

+---
+library_name: transformers
+tags:
+- generated_from_trainer
+datasets:
+- /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/math_operations/primitive_atomic_balanced_sft_50k/balanced_train_alpaca.jsonl
+model-index:
+- name: home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/math_operations/full_sft_50k_lr5e5
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+<details><summary>See axolotl config</summary>
+axolotl version: `0.15.0.dev0`
+```yaml
+# Qwen3-4B full fine-tuning SFT — LR 5e-5
+base_model: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+datasets:
+  - path: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/math_operations/primitive_atomic_balanced_sft_50k/balanced_train_alpaca.jsonl
+    type: alpaca
+dataset_prepared_path:
+val_set_size: 0
+chat_template: chatml
+test_datasets:
+  - path: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/math_operations/primitive_atomic_balanced_sft_50k/balanced_val_alpaca.jsonl
+    type: alpaca
+output_dir: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/math_operations/full_sft_50k_lr5e5
+sequence_len: 2048
+sample_packing: true
+eval_sample_packing: true
+gradient_accumulation_steps: 8
+micro_batch_size: 1
+num_epochs: 3
+optimizer: adamw_torch_fused
+lr_scheduler: cosine
+learning_rate: 5e-5
+bf16: auto
+tf32: true
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+logging_steps: 10
+flash_attention: true
+warmup_ratio: 0.1
+evals_per_epoch: 2
+saves_per_epoch: 1
+save_total_limit: 1
+weight_decay: 0.01
+wandb_project: math_operations_sft
+wandb_name: qwen3-4b-full-sft-50k-lr5e5
+wandb_log_model: "false"
+special_tokens:
+```
+</details><br>
+# home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/math_operations/full_sft_50k_lr5e5
+This model is a fully fine-tuned version of /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507 on the /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/math_operations/primitive_atomic_balanced_sft_50k/balanced_train_alpaca.jsonl dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.0001
+- Ppl: 1.0001
+- Memory/max Active (gib): 33.95
+- Memory/max Allocated (gib): 33.95
+- Memory/device Reserved (gib): 35.97
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 5e-05
+- train_batch_size: 1
+- eval_batch_size: 1
+- seed: 42
+- gradient_accumulation_steps: 8
+- total_train_batch_size: 8
+- optimizer: Use adamw_torch_fused with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 312
+- training_steps: 3123
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss | Ppl    | Active (gib) | Allocated (gib) | Reserved (gib) |
+|:-------------:|:------:|:----:|:---------------:|:------:|:------------:|:---------------:|:--------------:|
+| No log        | 0      | 0    | 0.8898          | 2.4345 | 10.41        | 10.41           | 10.64          |
+| 0.0029        | 0.5002 | 521  | 0.0023          | 1.0023 | 33.97        | 33.97           | 36.5           |
+| 0.0003        | 1.0    | 1042 | 0.0005          | 1.0005 | 33.95        | 33.95           | 35.97          |
+| 0.0003        | 1.5002 | 1563 | 0.0003          | 1.0003 | 33.95        | 33.95           | 35.97          |
+| 0.0002        | 2.0    | 2084 | 0.0001          | 1.0001 | 33.95        | 33.95           | 35.97          |
+| 0.0001        | 2.5002 | 2605 | 0.0001          | 1.0001 | 33.95        | 33.95           | 35.97          |
+### Framework versions
+- Transformers 5.0.0
+- Pytorch 2.8.0+cu128
+- Datasets 4.5.0
+- Tokenizers 0.22.2

checkpoints/math_operations/full_sft_50k_lr5e5/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,4 @@

+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
+' + message['content'] + '<|im_end|>' + '
+'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+' }}{% endif %}

checkpoints/math_operations/full_sft_50k_lr5e5/config.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 262144,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": 151643,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 5000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.0.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/balanced_test_alpaca_converted.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/balanced_test_alpaca_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/eval_results.csv ADDED Viewed

	@@ -0,0 +1,12 @@

+category,filename,total,correct,accuracy,format_found,format_accuracy,errors_count
+math_operations,balanced_test_alpaca_results,200,198,99.00,200,100.00,2
+math_operations,balanced_test_alpaca_results,200,47,23.50,200,100.00,153
+math_operations,test_alpaca_results,200,0,0.00,196,98.00,200
+math_operations,op_A_test_alpaca_results,200,0,0.00,161,80.50,200
+math_operations,op_B_test_alpaca_results,200,2,1.00,190,95.00,198
+math_operations,op_C_test_alpaca_results,200,0,0.00,198,99.00,200
+math_operations,op_D_test_alpaca_results,200,2,1.00,174,87.00,198
+math_operations,op_E_test_alpaca_results,200,2,1.00,200,100.00,198
+math_operations,op_F_test_alpaca_results,200,1,0.50,198,99.00,199
+math_operations,op_G_test_alpaca_results,200,0,0.00,200,100.00,200
+math_operations,op_H_test_alpaca_results,200,0,0.00,200,100.00,200

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/eval_summary.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "overall": {
+    "total": 200,
+    "correct": 0,
+    "accuracy": 0.0,
+    "format_found": 200,
+    "format_accuracy": 100.0
+  },
+  "per_operation": {
+    "A": {
+      "total": 200,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 200
+    }
+  },
+  "n_errors": 200,
+  "results_file": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_H_test_alpaca_results.jsonl"
+}

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_A_test_alpaca_converted.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_A_test_alpaca_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_B_test_alpaca_converted.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_B_test_alpaca_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_C_test_alpaca_converted.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_C_test_alpaca_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_D_test_alpaca_converted.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_D_test_alpaca_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_E_test_alpaca_converted.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_E_test_alpaca_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_F_test_alpaca_converted.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_F_test_alpaca_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_G_test_alpaca_converted.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_G_test_alpaca_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_H_test_alpaca_converted.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/op_H_test_alpaca_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/test_alpaca_converted.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/eval_results/test_alpaca_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/full_sft_50k_lr5e5/generation_config.json ADDED Viewed

	@@ -0,0 +1,12 @@

+{
+  "do_sample": true,
+  "eos_token_id": [
+    151645,
+    151643
+  ],
+  "pad_token_id": 151643,
+  "temperature": 0.7,
+  "top_k": 20,
+  "top_p": 0.8,
+  "transformers_version": "5.0.0"
+}

checkpoints/math_operations/full_sft_50k_lr5e5/model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:09d049022c056ec8956018a219d3014e48134d7acc9a4e2303db707721354ac9
+size 8044982080

checkpoints/math_operations/full_sft_50k_lr5e5/tokenizer.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:be75606093db2094d7cd20f3c2f385c212750648bd6ea4fb2bf507a6a4c55506
+size 11422650

checkpoints/math_operations/full_sft_50k_lr5e5/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,29 @@

+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": null,
+  "clean_up_tokenization_spaces": false,
+  "eos_token": "<|im_end|>",
+  "errors": "replace",
+  "is_local": true,
+  "model_max_length": 1010000,
+  "pad_token": "<|endoftext|>",
+  "split_special_tokens": false,
+  "tokenizer_class": "Qwen2Tokenizer",
+  "unk_token": null,
+  "additional_special_tokens": [
+    "<|im_start|>",
+    "<|im_end|>",
+    "<|object_ref_start|>",
+    "<|object_ref_end|>",
+    "<|box_start|>",
+    "<|box_end|>",
+    "<|quad_start|>",
+    "<|quad_end|>",
+    "<|vision_start|>",
+    "<|vision_end|>",
+    "<|vision_pad|>",
+    "<|image_pad|>",
+    "<|video_pad|>"
+  ]
+}

checkpoints/math_operations/primitive_atomic_balanced_sft_50k/README.md ADDED Viewed

	@@ -0,0 +1,157 @@

+---
+library_name: peft
+tags:
+- axolotl
+- base_model:adapter:/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507
+- lora
+- transformers
+datasets:
+- /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/math_operations/primitive_atomic_balanced_sft_50k/balanced_train_alpaca.jsonl
+pipeline_tag: text-generation
+base_model: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507
+model-index:
+- name: home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/math_operations/primitive_atomic_balanced_sft_50k
+  results: []
+---
+<!-- This model card has been generated automatically according to the information the Trainer had access to. You
+should probably proofread and complete it, then remove this comment. -->
+[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)
+<details><summary>See axolotl config</summary>
+axolotl version: `0.15.0.dev0`
+```yaml
+# Qwen3-4B LoRA SFT on primitive_atomic_balanced_sft_50k (A-H merged, CoT outputs)
+base_model: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507
+# Model loading (full precision, no quantization)
+load_in_8bit: false
+load_in_4bit: false
+strict: false
+# Training dataset (50000 examples, 8 ops balanced @ 6250 each, CoT outputs)
+datasets:
+  - path: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/math_operations/primitive_atomic_balanced_sft_50k/balanced_train_alpaca.jsonl
+    type: alpaca
+dataset_prepared_path:
+val_set_size: 0
+chat_template: chatml
+# Validation dataset (200 examples)
+test_datasets:
+  - path: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/math_operations/primitive_atomic_balanced_sft_50k/balanced_val_alpaca.jsonl
+    type: alpaca
+output_dir: /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/math_operations/primitive_atomic_balanced_sft_50k
+# Sequence settings
+sequence_len: 2048
+sample_packing: true
+eval_sample_packing: true
+# LoRA configuration (full LoRA, no quantization)
+adapter: lora
+lora_r: 32
+lora_alpha: 64
+lora_dropout: 0.05
+lora_target_linear: true
+# Training hyperparameters
+gradient_accumulation_steps: 4
+micro_batch_size: 2
+num_epochs: 5
+optimizer: adamw_torch_fused
+lr_scheduler: cosine
+learning_rate: 0.0002
+# Precision
+bf16: auto
+tf32: true
+# Memory optimization
+gradient_checkpointing: true
+gradient_checkpointing_kwargs:
+  use_reentrant: false
+# Logging and saving
+logging_steps: 10
+flash_attention: true
+warmup_ratio: 0.1
+evals_per_epoch: 2
+saves_per_epoch: 1
+weight_decay: 0.01
+# Wandb logging
+wandb_project: math_operations_sft
+wandb_name: qwen3-4b-primitive-atomic-balanced-lora-sft-50k
+wandb_log_model: "false"
+special_tokens:
+```
+</details><br>
+# home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/math_operations/primitive_atomic_balanced_sft_50k
+This model is a LoRA adapter fine-tuned from /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507 on the /home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/data/math_operations/primitive_atomic_balanced_sft_50k/balanced_train_alpaca.jsonl dataset.
+It achieves the following results on the evaluation set:
+- Loss: 0.0000
+- Ppl: 1.0000
+- Memory/max Active (gib): 16.23
+- Memory/max Allocated (gib): 16.23
+- Memory/device Reserved (gib): 20.01
+## Model description
+More information needed
+## Intended uses & limitations
+More information needed
+## Training and evaluation data
+More information needed
+## Training procedure
+### Training hyperparameters
+The following hyperparameters were used during training:
+- learning_rate: 0.0002
+- train_batch_size: 2
+- eval_batch_size: 2
+- seed: 42
+- gradient_accumulation_steps: 4
+- total_train_batch_size: 8
+- optimizer: Use adamw_torch_fused with betas=(0.9,0.999) and epsilon=1e-08 and optimizer_args=No additional optimizer arguments
+- lr_scheduler_type: cosine
+- lr_scheduler_warmup_steps: 515
+- training_steps: 5155
+### Training results
+| Training Loss | Epoch  | Step | Validation Loss | Ppl    | Active (gib) | Allocated (gib) | Reserved (gib) |
+|:-------------:|:------:|:----:|:---------------:|:------:|:------------:|:---------------:|:--------------:|
+| No log        | 0      | 0    | 0.8898          | 2.4348 | 13.69        | 13.69           | 13.84          |
+| 0.0009        | 0.5004 | 516  | 0.0009          | 1.0009 | 16.23        | 16.23           | 18.85          |
+| 0.0003        | 1.0019 | 1032 | 0.0005          | 1.0005 | 16.73        | 16.73           | 20.01          |
+| 0.0010        | 1.5023 | 1548 | 0.0007          | 1.0007 | 16.23        | 16.23           | 20.01          |
+| 0.0004        | 2.0039 | 2064 | 0.0003          | 1.0003 | 16.73        | 16.73           | 20.01          |
+| 0.0002        | 2.5042 | 2580 | 0.0001          | 1.0001 | 14.2         | 14.2            | 20.01          |
+| 0.0003        | 3.0039 | 3096 | 0.0001          | 1.0001 | 16.73        | 16.73           | 20.01          |
+| 0.0001        | 3.5042 | 3612 | 0.0002          | 1.0002 | 16.23        | 16.23           | 20.01          |
+| 0.0000        | 4.0058 | 4128 | 0.0000          | 1.0000 | 16.73        | 16.73           | 20.01          |
+| 0.0000        | 4.5062 | 4644 | 0.0000          | 1.0000 | 16.23        | 16.23           | 20.01          |
+### Framework versions
+- PEFT 0.18.1
+- Transformers 5.0.0
+- Pytorch 2.8.0+cu128
+- Datasets 4.5.0
+- Tokenizers 0.22.2

checkpoints/math_operations/primitive_atomic_balanced_sft_50k/adapter_config.json ADDED Viewed

	@@ -0,0 +1,46 @@

+{
+  "alora_invocation_tokens": null,
+  "alpha_pattern": {},
+  "arrow_config": null,
+  "auto_mapping": null,
+  "base_model_name_or_path": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/models/Qwen/Qwen3-4B-Instruct-2507",
+  "bias": "none",
+  "corda_config": null,
+  "ensure_weight_tying": false,
+  "eva_config": null,
+  "exclude_modules": null,
+  "fan_in_fan_out": null,
+  "inference_mode": true,
+  "init_lora_weights": true,
+  "layer_replication": null,
+  "layers_pattern": null,
+  "layers_to_transform": null,
+  "loftq_config": {},
+  "lora_alpha": 64,
+  "lora_bias": false,
+  "lora_dropout": 0.05,
+  "megatron_config": null,
+  "megatron_core": "megatron.core",
+  "modules_to_save": null,
+  "peft_type": "LORA",
+  "peft_version": "0.18.1",
+  "qalora_group_size": 16,
+  "r": 32,
+  "rank_pattern": {},
+  "revision": null,
+  "target_modules": [
+    "down_proj",
+    "gate_proj",
+    "v_proj",
+    "o_proj",
+    "k_proj",
+    "q_proj",
+    "up_proj"
+  ],
+  "target_parameters": [],
+  "task_type": "CAUSAL_LM",
+  "trainable_token_indices": null,
+  "use_dora": false,
+  "use_qalora": false,
+  "use_rslora": false
+}

checkpoints/math_operations/primitive_atomic_balanced_sft_50k/adapter_model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:000255b51ab1bcb9bd0dba9e94dbee95aba886cc7127c3bb7beadc80bf8e22b4
+size 264308896

checkpoints/math_operations/primitive_atomic_balanced_sft_50k/chat_template.jinja ADDED Viewed

	@@ -0,0 +1,4 @@

+{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
+' + message['content'] + '<|im_end|>' + '
+'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
+' }}{% endif %}

checkpoints/math_operations/primitive_atomic_balanced_sft_50k/config.json ADDED Viewed

	@@ -0,0 +1,71 @@

+{
+  "architectures": [
+    "Qwen3ForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": null,
+  "dtype": "bfloat16",
+  "eos_token_id": 151645,
+  "head_dim": 128,
+  "hidden_act": "silu",
+  "hidden_size": 2560,
+  "initializer_range": 0.02,
+  "intermediate_size": 9728,
+  "layer_types": [
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention",
+    "full_attention"
+  ],
+  "max_position_embeddings": 262144,
+  "max_window_layers": 36,
+  "model_type": "qwen3",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 36,
+  "num_key_value_heads": 8,
+  "pad_token_id": null,
+  "rms_norm_eps": 1e-06,
+  "rope_parameters": {
+    "rope_theta": 5000000,
+    "rope_type": "default"
+  },
+  "sliding_window": null,
+  "tie_word_embeddings": true,
+  "transformers_version": "5.0.0",
+  "use_cache": false,
+  "use_sliding_window": false,
+  "vocab_size": 151936
+}

checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/balanced_test_alpaca_converted.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/balanced_test_alpaca_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/eval_results.csv ADDED Viewed

	@@ -0,0 +1,11 @@

+category,filename,total,correct,accuracy,format_found,format_accuracy,errors_count
+math_operations,balanced_test_alpaca_results,200,200,100.00,200,100.00,0
+math_operations,test_alpaca_results,200,1,0.50,199,99.50,199
+math_operations,op_A_test_alpaca_results,200,8,4.00,200,100.00,192
+math_operations,op_B_test_alpaca_results,200,1,0.50,200,100.00,199
+math_operations,op_C_test_alpaca_results,200,1,0.50,200,100.00,199
+math_operations,op_D_test_alpaca_results,200,0,0.00,200,100.00,200
+math_operations,op_E_test_alpaca_results,200,0,0.00,200,100.00,200
+math_operations,op_F_test_alpaca_results,200,6,3.00,200,100.00,194
+math_operations,op_G_test_alpaca_results,200,1,0.50,200,100.00,199
+math_operations,op_H_test_alpaca_results,200,0,0.00,198,99.00,200

checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/eval_summary.json ADDED Viewed

	@@ -0,0 +1,19 @@

+{
+  "overall": {
+    "total": 200,
+    "correct": 0,
+    "accuracy": 0.0,
+    "format_found": 198,
+    "format_accuracy": 99.0
+  },
+  "per_operation": {
+    "A": {
+      "total": 200,
+      "correct": 0,
+      "accuracy": 0.0,
+      "format_found": 198
+    }
+  },
+  "n_errors": 200,
+  "results_file": "/home/jiaruil5/math_rl/mix_teachers/r3lit_rl/mix_teachers/checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/op_H_test_alpaca_results.jsonl"
+}

checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/op_A_test_alpaca_converted.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/op_A_test_alpaca_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/op_B_test_alpaca_converted.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/op_B_test_alpaca_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/op_C_test_alpaca_converted.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/op_C_test_alpaca_results.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff

checkpoints/math_operations/primitive_atomic_balanced_sft_50k/eval_results/op_D_test_alpaca_converted.jsonl ADDED Viewed

The diff for this file is too large to render. See raw diff